Update documentation.

This commit is contained in:
Michael Martin 2012-06-09 01:06:25 -07:00
parent 07f807d680
commit ffd96a8c2f
23 changed files with 3264 additions and 276 deletions


@ -302,10 +302,10 @@
</para>
<programlisting>
.macro store16 ; `store16 dest, src
lda #&lt;_2
sta _1
lda #&gt;_2
sta _1+1
.macend
</programlisting>
<para>
@ -361,91 +361,202 @@
follow.
</para>
<itemizedlist>
<listitem>
<para>
<literal>.outfile</literal> <emphasis>filename</emphasis>:
Sets the filename for the output binary if one has not
already been set. If no name is ever set, the output will
be written to <literal>ophis.bin</literal>.
</para>
</listitem>
<listitem>
<para>
<literal>.advance</literal> <emphasis>address</emphasis>:
Forces the program counter to
be <emphasis>address</emphasis>. Unlike
the <literal>.org</literal>
directive, <literal>.advance</literal> outputs zeroes
until the program counter reaches a specified
address. Attempting to <literal>.advance</literal> to a
point behind the current program counter is an
assemble-time error.
</para>
</listitem>
<listitem>
<para>
<literal>.alias</literal> <emphasis>label</emphasis> <emphasis>value</emphasis>:
The .alias directive assigns an arbitrary value to a
label. This value may be an arbitrary argument, but
cannot reference any label that has not already been
defined (this prevents recursive label
dependencies).
</para>
</listitem>
<listitem>
<para>
<literal>.byte</literal> <emphasis>arg</emphasis> [
, <emphasis>arg</emphasis>, ... ]: Specifies a series of
arguments, which are evaluated, and strings, which are
included as raw ASCII data. The final results of these
arguments must be one byte in size. Separate constants
are separated by commas. (A short example using this and
other directives follows this list.)
</para>
</listitem>
<listitem>
<para>
<literal>.checkpc</literal> <emphasis>address</emphasis>:
Ensures that the program counter is less than or equal to
the address specified, and emits an assemble-time error
if it is not. <emphasis>This produces no code in the
final binary - it is there to ensure that linking a large
amount of data together does not overstep memory
boundaries.</emphasis>
</para>
</listitem>
<listitem>
<para>
<literal>.data</literal> <emphasis>[label]</emphasis>:
Sets the segment to the segment name specified and
disallows output. If no label is given, switches to the
default data segment.
</para>
</listitem>
<listitem>
<para>
<literal>.incbin</literal> <emphasis>filename</emphasis>:
Inserts the contents of the file specified as binary
data. Use it to include graphics information, precompiled
code, or other non-assembler data.
</para>
</listitem>
<listitem>
<para>
<literal>.include</literal> <emphasis>filename</emphasis>:
Includes the entirety of the file specified at that point
in the program. Use this to order your final sources, if
you aren't doing it via the command line.
</para>
</listitem>
<listitem>
<para>
<literal>.org</literal> <emphasis>address</emphasis>:
Sets the program counter to the address
specified. <emphasis>This does not emit any code in and
of itself, nor does it overwrite anything that previously
existed.</emphasis> If you wish to jump ahead in memory,
use <literal>.advance</literal>.
</para>
</listitem>
<listitem>
<para>
<literal>.require</literal> <emphasis>filename</emphasis>:
Includes the entirety of the file specified at that point
in the program. Unlike <literal>.include</literal>,
however, code included with <literal>.require</literal>
will only be inserted once.
The <literal>.require</literal> directive is useful for
ensuring that certain code libraries are somewhere in the
final binary. They are also very useful for guaranteeing
that macro libraries are available.
</para>
</listitem>
<listitem>
<para>
<literal>.space</literal> <emphasis>label</emphasis> <emphasis>size</emphasis>:
This directive is used to organize global variables. It
defines the label specified to be at the current location
of the program counter, and then advances the program
counter <emphasis>size</emphasis> steps ahead. No actual
code is produced. This is equivalent to <literal>label:
.org ^+size</literal>.
</para>
</listitem>
<listitem>
<para>
<literal>.text</literal> <emphasis>[label]</emphasis>:
Sets the segment to the segment name specified and allows
output. If no label is given, switches to the default
text segment.
</para>
</listitem>
<listitem>
<para>
<literal>.word</literal> <emphasis>arg</emphasis> [
, <emphasis>arg</emphasis>, ... ]:
Like <literal>.byte</literal>, but values are all treated
as two-byte values and stored low-end first (as is the
6502's wont). Use this to create jump tables (an
unadorned label will evaluate to that label's location)
or otherwise store 16-bit data.
</para>
</listitem>
<listitem>
<para>
<literal>.dword</literal> <emphasis>arg</emphasis> [
, <emphasis>arg</emphasis>, ...]:
Like <literal>.word</literal>, but for 32-bit
values.
</para>
</listitem>
<listitem>
<para>
<literal>.wordbe</literal> <emphasis>arg</emphasis> [
, <emphasis>arg</emphasis>, ...]:
Like <literal>.word</literal>, but stores the value in a
big-endian format (high byte first).
</para>
</listitem>
<listitem>
<para>
<literal>.dwordbe</literal> <emphasis>arg</emphasis> [
, <emphasis>arg</emphasis>, ...]:
Like <literal>.dword</literal>, but stores the value high
byte first.
</para>
</listitem>
<listitem>
<para>
<literal>.scope</literal>: Starts a new scope
block. Labels that begin with an underscore are only
reachable from within their innermost
enclosing <literal>.scope</literal>
statement.
</para>
</listitem>
<listitem>
<para>
<literal>.scend</literal>: Ends a scope block. Makes the
temporary labels defined since the
last <literal>.scope</literal> statement unreachable, and
permits them to be redefined in a new
scope.
</para>
</listitem>
<listitem>
<para>
<literal>.macro</literal> <emphasis>name</emphasis>:
Begins a macro definition block. This is a scope block
that can be inlined at arbitrary points
with <literal>.invoke</literal>. Arguments to the macro
will be bound to temporary labels with names like
<literal>_1</literal>, <literal>_2</literal>, etc.
</para>
</listitem>
<listitem>
<para>
<literal>.macend</literal>: Ends a macro definition block.
</para>
</listitem>
<listitem>
<para>
<literal>.invoke</literal> <emphasis>label</emphasis> [<emphasis>argument</emphasis> [,
<emphasis>argument</emphasis> ...]]: Invokes (inlines) the
specified macro, binding the values of the arguments to the
ones the macro definition intends to read. A shorthand
for <literal>.invoke</literal> is the name of the macro to
invoke, backquoted.
</para>
</listitem>
</itemizedlist>
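<para>
As a quick sketch of how several of these directives fit
together, consider the following fragment. The labels,
addresses, and sizes here are invented for the example and do
not refer to any particular target machine.
</para>
<programlisting>
.outfile "example.bin"  ; write the result here instead of ophis.bin

.alias max'lives 5      ; a named constant, usable as an immediate

.data                   ; variables: reserved, but no bytes emitted
.org $c000
.space lives 1
.space score 2

.text                   ; code and constant data
.org $0800
start:  lda #max'lives
        sta lives
        rts

message: .byte "HELLO", 0       ; raw data between routines
vectors: .word start, start     ; 16-bit entries, low byte first

.checkpc $1000          ; fail the assembly if we grew past $0FFF
</programlisting>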
</section>
</appendix>

doc/hll1.sgm Normal file

@ -0,0 +1,185 @@
<chapter id="hll-1">
<title>The Second Step</title>
<para>
This essay discusses how to do 16-or-more bit addition and
subtraction on the 6502, and how to do unsigned comparisons
properly, thus making 16-bit arithmetic less necessary.
</para>
<section>
<title>The problem</title>
<para>
The <literal>ADC</literal>, <literal>SBC</literal>, <literal>INX</literal>,
and <literal>INY</literal> instructions are the only real
arithmetic instructions the 6502 chip has. In and of themselves,
they aren't too useful for general applications: the accumulator
can only hold 8 bits, and thus can't store any value over 255.
Matters get even worse when we're branching based on
values; <literal>BMI</literal> and <literal>BPL</literal> hinge on
the seventh (sign) bit of the result, so we can't represent any
value above 127.
</para>
</section>
<section>
<title>The solution</title>
<para>
We have two solutions available to us. First, we can use
the <quote>unsigned</quote> discipline, which involves checking
different flags, but lets us deal with values between 0 and 255
instead of -128 to 127. Second, we can trade speed and register
persistence for multiple precision arithmetic, using 16-bit
integers (-32768 to 32767, or 0-65535), 24-bit, or more.
</para>
<para>
Multiplication, division, and floating point arithmetic are beyond
the scope of this essay. The best way to deal with those is to
find a math library on the web (I
recommend <ulink url="http://www.6502.org/"></ulink>) and use the
routines there.
</para>
</section>
<section>
<title>Unsigned arithmetic</title>
<para>
When writing control code that hinges on numbers, we should always
strive to have our comparison be with zero; that way, no explicit
compare is necessary, and we can branch simply
with <literal>BEQ/BNE</literal>, which test the zero flag.
Otherwise, we use <literal>CMP</literal>.
The <literal>CMP</literal> command subtracts its argument from the
accumulator (without borrow), updates the flags, but throws away
the result. If the value is equal, the result is zero.
(<literal>CMP</literal> followed by <literal>BEQ</literal>
branches if the argument is equal to the accumulator; this is
probably why it's called <literal>BEQ</literal> and not something
like <literal>BZS</literal>.)
</para>
<para>
Intuitively, then, to check if the accumulator is <emphasis>less
than</emphasis> some value, we <literal>CMP</literal> against that
value and <literal>BMI</literal>. The <literal>BMI</literal>
command branches based on the Negative Flag, which is equal to the
seventh bit of <literal>CMP</literal>'s subtract. That's exactly
what we need, for signed arithmetic. However, this produces
problems if you're writing a boundary detector on your screen or
something and find that 192 &lt; 4. 192 is outside of a signed
byte's range, and is interpreted as if it were -64. This will not
do for most graphics applications, where your values will be
ranging from 0-319 or 0-199 or 0-255.
</para>
<para>
Instead, we take advantage of the implied subtraction
that <literal>CMP</literal> does. When subtracting, the result's
carry bit starts at 1, and gets borrowed from if necessary. Let
us consider some four-bit subtractions.
</para>
<programlisting>
 C|3210            C|3210
 ------            ------
 1|1001    9       1|1001    9
  |0100  - 4        |1100  -12
 ------  ---       ------  ---
 1|0101    5       0|1101   -3
</programlisting>
<para>
The <literal>CMP</literal> command properly modifies the carry bit
to reflect this. When computing A-B, the carry bit is set if A
&gt;= B, and it's clear if A &lt; B. Consider the following two
code sequences.
</para>
<programlisting>
   (1)              (2)
   CMP #$C0         CMP #$C0
   BMI label        BCC label
</programlisting>
<para>
The code in the first column treats the value in the accumulator
as a signed value, and branches if the value is less than -64.
(Because of overflow issues, it will actually branch for
accumulator values between $40 and $BF, even though it
<emphasis>should</emphasis> only be doing it for values between
$80 and $BF. To see why,
compare $40 to $C0 and look at the result.) The second column
code treats the accumulator as holding an unsigned value, and
branches if the value is less than 192. It will branch for
accumulator values $00-$BF.
</para>
</section>
<section>
<title>16-bit addition and subtraction</title>
<para>
Time to use the carry bit for what it was meant to do. Adding two
8-bit numbers can produce a 9-bit result. That 9th bit is stored
in the carry flag. The <literal>ADC</literal> command adds the
carry value to its result, as well. Thus, carries work just as
we'd expect them to. Suppose we're storing two 16-bit values, low
byte first, in $C100-1 and $C102-3. Adding them together and
storing the result in $C104-5 is easy:
</para>
<programlisting>
CLC
LDA $C100
ADC $C102
STA $C104
LDA $C101
ADC $C103
STA $C105
</programlisting>
<para>
Subtraction is identical, but you set the carry bit first
with <literal>SEC</literal> (because borrow is the complement of
carry&mdash;think about how the unsigned compare works if this
puzzles you) and, of course, using the <literal>SBC</literal>
instruction instead of <literal>ADC</literal>.
</para>
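<para>
For instance, here is a minimal sketch of the matching
subtraction, computing the value in $C100-1 minus the value in
$C102-3 and storing the result in $C104-5 (the addresses simply
mirror the addition example above):
</para>
<programlisting>
SEC             ; borrow is the complement of carry, so set it first
LDA $C100       ; low byte of the first value
SBC $C102       ; minus the low byte of the second
STA $C104
LDA $C101       ; high bytes, minus the borrow if one occurred
SBC $C103
STA $C105
</programlisting>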
<para>
The carry/borrow bit is set appropriately to let you continue,
too. As long as you just keep working your way up to bytes of
ever-higher significance, this generalizes to 24 (do it three
times instead of two) or 32 (four, etc.) bit integers.
</para>
</section>
<section>
<title>16-bit comparisons</title>
<para>
Doing comparisons on extended precision values is about the same
as doing them on 8-bit values, but you have to have the value you
test in memory, since it won't fit in the accumulator all at once.
You don't have to store the values back anywhere, either, since
all you care about is the final state of the flags. For example,
here's a signed comparison, branching to <literal>label</literal>
if the value in $C100-1 is less than 1000 ($03E8):
</para>
<programlisting>
SEC
LDA $C100
SBC #$E8
LDA $C101 ; We only need the carry bit from that subtract
SBC #$03
BMI label
</programlisting>
<para>
All the commentary on signed and unsigned compares holds for
16-bit (or higher) integers just as it does for the 8-bit
ones.
</para>
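<para>
For instance, a sketch of the unsigned version of the same test
looks like this; it branches to <literal>label</literal> if the
value in $C100-1, treated as unsigned, is less than 1000:
</para>
<programlisting>
SEC
LDA $C100
SBC #$E8        ; low byte of 1000 ($03E8)
LDA $C101       ; again, only the carry from that subtract matters
SBC #$03        ; high byte of 1000
BCC label       ; carry clear: the value was less than 1000
</programlisting>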
</section>
</chapter>

doc/hll2.sgm Normal file

@ -0,0 +1,880 @@
<chapter id="hll2">
<title>Structured Programming</title>
<para>
This essay discusses the machine language equivalents of the
basic <quote>structured programming</quote> concepts that are part
of the <quote>imperative</quote> family of programming languages:
if/then/else, for/next, while loops, and procedures. It also
discusses basic use of variables, as well as arrays, multi-byte data
types (records), and sub-byte data types (bitfields). It closes by
hand-compiling pseudo-code for an insertion sort on linked lists
into assembler. A complete Commodore 64 application is included as
a sample with this essay.
</para>
<section>
<title>Control constructs</title>
<section>
<title>Branches: <literal>if x then y else z</literal></title>
<para>
This is almost the most basic control construct.
The <emphasis>most</emphasis> basic is <literal>if x then
y</literal>, which is a simple branch instruction
(bcc/bcs/beq/bmi/bne/bpl/bvc/bvs) past the <quote>then</quote>
clause if the conditional is false:
</para>
<programlisting>
iny
bne no'overflow
inx
no'overflow:
;; rest of code
</programlisting>
<para>
This increments the value of the y register, and if it just
wrapped back around to zero, it increments the x register too.
It is basically equivalent to the C statement <literal>if
((++y)==0) ++x;</literal>. We need a few more labels to handle
else clauses as well.
</para>
<programlisting>
;; Computation of the conditional expression.
;; We assume for the sake of the example that
;; we want to execute the THEN clause if the
;; zero bit is set, otherwise the ELSE
;; clause. This will happen after a CMP,
;; which is the most common kind of 'if'
;; statement anyway.
BNE else'clause
;; THEN clause code goes here.
JMP end'of'if'stmt
else'clause:
;; ELSE clause code goes here.
end'of'if'stmt:
;; ... rest of code.
</programlisting>
</section>
<section>
<title>Free loops: <literal>while x do y</literal></title>
<para>
A <emphasis>free loop</emphasis> is one that might execute any
number of times. These are basically just a combination
of <literal>if</literal> and <literal>goto</literal>. For
a <quote>while x do y</quote> loop, that executes zero or more
times, you'd have code like this...
</para>
<programlisting>
loop'begin:
;; ... computation of condition, setting zero
;; bit if loop is finished...
beq loop'done
;; ... loop body goes here
jmp loop'begin
loop'done:
;; ... rest of program.
</programlisting>
<para>
If you want to ensure that the loop body executes at least once
(do y while x), just move the test to the end.
</para>
<programlisting>
loop'begin:
;; ... loop body goes here
;; ... computation of condition, setting zero
;; bit if loop is finished...
bne loop'begin
;; ... rest of program.
</programlisting>
<para>
The choice of zero bit is kind of arbitrary here. If the
condition involves the carry bit, or overflow, or negative, then
replace the beq with bcs/bvs/bmi appropriately.
</para>
</section>
<section>
<title>Bounded loops: <literal>for i = x to y do z</literal></title>
<para>
A special case of loops is one where you know exactly how many
times you're going through it&mdash;this is called
a <emphasis>bounded</emphasis> loop. Suppose you're copying 16
bytes from $C000 to $D000. The C code for that would look
something like this:
</para>
<programlisting>
char *a = (char *)0xC000;
char *b = (char *)0xD000;
int i;
for (i = 0; i &lt; 16; i++) { b[i] = a[i]; }
</programlisting>
<para>
C doesn't directly support bounded loops;
its <literal>for</literal> statement is just <quote>syntactic
sugar</quote> for a while statement. However, we can take
advantage of special purpose machine instructions to get very
straightforward code:
</para>
<programlisting>
ldx #$00
loop:
lda $c000, x
sta $d000, x
inx
cpx #$10
bmi loop
</programlisting>
<para>
However, remember that every arithmetic operation,
including <literal>inx</literal> and <literal>dex</literal>,
sets the various flags, including the Zero bit. That means that
if we can make our computation <emphasis>end</emphasis> when the
counter hits zero, we can shave off some bytes:
</para>
<programlisting>
ldx #$10
loop:
lda $bfff, x
sta $cfff, x
dex
bne loop
</programlisting>
<para>
Notice that we had to change the addresses we're indexing from,
because x takes a slightly different range of values. The space
savings is small here, and it's become slightly more unclear.
(It also hasn't actually saved any time, because the lda and sta
instructions are crossing a page boundary where they weren't
before&mdash;but if the start or end arrays began at $b020 or
something this wouldn't be an issue.) This tends to work better
when the precise value of the counter isn't used in the
computation&mdash;so let us consider the NES, which uses memory
location $2007 as a port to its video memory. Suppose we wish
to jam 4,096 copies of the hex value $20 into the video memory.
We can write this <emphasis>very</emphasis> cleanly, using the X
and Y registers as indices in a nested loop.
</para>
<programlisting>
ldx #$10
ldy #$00
lda #$20
loop:
sta $2007
iny
bne loop
dex
bne loop
</programlisting>
<para>
Work through this code. Convince yourself that
the <literal>sta</literal> is executed exactly 16*256 = 4096
times.
</para>
<para>
This is an example of a <emphasis>nested</emphasis> loop: a loop
inside a loop. Since our internal loop didn't need the X or Y
registers, we got to use both of them, which is nice, because
they have special incrementing and decrementing instructions.
The accumulator lacks these instructions, so it is a poor choice
to use for index variables. If you have a bounded loop and
don't have access to registers, use memory locations
instead:
</para>
<programlisting>
lda #$10
sta counter ; loop 16 times
loop:
;; Do stuff that trashes all the registers
dec counter
bne loop
</programlisting>
<para>
That's it! These are the basic control constructs for using
inside of procedures. Before talking about how to organize
procedures, I'll briefly cover the way the 6502 handles its
stack, because stacks and procedures are very tightly
intertwined.
</para>
</section>
</section>
<section>
<title>The stack</title>
<para>
The 6502 has an onboard stack in page 1. You can modify the stack
pointer by storing values in X register and
using <literal>txs</literal>; an <quote>empty</quote> stack is
value $FF. Going into a procedure pushes the address of the next
instruction onto the stack, and RTS pops that value off and jumps
there. (Well, not precisely. JSR actually pushes a value that's
one instruction short, and RTS loads the value, increases it by
one, and THEN jumps there. But that's only an issue if you're
using RTS to implement jump tables.) On an interrupt, the next
instruction's address is pushed on the stack, then the process
flags, and it jumps to the handler. The return from interrupt
restores the flags and the PC, just as if nothing had
happened.
</para>
<para>
The stack only has 256 possible entries; since addresses take two
bytes to store, that means that if you call something that calls
something that calls something that (etc., etc., 129 times), your
computation will fail. This can happen faster if you save
registers or memory values on the stack (see below).
</para>
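<para>
A minimal sketch of direct stack use follows. The reset of the
stack pointer is only appropriate at program startup, and
<literal>do'stuff</literal> is a hypothetical routine that may
trash the accumulator:
</para>
<programlisting>
        ldx #$ff        ; an "empty" stack: the pointer starts at $FF
        txs
        lda #$2a        ; some value worth keeping (made up here)
        pha             ; push it; the stack pointer drops by one
        jsr do'stuff    ; may freely clobber the accumulator
        pla             ; the saved value comes back off the stack
</programlisting>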
</section>
<section>
<title>Procedures and register saving</title>
<para>
All programming languages are designed around the concept of
procedures.<footnote><para>Yes, all of them. Functional languages
just let you do more things with them, logic programming has
implicit calls to query procedures, and
object-oriented <quote>methods</quote> are just normal procedures
that take one extra argument in secret.</para></footnote>
Procedures let you break a computation up into different parts,
then use them independently. However, compilers do a lot of work
for you behind the scenes to let you think this. Consider the
following assembler code. How many times does the loop
execute?
</para>
<programlisting>
        ldx #$10
loop:   jsr do'stuff
        dex
        bne loop
</programlisting>
<para>
The correct answer is <quote>I don't know, but
it <emphasis>should</emphasis> be 16.</quote> The reason we don't
know is because we're assuming here that
the <literal>do'stuff</literal> routine doesn't change the value
of the X register. If it does, than all sorts of chaos could
result. For major routines that aren't called often but are
called in places where the register state is important, you should
store the old registers on the stack with code like this:
</para>
<programlisting>
do'stuff:
pha
txa
pha
tya
pha
;; Rest of do'stuff goes here
pla
tay
pla
tax
pla
rts
</programlisting>
<para>
(Remember, the last item pushed onto the stack is the first one
pulled off, so you have to restore them in reverse order.) That's
three more bytes on the stack, so you don't want to do this if you
don't absolutely have to. If <literal>do'stuff</literal>
actually <emphasis>doesn't</emphasis> touch X, there's no need to
save and restore the value. This technique is
called <emphasis>callee-save</emphasis>.
</para>
<para>
The reverse technique is called <emphasis>caller-save</emphasis>
and pushes important registers onto the stack before the routine
is called, then restores them afterwards. Each technique has its
advantages and disadvantages. The best way to handle it in your
own code is to mark at the top of each routine which registers
need to be saved by the caller. (It's also useful to note things
like how it takes arguments and how it returns values.)
</para>
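<para>
For comparison, a caller-save version of the earlier loop might
look like the sketch below; here the loop, not
<literal>do'stuff</literal>, preserves the one register it cares
about:
</para>
<programlisting>
        ldx #$10
loop:   txa             ; save the loop counter...
        pha
        jsr do'stuff    ; ...so do'stuff may trash A, X, and Y
        pla
        tax             ; restore the counter and continue
        dex
        bne loop
</programlisting>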
</section>
<section>
<title>Variables</title>
<para>
Variables come in several flavors.
</para>
<section>
<title>Global variables</title>
<para>
Global variables are variables that can be reached from any
point in the program. Since the 6502 has no memory protection,
these are easy to declare. Take some random chunk of unused
memory and declare it to be the global variables area. All
reasonable assemblers have commands that let you give a symbolic
name to a memory location&mdash;you can use this to give your
globals names.
</para>
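<para>
In Ophis, for instance, either of the following forms will do.
The names and addresses are placeholders for the example:
</para>
<programlisting>
.alias lives $c000      ; by hand: "lives" now names location $c000
.alias score $c001      ; and "score" the two bytes at $c001-2

; or, letting a data segment hand out the addresses:
.data
.org $c000
.space lives 1
.space score 2
</programlisting>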
</section>
<section>
<title>Local variables</title>
<para>
All modern languages have some concept of <quote>local
variables</quote>, which are data values unique to that
invocation of that procedure. In modern architectures, this data
is stored into and read directly off of the stack. The 6502
doesn't really let you do this cleanly; I'll discuss ways of
handling it in a later essay. If you're implementing a system
from scratch, you can design your memory model to not require
such extreme measures. There are three basic techniques.
</para>
<section>
<title>Treat local variables like registers</title>
<para>
This means that any memory location you use, you save on the
stack and restore afterwards. This
can <emphasis>really</emphasis> eat up stack space; it's also
slow, often pointless, and has a tendency to overflow the
stack. I can't recommend it. But it does let
you do recursion right, if you don't need to save much memory
and you aren't recursing very deep.
</para>
</section>
<section>
<title>Procedure-based memory allocation</title>
<para>
With this technique, you give each procedure its own little
chunk of memory for use with its data. All the variables are
still, technically, globals; a
routine <emphasis>could</emphasis> interfere with another's,
but the discipline of <quote>only mess with real globals, and
your own locals</quote> is very, very easy to maintain.
</para>
<para>
This has many advantages. It's <emphasis>very</emphasis>
fast, both to write and to run, because loading a variable is
an Absolute or Zero Page instruction. Also, any procedure may
call any other procedure, as long as it doesn't wind up
calling itself at some point.
</para>
<para>
It has two major disadvantages. First, if many routines need
a lot of space, it can consume more memory than it should.
Also, this technique can require significant assembler
support&mdash;you must ensure that no procedure's local
variables are defined in the same place as any other
procedure, and it essentially requires a full symbolic linker
to do right. Ophis includes commands for <emphasis>memory
segmentation simulation</emphasis> that automate most of this
task, and make writing general libraries feasible.
</para>
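<para>
A sketch of what this looks like with Ophis's segment directives;
the routine names and local sizes are invented, and the data
segment's base address is assumed to be set elsewhere:
</para>
<programlisting>
.data
.space print'msg'ptr 2          ; locals owned by print'msg
.text
print'msg:
        ; ... body is free to use print'msg'ptr ...
        rts

.data
.space copy'block'count 1       ; locals owned by copy'block
.text
copy'block:
        ; ... body is free to use copy'block'count ...
        rts
</programlisting>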
</section>
<section>
<title>Partition-based memory allocation</title>
<para>
It's not <emphasis>really</emphasis> necessary that no
procedure overwrite memory used by any other procedure. It's
only required that procedures don't write on the memory that
their <emphasis>callers</emphasis> use. Suppose that your
program is organized into a bunch of procedures, and each fall
into one of three sets:
</para>
<itemizedlist>
<listitem><para>Procedures in set A don't call anyone.</para></listitem>
<listitem><para>Procedures in set B only call procedures in set A.</para></listitem>
<listitem><para>Procedures in set C only call procedures in sets A or B.</para></listitem>
</itemizedlist>
<para>
Now, each <emphasis>set</emphasis> can be given its own chunk
of memory, and we can be absolutely sure that no procedures
overwrite each other. Even if every procedure in set C uses
the <emphasis>same</emphasis> memory location, they'll never
step on each other, because there's no way to get to any other
routine in set C <emphasis>from</emphasis> any routine in set
C.
</para>
<para>
This has the same time efficiencies as procedure-based memory
allocation, and, given a thoughtful design aimed at using this
technique, also can use significantly less memory at run time.
It also requires much less assembler support, as addresses
for variables may be assigned by hand without having to worry
about those addresses already being used. However, it does
impose a very tight discipline on the design of the overall
system, so you'll have to do a lot more work before you start
actually writing code.
</para>
</section>
</section>
<section>
<title>Constants</title>
<para>
Constants are <quote>variables</quote> that don't change. If
you know that the value you're using is not going to change, you
should fold it into the code, either as an Immediate operand
wherever it's used, or (if it's more complicated than that)
as <literal>.byte</literal> commands in between the procedures.
This is especially important for ROM-based systems such as the
NES; the NES has very little RAM available, so constants should
be kept in the more plentiful ROM wherever possible.
</para>
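<para>
Two common shapes this takes, with invented names:
</para>
<programlisting>
.alias sprite'count 8            ; folded into the code as an immediate
        lda #sprite'count

greeting: .byte "PRESS START", 0 ; larger constant data between procedures
</programlisting>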
</section>
</section>
<section>
<title>Data structures</title>
<para>
So far, we've been treating data as a bunch of one-byte values.
There really isn't a lot you can do just with bytes. This section
talks about how to deal with larger and smaller elements.
</para>
<section>
<title>Arrays</title>
<para>
An <emphasis>array</emphasis> is a bunch of data elements in a
row. An array of bytes is very easy to handle with the 6502
chip, because the various indexed addressing modes handle it for
you. Just load the index into the X or Y register and do an
absolute indexed load. In general, these are going to be
zero-indexed (that is, a 32-byte array is indexed from 0 to 31.)
This code would initialize a byte array with 32 entries to
0:
</para>
<programlisting>
lda #$00
tax
loop:
sta array,x
inx
cpx #$20
bne loop
</programlisting>
<para>
(If you count down to save instructions, remember to adjust the
base address so that it's still writing the same memory
location.)
</para>
<para>
This approach to arrays has some limits. Primary among them is
that we can't have arrays of size larger than 256; we can't fit
our index into the index register. In order to address larger
arrays, we need to use the indirect indexed addressing mode. We
use 16-bit addition to add the offset to the base pointer, then
set the Y register to 0 and then load the value
with <literal>lda (ptr),y</literal>.
</para>
<para>
Well, actually, we can do better than that. Suppose we want to
clear out 8K of RAM, from $2000 to $4000. We can use the Y
register to hold the low byte of our offset, and only update the
high byte when necessary. That produces the following
loop:
</para>
<programlisting>
lda #$00 ; Set pointer value to base ($2000)
sta ptr
lda #$20
sta ptr+1
lda #$00 ; Storing a zero
ldx #$20 ; 8,192 ($2000) iterations: high byte
ldy #$00 ; low byte.
loop:
sta (ptr),y
iny
bne loop ; If we haven't wrapped around, go back
inc ptr+1 ; Otherwise update high byte
dex ; bump counter
bne loop ; and continue if we aren't done
</programlisting>
<para>
This code could be optimized further; the loop prelude in
particular loads a lot of redundant values that could be
compressed down further:
</para>
<programlisting>
lda #$00
tay
ldx #$20
sta ptr
stx ptr+1
</programlisting>
<para>
That's not directly relevant to arrays, but these sorts of
things are good things to keep in mind when writing your code.
Done well, they can make it much smaller and faster; done
carelessly, they can force a lot of bizarre dependencies on your
code and make it impossible to modify later.
</para>
</section>
<section>
<title>Records</title>
<para>
A <emphasis>record</emphasis> is a collection of values all
referred to as one variable. This has no immediate
representation in assembler. If you have a global variable
that's two bytes and a code pointer, this is exactly equivalent
to three separate variables. You can just put one label in
front of it, and refer to the first byte
as <literal>label</literal>, the second
as <literal>label+1</literal>, and the code pointer
as <literal>label+2</literal>.
</para>
<para>
This really applies to all data structures that take up more
than one byte. When dealing with the pointer, a 16-bit value,
we refer to the low byte as <literal>ptr</literal>
(or <literal>label+2</literal>, in the example above), and the
high byte as <literal>ptr+1</literal>
(or <literal>label+3</literal>).
</para>
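<para>
As a sketch, a record made of two single bytes followed by a
16-bit code pointer could be reserved and picked apart like this
(the names are invented, and <literal>ptr</literal> is assumed to
be a two-byte variable declared elsewhere):
</para>
<programlisting>
.data
.space player 4         ; x, y, then a two-byte handler pointer
.text
        lda player      ; first byte (x)
        ldx player+1    ; second byte (y)
        lda player+2    ; low byte of the handler pointer
        sta ptr
        lda player+3    ; high byte
        sta ptr+1
</programlisting>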
<para>
Arrays of records are more interesting. There are two
possibilities for these. The way most high level languages
treat it is by keeping the records contiguous. If you have an
array of two sixteen bit integers, then the records are stored
in order, one at a time. The first is in location $1000, the
next in $1004, the next in $1008, and so on. You can do this
with the 6502, but you'll probably have to use the indirect
indexed mode if you want to be able to iterate
conveniently.
</para>
<para>
Another, more unusual, but more efficient approach is to keep
each byte as a separate array, just like in the arrays example
above. To illustrate, here's a little bit of code to go through
a contiguous array of 16 bit integers, adding their values to
some <literal>total</literal> variable:
</para>
<programlisting>
ldx #$10 ; Number of elements in the array
ldy #$00 ; Byte index from array start
loop:
clc
lda array, y ; Low byte
adc total
sta total
lda array+1, y ; High byte
adc total+1
sta total+1
iny ; Jump ahead to next entry
iny
dex ; Check for loop termination
bne loop
</programlisting>
<para>
And here's the same loop, keeping the high and low bytes in
separate arrays:
</para>
<programlisting>
ldx #$00
loop:
clc
lda lowbyte,x
adc total
sta total
lda highbyte,x
adc total+1
sta total+1
inx
cpx #$10
bne loop
</programlisting>
<para>
Which approach is the right one depends on what you're doing.
For large arrays, the first approach is better, as you only need
to maintain one base pointer. For smaller arrays, the easier
indexing makes the second approach more convenient.
</para>
</section>
<section>
<title>Bitfields</title>
<para>
To store values that are smaller than a byte, you can save space
by putting multiple values in a byte. To extract a sub-byte
value, use the bitmasking commands:
</para>
<itemizedlist>
<listitem><para>To set bits, use the <literal>ORA</literal> command. <literal>ORA #$0F</literal> sets the lower four bits to 1 and leaves the rest unchanged.</para></listitem>
<listitem><para>To clear bits, use the <literal>AND</literal> command. <literal>AND #$F0</literal> sets the lower four bits to 0 and leaves the rest unchanged.</para></listitem>
<listitem><para>To reverse bits, use the <literal>EOR</literal> command. <literal>EOR #$0F</literal> reverses the lower four bits and leaves the rest unchanged.</para></listitem>
<listitem><para>To test if a bit is 0, AND away everything but that bit, then see if the Zero bit was set. If the bit is in the top two bits of a memory location, you can use the BIT command instead (which stores bit 7 in the Negative bit, and bit 6 in the Overflow bit).</para></listitem>
</itemizedlist>
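<para>
A small sketch, assuming a <literal>status</literal> byte whose
low four bits are a counter and whose top bit is a flag (both the
variable and the labels are invented):
</para>
<programlisting>
        lda status
        and #$0f        ; keep only the counter bits
        tax             ; counter value now in X

        lda status
        ora #$80        ; switch the flag bit on
        sta status

        bit status      ; copies bit 7 into the Negative flag
        bmi flag'is'set ; taken when the flag bit is 1
</programlisting>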
</section>
</section>
<section>
<title>A modest example: Insertion sort on linked lists</title>
<para>
To demonstrate these techniques, we will now produce code to
perform insertion sort on a linked list. We'll start by defining
our data structure, then defining the routines we want to write,
then producing actual code for those routines. A downloadable
version that will run unmodified on a Commodore 64 closes the
chapter.
</para>
<section>
<title>The data structure</title>
<para>
We don't really want to have to deal with pointers if we can
possibly avoid it, but it's hard to do a linked list without
them. Instead of pointers, we will
use <emphasis>cursors</emphasis>: small integers that represent
the index into the array of values. This lets us use the
many-small-byte-arrays technique for our data. Furthermore, our
random data that we're sorting never has to move, so we may
declare it as a constant and only bother with changing the
values of <literal>head</literal> and
the <literal>next</literal> arrays. The data record definition
looks like this:
</para>
<programlisting>
head : byte;
data : const int[16] = [838, 618, 205, 984, 724, 301, 249, 946,
925, 43, 114, 697, 985, 633, 312, 86];
next : byte[16];
</programlisting>
<para>
Exactly how this gets represented will vary from assembler to
assembler. Ophis does it like this:
</para>
<programlisting>
.data
.space head 1
.space next 16
.text
lb: .byte &lt;$838,&lt;$618,&lt;$205,&lt;$984,&lt;$724,&lt;$301,&lt;$249,&lt;$946
.byte &lt;$925,&lt;$043,&lt;$114,&lt;$697,&lt;$985,&lt;$633,&lt;$312,&lt;$086
hb: .byte >$838,>$618,>$205,>$984,>$724,>$301,>$249,>$946
.byte >$925,>$043,>$114,>$697,>$985,>$633,>$312,>$086
</programlisting>
</section>
<section>
<title>Doing an insertion sort</title>
<para>
To do an insertion sort, we clear the list by setting the 'head'
value to -1, and then insert each element into the list one at a
time, placing each element in its proper order in the list. We
can consider the lb/hb structure alone as an array of 16
integers, and just insert each one into the list one at a
time.
</para>
<programlisting>
procedure insertion_sort
head := -1;
for i := 0 to 15 do
insert_elt i
end
end
</programlisting>
<para>
This translates pretty directly. We'll have insert_elt take its
argument in the X register, and loop with that. However, given
that insert_elt is going to be a complex procedure, we'll save
the value first. The assembler code becomes:
</para>
<programlisting>
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; insertion'sort: Sorts the list defined by head, next, hb, lb.
; Arguments: None.
; Modifies: All registers destroyed, head and next array sorted.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
insertion'sort:
lda #$FF ; Clear list by storing the terminator in 'head'
sta head
ldx #$0 ; Loop through the lb/hb array, adding each
insertion'sort'loop: ; element one at a time
txa
pha
jsr insert_elt
pla
tax
inx
cpx #$10
bne insertion'sort'loop
rts
</programlisting>
</section>
<section>
<title>Inserting an element</title>
<para>
The pseudocode for inserting an element is a bit more
complicated. If the list is empty, or the value we're inserting
goes at the front, then we have to update the value
of <literal>head</literal>. Otherwise, we can iterate through
the list until we find the element that our value fits in after
(so, the first element whose successor is larger than our
value). Then we update the next pointers directly and exit.
</para>
<programlisting>
procedure insert_elt i
begin
if head = -1 then begin
head := i;
next[i] := -1;
return;
end;
val := data[i];
if val &lt; data[head] then begin
next[i] := head;
head := i;
return;
end;
current := head;
while (next[current] &lt;&gt; -1 and data[next[current]] &lt; val) do
current := next[current];
end;
next[i] := next[current];
next[current] := i;
end;
</programlisting>
<para>
This produces the following rather hefty chunk of code:
</para>
<programlisting>
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; insert_elt: Insert an element into the linked list. Maintains the
; list in sorted, ascending order. Used by
; insertion'sort.
; Arguments: X register holds the index of the element to add.
; Modifies: All registers destroyed; head and next arrays updated
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.data
.space lbtoinsert 1
.space hbtoinsert 1
.space indextoinsert 1
.text
insert_elt:
ldy head ; If the list is empty, make
cpy #$FF ; head point at it, and return.
bne insert_elt'list'not'empty
stx head
tya
sta next,x
rts
insert_elt'list'not'empty:
lda lb,x ; Cache the data we're inserting
sta lbtoinsert
lda hb,x
sta hbtoinsert
stx indextoinsert
ldy head ; Compare the first value with
sec ; the data. If the data must
lda lb,y ; be inserted at the front...
sbc lbtoinsert
lda hb,y
sbc hbtoinsert
bmi insert_elt'not'smallest
tya ; Set its next pointer to the
sta next,x ; old head, update the head
stx head ; pointer, and return.
rts
insert_elt'not'smallest:
ldx head
insert_elt'loop: ; At this point, we know that
lda next,x ; argument > data[X].
tay
cpy #$FF ; if next[X] = #$FF, insert arg at end.
beq insert_elt'insert'after'current
lda lb,y ; Otherwise, compare arg to
sec ; data[next[X]]. If we insert
sbc lbtoinsert ; before that...
lda hb,y
sbc hbtoinsert
bmi insert_elt'goto'next
insert_elt'insert'after'current: ; Fix up all the next links
tya
ldy indextoinsert
sta next,y
tya
sta next,x
rts ; and return.
insert_elt'goto'next: ; Otherwise, let X = next[X]
tya ; and go looping again.
tax
jmp insert_elt'loop
</programlisting>
</section>
<section>
<title>The complete application</title>
<para>
The full application, which deals with interfacing with CBM
BASIC and handles console I/O and such, is
in <xref linkend="structure-src" endterm="structure-fname">.
</para>
</section>
</section>
</chapter>

doc/hll3.sgm Normal file

@ -0,0 +1,297 @@
<chapter id="hll3">
<title>Pointers and Indirection</title>
<para>
The basics of pointers versus cursors (or, at the 6502 assembler
level, the indirect indexed addressing mode versus the absolute
indexed ones) were covered in <xref linkend="hll2">. This essay seeks
to explain the uses of the indirect modes, and how to implement
pointer operations with them. It does <emphasis>not</emphasis> seek to explain
why you'd want to use pointers for something to begin with; for a
tutorial on proper pointer usage, consult any decent C textbook.
</para>
<section>
<title>The absolute basics</title>
<para>
A pointer is a variable holding the address of a memory location.
Memory locations take 16 bits to represent on the 6502: thus, we
need two bytes to hold it. Any decent assembler will have ways of
taking the high and low bytes of an address; use these to acquire
the raw values you need. The 6502 chip does not have any
simple <quote>pure</quote> indirect modes (except
for <literal>JMP</literal>, which is a matter for a later essay);
all are indexed, and they're indexed different ways depending on
which index register you use.
</para>
<section>
<title>The simplest example</title>
<para>
When doing a simple, direct dereference (that is, something
equivalent to the C code <literal>c=*b;</literal>) the code
looks like this:
</para>
<programlisting>
ldy #0
lda (b), y
sta c
</programlisting>
<para>
Even with this simple example, there are several important
things to notice.
</para>
<itemizedlist>
<listitem>
<para>
The variable <literal>b</literal> <emphasis>must be on the
zero page</emphasis>, and furthermore, it <emphasis>cannot
be $FF.</emphasis> All your pointer values need to be
either stored on the zero page to begin with or copied
there before use.
</para>
</listitem>
<listitem>
<para>
The <literal>y</literal> in the <literal>lda</literal>
statement must be y. It cannot be x (that's a different
form of indirection), and it cannot be a constant. If
you're doing a lot of indirection, be sure to keep your Y
register free to handle the indexing on the
pointers.
</para>
</listitem>
<listitem>
<para>
The <literal>b</literal> variable is used alone. Statements
like <literal>lda (b+2), y</literal> are syntactically valid
and sometimes even correct: it dereferences the value next
to <literal>b</literal> after adding y to the value therein.
However, it is almost guaranteed that what you
<emphasis>really</emphasis> wanted to do was
compute <literal>*(b+2)</literal> (that is,
take the address of b, add 2 to <emphasis>that</emphasis>,
and dereference that value); see the next section for how to
do this properly.
</para>
</listitem>
</itemizedlist>
<para>
In nearly all cases, it is the Y-register's version (Indirect
Indexed) that you want to use when you're dealing with pointers.
Even though either version could be used for this example, we
use the Y register to establish this habit.
</para>
</section>
</section>
<section>
<title>Pointer arithmetic</title>
<para>
Pointer arithmetic is an obscenely powerful and dangerous
technique. However, it's the most straightforward way to deal
with enormous arrays, structs, indexable stacks, and nearly
everything you do in C. (C has no native array or string types
primarily because it allows arbitrary pointer arithmetic, which is
strong enough to handle all of those without complaint and at
blazing speed. It also allows for all kinds of buffer overrun
security holes, but let's face it, who's going to be cracking root
on your Apple II?) There are a number of ways to implement this
on the 6502. We'll deal with them in increasing order of design
complexity.
</para>
<section>
<title>The straightforward, slow way</title>
<para>
When computing a pointer value, you simply treat the pointer as
if it were a 16-bit integer. Do all the math you need, then
when the time comes to dereference it, simply do a direct
dereference as above. This is definitely doable, and it's not
difficult. However, it is costly in both space and time.
</para>
<para>
When dealing with arbitrary indices large enough that they won't
fit in the Y register, or when creating values that you don't
intend to dereference (such as subtracting two pointers to find
the length of a string), this is also the only truly usable
technique.
</para>
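<para>
A sketch of that approach: compute <literal>base + offset</literal>
as a 16-bit sum into <literal>ptr</literal>, then dereference it.
The variable names are placeholders, and <literal>ptr</literal>
must live on the zero page:
</para>
<programlisting>
        clc
        lda base        ; low byte of the pointer value
        adc offset
        sta ptr
        lda base+1      ; high byte, plus the carry
        adc offset+1
        sta ptr+1
        ldy #0          ; now an ordinary direct dereference
        lda (ptr), y
</programlisting>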
</section>
<section>
<title>The clever fast way</title>
<para>
But wait, you say. Often when we compute a value, at least one
of the operations is going to be an addition, and we're almost
certain to have that value be less than 256! Surely we may save
ourselves an operation by loading that value into the Y register
and having the load operation itself perform the final
addition!
</para>
<para>
Very good. This is the fastest technique, and sometimes it's
even the most readable. These cases usually involve repeated
reading of various fields from a structure or record. The base
pointer always points to the base of the structure (or the top
of the local variable list, or what have you) and the Y register
takes values that index into that structure. This lets you keep
the pointer variable in memory largely static and requires no
explicit arithmetic instructions at all.
</para>
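<para>
For instance, if <literal>ptr</literal> points at the base of a
record laid out as two one-byte fields followed by a 16-bit value
(an assumed layout, for illustration), the fields are read by
changing only the Y register:
</para>
<programlisting>
        ldy #0
        lda (ptr), y    ; first one-byte field
        ldy #1
        lda (ptr), y    ; second one-byte field
        ldy #2
        lda (ptr), y    ; low byte of the 16-bit field
        ldy #3
        lda (ptr), y    ; high byte
</programlisting>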
<para>
However, this technique is highly opaque and should always be
well documented, indicating exactly what you think you're
pointing at. Then, when you get garbage results, you can
compare your comments and the resulting Y values with the actual
definition of the structure to see who's screwing up.
</para>
<para>
For a case where we still need to do arithmetic, consider the
classic case of needing to clear out a large chunk of memory.
The following code fills the 4KB of memory between $C000 and
$D000 with zeroes:
</para>
<programlisting>
lda #$C0 ; Store #$C000 in mem (low byte first)
sta mem+1
lda #$00
sta mem
ldx #$10 ; x counts the outer loop: 16 pages of 256 bytes = 4KB
tay ; accumulator and y are both 0
loop: sta (mem), y
iny
bne loop ; Inner loop ends when y wraps around to 0
inc mem+1 ; "Carry" from the iny to the core pointer
dex ; Decrement outer loop count, quit if done
bne loop
</programlisting>
<para>
Used carefully, proper use of the Y register can make your code
smaller, faster, <emphasis>and</emphasis> more readable. Used
carelessly it can make your code an unreadable, unmaintainable
mess. Use it wisely, and with care, and it will be your
greatest ally in writing flexible code.
</para>
</section>
</section>
<section>
<title>What about Indexed Indirect?</title>
<para>
This essay has concerned itself almost exclusively with the
Indirect Indexed&mdash;or (Indirect), Y&mdash;mode. What about Indexed
Indirect&mdash;(Indirect, X)? This is a <emphasis>much</emphasis>
less useful mode than the Y register's version. While the Y
register indirection lets you implement pointers and arrays in
full generality, the X register is useful for pretty much only one
application: lookup tables for single byte values.
</para>
<para>
Even coming up with a motivating example for this is difficult,
but here goes. Suppose you have multiple, widely disparate
sections of memory that you're watching for signals. The
following routine takes a resource index in the accumulator and
returns the status byte for the corresponding resource.
</para>
<programlisting>
; This data is sitting on the zero page somewhere
resource_status_table: .word resource0_status, resource1_status
.word resource2_status, resource3_status
; etc. etc. etc.
; This is the actual program code
.text
getstatus:
clc ; Multiply argument by 2 before putting it in X, so that it
asl ; produces a value that's properly word-indexed
tax
lda (resource_status_table, x)
rts
</programlisting>
<para>
Why having a routine such as this is better than just having the
calling routine access resourceN_status itself as an absolute
memory load is left as an exercise for the reader. That aside,
this code fragment does serve as a reminder that when indexing an
array of anything other than bytes, you must multiply your index
by the size of the objects you want to index. C does this
automatically&mdash;assembler does not. Stay sharp.
</para>
</section>
<section>
<title>Comparison with the other indexed forms</title>
<para>
Pointers are slow. It sounds odd saying this, when C is the
fastest language around on modern machines precisely because of
its powerful and extensive use of pointers. However, modern
architectures are designed to be optimized for C-style code (as an
example, the x86 architecture allows statements like <literal>mov
eax, [ebx+4*edi]</literal> as a single instruction), while the
6502 is not. An (Indirect, Y) operation can take up to 6 cycles
to complete just on its own, while the preparation of that command
costs additional time <emphasis>and</emphasis> scribbles over a
bunch of registers, meaning memory operations to save the values
and yet more time spent. The simple code given at the beginning
of this essay&mdash;loading <literal>*b</literal> into the
accumulator&mdash;takes 7 cycles, not counting the 6 it takes to
load b with the appropriate value to begin with. If b is known to
contain a specific value, we can write a single Absolute mode
instruction to load its value, which takes only 4 cycles and also
preserves the value in the Y register. Clearly, Absolute mode
should be used whenever possible.
</para>
<para>
One might be tempted to use self-modifying code to solve this
problem. This actually doesn't pay off near enough for the hassle
it generates; for self-modifying code, the address must be
generated, then stored in the instruction, and then the data must
be loaded. Cost: 16 cycles for 2 immediate loads, 2 absolute
stores, and 1 absolute load. For the straight pointer
dereference, we generate the address, store it in the pointer,
clear the index, then dereference that. Cost: 17 cycles for 3
immediate loads, 2 zero page stores, and 1 indirect indexed load.
Furthermore, unlike in the self-modifying case, loops where simple
arithmetic is being continuously performed only require repeating
the final load instruction, which allows for much greater time
savings over an equivalent self-modifying loop.
</para>
<para>
(This point is also completely moot for NES programmers or anyone
else whose programs are sitting in ROM, because programs stored on
a ROM cannot modify themselves.)
</para>
</section>
<section>
<title>Conclusion</title>
<para>
That's pretty much it for pointers. Though they tend to make
programs hairy, and learning how to properly deal with pointers is
what separates real C programmers from the novices, the basic
mechanics of them are not complex. With pointers you can do
efficient passing of large structures, pass-by-reference,
complicated return values, and dynamic memory management&mdash;and
now these wondrous toys may be added to your assembler programs,
too (assuming you have that kind of space to play with).
</para>
</section>
</chapter>

doc/hll4.sgm Normal file

@ -0,0 +1,270 @@
<chapter>
<title>Functionals</title>
<para>
This essay deals with indirect calls. These are the core of an
enormous number of high level languages: LISP's closures, C's
function pointers, C++ and Java's virtual method calls, and some
implementations of the <literal>switch</literal> statement.
</para>
<para>
These techniques vary in complexity, and most will not be
appropriate for large-scale assembler projects. Of them, however,
the Data-Directed approach is the most likely to lead to organized
and maintainable code.
</para>
<section>
<title>Function Pointers</title>
<para>
Because assembly language is totally untyped, function pointers
are the same as any other sixteen-bit integer. This makes
representing them really quite easy; most assemblers let you
store a routine's address simply by naming the routine in
a <literal>.word</literal> directive.
</para>
<para>
To actually invoke one of these methods, copy its address to some
sixteen-bit location (say, <literal>target</literal>); invoking
the method is then a simple matter of using an indirect jump:
the <literal>JMP&nbsp;(target)</literal> instruction.
</para>
<para>
There's really only one subtlety here, and it's that the indirect
jump is an indirect <emphasis>jump</emphasis>, not an
indirect <emphasis>function call</emphasis>. Thus, if some
function <literal>A</literal> makes an indirect jump to some
routine, when that routine returns, it returns to whoever
called <literal>A</literal>, not <literal>A</literal>
itself.
</para>
<para>
There are several ways of dealing with this, but only one correct
way, which is to structure your procedures so that any call
to <literal>JMP&nbsp;(xxxx)</literal> occurs at the very
end.
</para>
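<para>
A minimal sketch, with invented names: the pointer is stored as
data with <literal>.word</literal>, copied into a two-byte
<literal>target</literal> variable, and jumped through as the
last instruction of the routine that uses it:
</para>
<programlisting>
.data
.space target 2                 ; holds the address to jump through
.text
handler: .word my'routine       ; a function pointer, stored as plain data

dispatch:
        lda handler             ; copy the pointer into target
        sta target
        lda handler+1
        sta target+1
        jmp (target)            ; indirect jump, at the very end

my'routine:
        rts                     ; returns to whoever called dispatch
</programlisting>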
</section>
<section>
<title>A quick digression on how subroutines work</title>
<para>
Ordinarily, subroutines are called with <literal>JSR</literal> and
finished with <literal>RTS</literal>. The <literal>JSR</literal>
instruction takes its own address, adds 2 to it, and pushes this
16-bit value on the stack, high byte first, then low byte (so that
the low byte will be popped off first).
</para>
<para>
But wait, you may object. All <literal>JSR</literal> instructions
are three bytes long. This <quote>return address</quote> is in
the middle of the instruction. And you would be quite right;
the <literal>RTS</literal> instruction pops off the 16-bit
address, adds one to it, and <emphasis>then</emphasis> sets the
program counter to that value.
</para>
<para>
So it <emphasis>is</emphasis> possible to set up
a <quote><literal>JSR</literal> indirect</quote> kind of operation
by adding two to the indirect jump's address and then pushing that
value onto the stack before making the jump; however, you wouldn't
want to do this. It takes six bytes and trashes your accumulator,
and you can get the same functionality with half the space and
with no register corruption by simply defining the indirect jump
to be a one-instruction routine and <literal>JSR</literal>-ing to
it directly. As an added bonus, that way if you have multiple
indirect jumps through the same pointer, you don't need to
duplicate the jump instruction.
</para>
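<para>
That one-instruction routine is simply this (the names are
invented); any caller that wants an
indirect <emphasis>call</emphasis> through <literal>target</literal>
just does a <literal>jsr</literal> to it:
</para>
<programlisting>
call'target:
        jmp (target)    ; the entire routine

; elsewhere in the program:
        jsr call'target ; behaves like an indirect subroutine call
</programlisting>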
<para>
Does this mean that abusing <literal>JSR</literal>
and <literal>RTS</literal> is a dead-end, though? Not at all...
</para>
</section>
<section>
<title>Dispatch-on-type and Data-Directed Assembler</title>
<para>
Most of the time, you care about function pointers because you've
arranged them in some kind of table. You hand it an index
representing the type of your argument, or which method it is
you're calling, or some other determinator, and then you index
into an array of routines and execute the right one.
</para>
<para>
Writing a generic routine to do this is kind of a pain. First you
have to pass a 16-bit pointer in, then you have to dereference it
to figure out where your table is, then you have to do an indexed
dereference on <emphasis>that</emphasis> to get the routine you
want to run, then you need to copy it out to somewhere fixed so
that you can write your jump instruction. And making this
non-generic doesn't help a whole lot, since that only saves you
the first two steps, but now you have to write them out in every
single indexed jump instruction. If only there were some way to
easily and quickly pass in a local pointer directly...
</para>
<para>
Something, say, like the <literal>JSR</literal> instruction, only not for
program code.
</para>
<para>
Or we could just use the <literal>JSR</literal> statement itself,
but only call this routine at the ends of other routines, much
like we were organizing for indirect jumps to begin with. This
lets us set up routines that look like this:
</para>
<programlisting>
jump'table'alpha:
jsr do'jump'table
.word alpha'0, alpha'1, alpha'2
</programlisting>
<para>
Where the <literal>alpha'x</literal> routines are the ones to be
called when the index has that value. This leaves the
implementation of do'jump'table, which in this case uses the Y
register to hold the index:
</para>
<programlisting>
do'jump'table: