From 56dd9574028918ba376da8d5deebb504e96a52a1 Mon Sep 17 00:00:00 2001
From: Andrew Makousky <mako0042@umn.edu>
Date: Sun, 27 Dec 2020 09:19:35 -0600
Subject: [PATCH] Mac 128k PAL signals mostly working, hooray!

Also start working on BBU video timing signals, and documentation
improvements.
---
 hardware/fpga/bbu/README.md        |  64 ++++----
 hardware/fpga/bbu/bbu.v            | 227 +++++++++++------------------
 hardware/fpga/bbu/mac128pal.v      | 190 +++++++++++++++++-------
 hardware/fpga/bbu/test_mac128pal.v |   9 +-
 4 files changed, 260 insertions(+), 230 deletions(-)

diff --git a/hardware/fpga/bbu/README.md b/hardware/fpga/bbu/README.md
index 94b372b..5ed1e23 100644
--- a/hardware/fpga/bbu/README.md
+++ b/hardware/fpga/bbu/README.md
@@ -79,11 +79,10 @@ a huge number of pins, its purpose can be summarized as follows.
 * Act as a DRAM controller.  Set the ROM/RAM control signals depending
   on the particular address requested, i.e. `*EN245`, `*ROMEN`,
   `*RAS`, `*CAS0L`, `*CAS0H`, `*CAS1L`, `*CAS1H`, `RAM R/*W`.
-  `*PMCYC` is apparently used to totally disable DRAM row and column
-  access strobes only during startup.  The F257 chips are used to
-  select separate address portions for the DRAM row and column access
-  strobes.  The LS245 chips are used to disable DRAM access during ROM
-  access.
+  `*PMCYC` enables the row/column address multiplexers.  The F257
+  chips are used to select separate address portions for the DRAM row
+  and column access strobes.  The LS245 chips are used to disable DRAM
+  access during ROM access.
 
   DRAM is accessed by sending the row access strobe first, the column
   access strobe second.
@@ -128,7 +127,24 @@ a huge number of pins, its purpose can be summarized as follows.
     2. To enable fast-page mode (FPM) for fetching two 16-bit words in
        sequence (one "longword").  This in turn reduces the BBU's
        memory access overhead and therefore increases the speed of CPU
-       memory accesses.
+       memory accesses.  Guide to the Macintosh family hardware, page
+       401.
+
+* Please note that the framebuffer scanning circuitry only refreshes
+  the DRAM rows controlled by RA0 through RA8.  RA9 is only accessed
+  by software.  That means if there is more than one megabyte of DRAM
+  installed, there must be a software routine to continuously scan a
+  contiguous 2KB buffer within the first 512KB of RAM (or an alternate
+  but equivalent strategy).  This is the same behavior as was used in
+  the Macintosh Plus and earlier.  Guide to the Macintosh family
+  hardware, page 194.
+
+  This is a bit of a bummer, but even though I don't quite understand
+  how 4MB DRAM refresh works, maybe it will "just work" in the real
+  system.
+
+  PLEASE NOTE.  Macintosh SE/30 takes one access cycle every 15.6us
+  for DRAM refresh.
 
 * So, wow.  Here's a list of all possible Macintosh SE RAM
   configurations.
@@ -137,26 +153,13 @@ a huge number of pins, its purpose can be summarized as follows.
   (undocumented), 1MB, 2MB, 4MB.
 
 * Refresh the DRAM by periodically reading some arbitrary memory from
-  every available row.  Unlike the Apple II, the contiguous
-  organization of the screen, sound, and PWM disk speed buffers does
-  not allow for these periodic functions to double as automatic DRAM
+  every available row.  Similar to the Apple II, the framebuffer scan
+  doubles as a DRAM refresh.  Except that high RAM requires software
   refresh.  How does this need play together with the PDS card's
   ability to request priority access over `DTACK`?  Maybe the refresh
   circuitry still continues to function, but without driving DTACK for
   the duration that the PDS card requests driving the signal.
 
-  However, one interesting trick is that the address multiplexers are
-  configured to access alternating DRAM rows when reading consecutive
-  addresses rather than all coming from a single DRAM row.  I am not
-  sure of the motivation behind this, but it seems like it could have
-  been extended so that reading consecutive memory addresses would
-  provide automatic DRAM memory refresh, thus allowing the video
-  circuitry to double in this role without providing the drawbacks of
-  nonlinear video memory to software.
-
-  Unfortunately, this scheme also complicates reusing the same DRAM
-  row for performance improvements.
-
 * Scan the CRT by driving the primary digital control signals
   (`*VSYNC`, `*HSYNC`, `VIDOUT`).  Read directly from RAM buffers as
   required, and use `*DTACK` to prevent the CPU from accessing RAM at
@@ -201,6 +204,11 @@ The following I/O chips are connected to the BBU:
 Other chips that are connected to the BBU are mainly interfaced via
 only simple, single-pin interfaces.
 
+Please note that PDS cards can also access DRAM, not just the CPU.
+This is mainly a matter of bus arbitration, then as far as the BBU is
+concerned, PDS access to DRAM should appear identical to CPU access to
+DRAM.  Guide to the Macintosh family hardware, page 84.
+
 ----------
 
 ## More explanation on pin functions
@@ -268,11 +276,13 @@ only simple, single-pin interfaces.
   performance and lower memory access time.
 
 * `*PMCYC` is an output signal.  Its primary conceptual purpose is to
-  define "whose turn" it is to access DRAM, the CPU or the BBU?  This
-  could be as simple as a 1 MHz clock, since the CPU always takes a
-  multiple of 4 clock cycles at 8 MHz to access DRAM.  The symbol is
-  probably short for Processor Memory CYCle.  It only connects to the
-  PDS slot and the F257 chips.
+  define "whose turn" it is to access DRAM, the CPU or the BBU?  In
+  the Macintosh Plus, this was a simple 1 MHz clock, since the CPU
+  always takes a multiple of 4 clock cycles at 8 MHz to access DRAM.
+  But the Macintosh SE uses a more sophisticated pattern to give the
+  CPU as large a time share as possible to access DRAM.  The symbol
+  is probably short for Processor Memory CYCle.  It only connects to
+  the PDS slot and the F257 chips.
 
 ----------
 
@@ -340,7 +350,7 @@ signals.
   itself and the CPU is simply instructed to wait additional cycles by
   holding the `*DTACK` signal deasserted.
 
-* Implement bank switching to allow access to more than 4 MB of RAM
+* Implement bank switching to allow access to more than 4MB of RAM
   without requiring a CPU that is capable of virtual memory.  The
   original MC68000 CPU in particular does not allow for
   exception-handling that repeats execution of a faulted instruction,
diff --git a/hardware/fpga/bbu/bbu.v b/hardware/fpga/bbu/bbu.v
index ced6f0d..5e75cb6 100644
--- a/hardware/fpga/bbu/bbu.v
+++ b/hardware/fpga/bbu/bbu.v
@@ -214,13 +214,11 @@ module bbu_master_ctrl
    // SCSI signals
    output wire n_scsi;
    input wire scsidrq;
-   output reg n_dack;
+   output wire n_dack;
    // PDS signals
    input wire n_extdtk;
    output reg n_earen; // ??? Purpose unknown.
 
-   // Note tristate inout ... 'bz for high impedance.  8'bz for wide.
-
    // Full DRAM address bus snooping?  I almost thought this was
    // required to implement some functions, but it turns out it isn't,
    // partial address bus snooping is good enough.  Nevertheless, I'll
@@ -230,11 +228,8 @@ module bbu_master_ctrl
    // Installed RAM size.
    wire [23:0] ramsz;
 
-   // TODO MOVE DOCUMENTATION: PLEASE NOTE, PDS cards can also access
-   // DRAM, not just the CPU.  This is mainly a matter of bus
-   // arbitration, then as far s the BBU is concerned, PDS access to
-   // DRAM should appear identical to CPU access to DRAM.  Guide to
-   // the Macintosh Family hardware, page 84.
+   wire n_dtack_peri; // `*DTACK` for peripherals
+   wire n_dtack_bbu; // Holds `*DTACK` high for BBU RAM accesses
 
    //////////////////////////////////////////////////
    // Pure combinatorial logic is defined first.
@@ -249,45 +244,17 @@ module bbu_master_ctrl
    // SCSI IRQ line attaches directly to `*IPL0`?
    assign n_ipl0 = ~n_ipl1 | n_viairq;
 
+   // Tri-state `*DTACK` when `*EXTDTK` is asserted.
+   assign n_dtack = (n_extdtk) ? (n_dtack_peri | n_dtack_bbu) : 'bz;
+
    //////////////////////////////////////////////////
    // Sub-modules are instantiated here.
 
    // The remainder of definitions are for sequential logic.
    always @(negedge n_res) begin
       // Initialize all output registers on RESET.
-      n_dack <= 1;
       n_earen <= 1;
    end
-
-   always @(posedge c16m) begin
-      if (n_res) begin
-	 // All high speed sequential logic goes here.
-      end
-   end
-
-   always @(posedge c8m) begin
-      if (n_res) begin
-	 // All CPU speed sequential logic goes here.
-      end
-   end
-
-   always @(posedge c3_7m) begin
-      if (n_res) begin
-	 // All peripheral speed sequential logic goes here.
-      end
-   end
-
-   always @(posedge c2m) begin
-      if (n_res) begin
-	 // Only DRAM operations go here.
-      end
-   end
-
-   always @(negedge c2m) begin
-      if (n_res) begin
-	 // Only DRAM operations go here.
-      end
-   end
 endmodule
 
 /*
@@ -353,7 +320,7 @@ Write down all my questions thus far about the BBU:
 
 // Clock divider module.  Generate the frequency-divided clock
 // signals.
-module clock_div (n_res, c16m, c8m, c3_7m, c2m_e, n_pmcyc, pmcyc_pt);
+module clock_div (n_res, c16m, c8m, c3_7m, c2m_e);
    input wire n_res;
    input wire c16m;
    output reg c8m;
@@ -362,30 +329,6 @@ module clock_div (n_res, c16m, c8m, c3_7m, c2m_e, n_pmcyc, pmcyc_pt);
    // This is just an I/O argument placeholder.  We still generate the
    // signal internally, though.
    input wire c2m_e;
-   output wire n_pmcyc;
-   // *PMCYC "pre-trigger": will the *PMCYC state be negated on the
-   // next cycle?
-   output wire pmcyc_pt;
-   // TODO FIXME: `*PMCYC` should not be a strict 1MHz clock, because
-   // during vertical blanking, all cycles (except for horizontal
-   // blanking sound cycles) are fair game for CPU use.  PLEASE NOTE:
-   // According to Guide to the Macintosh family hardware, page 194,
-   // the process of scanning the screen buffer also refreshes the
-   // DRAM.  But I don't quite understand how this works, wouldn't you
-   // need to access more addresses to refresh all the DRAM?  But,
-   // PLEASE NOTE.  Macintosh SE/30 takes one access cycle every
-   // 15.6us for DRAM refresh.
-
-   // So, what's the secret sauce of the Macintosh SE being more
-   // performant in memory access?  Guide to the Macintosh family
-   // hardware, page 401.  During the BBU memory access cycle time,
-   // unlike earlier models that would only read one word, the BBU
-   // reads two 16-bit words.  Yes, so it does do buffering!  This
-   // allows the CPU to have free access to the next two cycles.  So,
-   // the word is hard and strong now, `*PMCYC` is not a simple 1MHz
-   // clock, but has a much more complex timing circuit.  That equates
-   // to a 200% memory access speedup during screen scanning in the
-   // Macintosh SE compared to the Macintosh Plus.
 
    /* Inside Macintosh claims that the serial clock is 3.672 MHz.
       Clock multiplication (via PLL) and division can be used to
@@ -435,9 +378,7 @@ module clock_div (n_res, c16m, c8m, c3_7m, c2m_e, n_pmcyc, pmcyc_pt);
    reg c2m;
    reg c1m;
 
-   assign pmcyc_pt = c16m_div16_cntr[7];
    // assign c2m_e = c2m;
-   assign n_pmcyc = c1m;
 
    always @(negedge n_res) begin
       // Initialize all output registers on RESET.
@@ -714,13 +655,13 @@ endmodule
    executes for an even number of clock cycles (divisible by 2), and
    there is no pipelining in these early CPUs.
 */
-module decode_devaddr (n_res, clk, n_ramen, n_romen, n_scsi, scsidrq,
+module decode_devaddr (n_res, c16m, n_ramen, n_romen, n_scsi, scsidrq,
 		       n_dack, n_sccen, n_sccrd, n_iow, n_iwm, via_cs1,
 		       n_vpa, n_berr, n_as, a23_19, a9, n_extdtk,
 		       boot_overlay, r_n_w, reg_romen, reg_ram_w,
 		       n_dtack_peri);
    input wire n_res;
-   input wire clk;
+   input wire c16m;
    output wire n_ramen;
    output wire n_romen;
    output wire n_scsi;
@@ -744,7 +685,7 @@ module decode_devaddr (n_res, clk, n_ramen, n_romen, n_scsi, scsidrq,
    // Has an address access to the regular *ROMEN zone occurred?  This
    // signal is used to disable the boot-time memory overlay.
    output wire reg_romen;
-   output wire n_dtack_peri; // *DTACK for peripherals
+   output wire n_dtack_peri; // `*DTACK` for peripherals
 
    wire reg_ram, reg_ram_r;
    wire scdma; // host requested performing a SCSI pseudo-DMA read/write
@@ -826,13 +767,13 @@ module decode_devaddr (n_res, clk, n_ramen, n_romen, n_scsi, scsidrq,
    // `*DTACK` on high-impedance when `*EXTDTK` is asserted.
 
    // assign n_dtack_peri =
-   //   n_extdtk ? (n_as | ((n_vpa | scdma) & n_dack)) : 'bz;
+   //   (n_extdtk) ? (n_as | ((n_vpa | scdma) & n_dack)) : 'bz;
 
    always @(negedge n_res) begin
       berr_cntr <= 0;
    end
 
-   always @(posedge clk) begin
+   always @(posedge c16m) begin
       if (n_res) begin
 	 if (n_as)
 	   berr_cntr <= 0;
@@ -879,7 +820,7 @@ endmodule
 
 // Column address strobe decode logic.  Determine which column access
 // strobe line to assert based off of the installed RAM, high-order
-// CPU address lines, and *LDS/*UDS signals.
+// CPU address lines, and *UDS/*LDS signals.
 module dramctl_cas (n_cas, n_cas0h, n_cas0l, n_cas1h, n_cas1l,
 		    n_uds, n_lds, row2, mbram, s64kram,
 		    a17, a19, a21);
@@ -904,7 +845,6 @@ endmodule
 
 // RA7/RA9 selector logic.  Determine which CPU address pins should be
 // routed to these RAM address pins based off of the installed RAM.
-// TODO FIXME: This is incorrect in light of new knowledge.
 module dramctl_ra7_9 (ra7, ra9, cas_n_ras, row2, mbram, s64kram,
 		      a9, a17, a19, a20, a10);
    output wire ra7;
@@ -920,11 +860,11 @@ module dramctl_ra7_9 (ra7, ra9, cas_n_ras, row2, mbram, s64kram,
      = (s64kram) ? // 64K RAM SIMMs
        (~cas_n_ras) ? a9 : a10
        : // 256K RAM SIMMs and 1MB RAM SIMMs
-       (~cas_n_ras) ? a17 : a9
+       (~cas_n_ras) ? a9 : a17
    ;
    assign ra9
      = (mbram) ? // 1MB RAM SIMMs
-       (~cas_n_ras) ? a20 : a19
+       (~cas_n_ras) ? a19 : a20
        : // <1MB RAM SIMMs
        0 // RA9 is not used
    ;
@@ -1060,8 +1000,11 @@ module dramctl_cpu (n_res, clk, r_n_w, c2m,
    // At a higher level, it is used to determine whether it is the
    // CPU's turn to access RAM or the BBU's turn to access RAM.  The
    // CPU always takes a multiple of 4 clock cycles running at 8 MHz
-   // to access RAM.  This signal could possibly be just wired up to a
-   // 1 MHz clock.
+   // to access RAM.  In the Macintosh Plus, this signal was wired up
+   // to a 1 MHz clock, but the Macintosh SE uses a more sophisticated
+   // approach.
+   // TODO FIXME: Implement `*PMCYC` generation logic, comes from the
+   // video timers module.
    input wire n_pmcyc;
    // output reg n_pmcyc;
    output reg n_dtack;
@@ -1354,6 +1297,10 @@ endmodule
 // exercise because of Verilog silliness.  Actually, might as well
 // make two modules since that is all that is needed to start: one for
 // video, one for DRAM.
+
+// TODO FIXME: We must be able to support Fast Page Mode (FPM) for
+// video memory access too.  But we don't do this for the sound
+// buffer.
 module fetch_vid_addr (n_res, clk, n_as, a, vidreg, s64kram);
    input wire n_res;
    input wire clk;
@@ -1399,9 +1346,21 @@ module avtimers ();
    input wire n_res;
    input wire c16m;
 
+   input wire c8m;
+   input wire c4m;
+   input wire c2m;
+   input wire c1m;
+
+   input wire [23:0] vid_main_addr; // Address of main video buffer
+   input wire [23:0] vid_alt_addr;  // Address of alternate video buffer
+   input wire [23:0] snddsk_main_addr; // Address of main sound/disk buffer
+   // Address of alternate sound/disk buffer
+   input wire [23:0] snddsk_alt_addr;
+
    // Video signals
    input wire vidpg2;  // VIDPG2 signal
-   output reg vidout;  // VIDOUT signal
+   output wire vidout;  // VIDOUT signal
+   output wire n_hsync_pt; // *HSYNC pre-trigger
    output reg n_hsync; // *HSYNC signal
    output reg n_vsync; // *VSYNC signal
 
@@ -1423,12 +1382,10 @@ module avtimers ();
 
    // *HSYNC and *VSYNC counters are negative during blanking.
    reg [15:0] vidout_sreg;    // VIDOUT shift register
-   reg [4:0]  vidout_cntr;    // VIDOUT remaining counter
-   reg [9:0]  vid_hsync_cntr; // *HSYNC counter
-   reg [8:0]  vid_vsync_cntr; // *VSYNC counter
-
-   wire [23:0] vid_main_addr; // Address of main video buffer
-   wire [23:0] vid_alt_addr;  // Address of alternate video buffer
+   wire [4:0] c16m_cntr;      // 16 MHz sub-cycle counter
+   reg n_ldps;
+   reg slice_cntr;            // Used to alter carry propagation
+   reg [14:0] va;             // Video address counter
 
    // Sound and disk speed buffers are scanned 370 words per video
    // frame, and the size of both buffers together is 370 words.  Or,
@@ -1453,9 +1410,6 @@ module avtimers ();
 
    reg [15:0] snddsk_reg; // PCM sound sample and disk speed register
 
-   wire [23:0] snddsk_main_addr; // Address of main sound/disk buffer
-   wire [23:0] snddsk_alt_addr;  // Address of alternate sound/disk buffer
-
    // We must be careful that the sound circuitry does not attempt to
    // access RAM at the same time as the video circuitry.  Because the
    // phases are coherent, we can simply align the sound and disk
@@ -1478,26 +1432,62 @@ module avtimers ();
    // been used.  This is going to be a one-shot countdown timer for
    // generating a single pulse per byte.
 
+   // The current 16 MHz cycle # can easily be determined from our
+   // divided clock frequencies.
+   assign c16m_cntr = { c1m, c2m, c4m, c8m };
+   assign vidout = vidout_sreg[15];
+
    always @(negedge n_res) begin
       // Initialize all output registers on RESET.
 
-      vidout <= 0; n_hsync <= 1; n_vsync <= 1;
+      n_hsync <= 1; n_vsync <= 1;
 
       snd <= 0; pwm <= 0;
 
       // Initialize all internal registers on RESET.
       vidout_sreg <= 0;
-      vidout_cntr <= 0;
-      vid_hsync_cntr <= 0;
-      vid_vsync_cntr <= 0;
+      va <= 0;
       snddsk_reg <= 0;
    end
+
+   // N.B. Now this is tricky.  Our load pixel shifter is carefully
+   // timed to happen immediately after the last pixel is displayed
+   // and as soon as the next value is available from DRAM.  This
+   // means that we actually offset the horizontal blanking signal by
+   // a nominal amount in comparison to the video address counter
+   // increments to compensate.
+
+   // Okay, here's the trick with FPM fetches.  We still need to count
+   // by 16 on the video address so we can time the 16-bit sound load
+   // at the end of the cycle correctly, but we use a double-width
+   // video shift register and only trigger video load half as often.
+
+   always @(posedge c16m) begin
+      if (n_ldps) begin
+	 // Fill the least significant bit with logic one so that the
+	 // CRT beam is off during blanking.
+	vidout_sreg <= { vidout_sreg[14:0], 1'b1 };
+      end
+      else
+	vidout_sreg <= 0; // TODO load new value.
+
+      // Increment the video address on every 1 MHz clock cycle.
+      // However, on horizontal blanking, we slice the carry until the
+      // end of the interval.
+      if (c16m_cntr == 4'hf) begin
+	 // N.B.: Remember we are counting by 16-bit words.
+	 if (slice_cntr)
+	   va[4:0] <= va[4:0] + 2;
+	 else
+	   va <= va + 2;
+      end
+   end
 endmodule
 
 /* TODO: Summary of what is missing and left to implement: DRAM
    initialization pulses, DRAM refresh, detect 2.5MB of RAM and
    configure address buffers accordingly, video, disk, and audio
-   scanout, EXTDTK yielding.
+   scanout.
 
    Okay, so the VERDICT on DRAM initialization pulses.  We don't
    actually use these as we should, strictly speaking, but why does it
@@ -1512,57 +1502,4 @@ endmodule
    4MB RAM DRAM refresh.  Then we need to do the busywork to implement
    the PWM and video scanout modules and we're done!  */
 
-/*
-
-Now I think I see why there is the funny thing going on with the
-address multiplexers for RAS/CAS.  It is a required modification to
-use DRAM fast-page mode since RAS and CAS are still logically
-"swapped" compared to a contiguous memory layout.  This swapping of
-RAS and CAS is used to get DRAM refresh for free when scanning the
-video framebuffer.
-
-Okay, so let's review in more detail.
-
-Address multiplexer row address outputs:
-
-A2, A3, A4, A5, A6, A7, A8, A10
-
-A9 inputs directly to BBU, controls RA7.
-
-This is a straight match-up to DRAM row address lines.
-
-RA0 A2
-RA1 A3
-RA2 A4
-RA3 A5
-RA4 A6
-RA5 A7
-RA6 A8
-RA7 A9
-RA8 A10
-RA9 A19 (optional) (!)
-
-So, how many longwords for the video framebuffer?
-
-512 x 342 / 32 = 5472 longwords
-In hex: 0x1560
-Number of address bits fully covered by a full scan: 12
-
-Okay, so the question, does it work for DRAM refresh?  Indeed it does!
-Well, the number of longwords swept is great enough to cover all DRAM
-rows for 4MB of RAM, but the address bit mapping appears only to work
-for <=1MB of RAM.
-
-Please note that since we use only a single row access strobe signal
-for both DRAM rows and instead use separate column access strobes to
-differentiate between the rows, even if all the video memory addresses
-are only in one row, we still refresh the other row as long as we
-cover all the row addresses.
-
-RA9 looks to be trouble.  But, the Unitron reverse engineering docs
-almost have a solution.  Set this to A17 (?) and it should "just work"
-I guess.  But why?
-
-*/
-
 `endif // NOT BBU_V
diff --git a/hardware/fpga/bbu/mac128pal.v b/hardware/fpga/bbu/mac128pal.v
index 1fb34e5..84f6726 100644
--- a/hardware/fpga/bbu/mac128pal.v
+++ b/hardware/fpga/bbu/mac128pal.v
@@ -82,21 +82,21 @@ module tsm(simclk, n_res,
    end
 
    // Simulate registered logic.
-   always @(negedge clk) begin
+   always @(posedge clk) begin
       if (n_res) begin
-      ras <= @(posedge clk)
+      ras <=
 	~(~pclk & q1 & s1 // video cycle
 	  | ~pclk & q1 & ~ramen & dtack // processor cycle
 	  | pclk & ~ras); // any other cycle
-      vclk <= @(posedge clk)
+      vclk <=
 	~(~q1 & pclk & q2 & vclk // divide by 8 (1MHz)
 	  | ~vclk & q1
 	  | ~vclk & ~pclk
 	  | ~vclk & ~q2);
-      q1 <= @(posedge clk)
+      q1 <=
 	~(~pclk & q1
 	  | pclk & ~q1); // divide `pclk` by 2 (4MHz)
-      q2 <= @(posedge clk)
+      q2 <=
 	~(~q1 & pclk & q2 // divide by 4 (2MHz)
 	  | ~q2 & q1
 	  | ~q2 & ~pclk);
@@ -136,63 +136,134 @@ module lag(simclk, n_res,
    end
 
    // Simulate registered logic.
-   always @(negedge sysclk) begin
+   always @(posedge sysclk) begin
       if (n_res) begin
-      vshft <= @(posedge sysclk)
+      vshft <=
 	~(s1 & ~vclk & snddma); // one pulse on the falling edge of `vclk`
-      vsync <= @(posedge sysclk)
+      vsync <=
 	~(reslin
 	  | ~vsync & ~l28);
-      // hsync  <= @(posedge sysclk)
+      // hsync  <=
       // 	~(viapb6 & va4 & ~va3 & ~va2 & va1 // begins in 29 (VA5)
       // 	  | /*~ ???*/resnyb
       // 	  | ~hsync & viapb6); // ends in 0F
-      hsync  <= @(posedge sysclk)
-	~(~viapb6 & ~va4 & ~va3 & va2 & va1 // begins in 29 (VA5)
+      // hsync  <=
+      // 	~(~viapb6 & ~va4 & ~va3 & va2 & va1 // begins in 29 (VA5)
+      // 	  | ~hsync & ~va4
+      // 	  | ~hsync & ~viapb6); // ends in 0F
+      // TODO FIXME: This is incorrect, temporary equations in order
+      // to get at least partial behavior for analysis.
+      // TODO FIXME: We trigger hsync a bit too soon at the end of the
+      // scanline.  And, we release it too soon at the beginning.
+      hsync <=
+	~(viapb6 & ~p0q2 & s1 & ~va1 & ~va2 & ~va3 & ~va4
 	  | ~hsync & ~va4
-	  | ~hsync & ~viapb6); // ends in 0F
-      s1 <= @(posedge sysclk)
+	  | ~hsync & ~va3
+	  | ~hsync & va2
+	  | ~hsync & va1);
+      // TODO FIXME: This is not a synthesizable PAL equation.
+      // TODO FIXME: This isn't generating sound buffer accesses
+      // during vertical blanking but it should be.
+      s1 <=
 	~(~p0q2 // 0 for processor and 1 for video
 	  | ~vclk
 	  | ~vsync & hsync
-	  | ~vsync & viapb6 // only in vertical retrace we have sound cycles
-	  | ~viapb6 & hsync & ~va4 & ~va3 & ~va2
+	  | ~vsync & viapb6 // vertical retrace only has sound cycles
+	  /* TODO INVESTIGATE: Line disabled because it drops out
+	     pulses we'd normally expect.  */
+	  /* | ~viapb6 & hsync & ~va4 & ~va3 & ~va2 */
 	  | ~viapb6 & ~hsync & (~va4 | va4 & ~va3 & ~va2 |
 				    va4 & ~va3 & va2 & ~va1));
-      // viapb6 <= @(posedge sysclk)
+      // viapb6 <=
       // 	~(~hsync & resnyb // 1 indicates horizontal retrace (pseudo VA6)
       // 	  | va1 & ~viapb6
       // 	  | va2 & ~viapb6
       // 	  | ~hsync & ~viapb6
       // 	  | resnyb & ~viapb6
       // 	  | vshft & ~viapb6);
-      viapb6 <= @(posedge sysclk)
-	~(hsync & ~va4 & ~va3 & va2 & va1 // 1 indicates horizontal retrace (pseudo VA6)
-	  | ~viapb6 & snddma
-	  | ~viapb6 & vclk);
-      snddma <= @(posedge sysclk)
-	~(viapb6 & va4 & ~va3 & va2 & va1 & p0q2 & vclk & ~hsync // 0 in this output
+      // viapb6 <=
+      // 	~(hsync & ~va4 & ~va3 & va2 & va1 // 1 indicates horizontal retrace (pseudo VA6)
+      // 	  | ~viapb6 & snddma
+      // 	  | ~viapb6 & vclk);
+      // TODO FIXME: This is incorrect, temporary equations in order
+      // to get at least partial behavior for analysis.
+      viapb6 <=
+	~(~hsync // 1 indicates horizontal retrace (pseudo VA6)
+	  | ~viapb6 & p0q2
+	  | ~viapb6 & ~s1
+	  | ~viapb6 & va1
+	  | ~viapb6 & va2
+	  | ~viapb6 & va3
+	  | ~viapb6 & va4);
+      // TODO FIXME HACK: Previously viapb6 but negated for testing.
+      snddma <=
+	~(~viapb6 & va4 & ~va3 & va2 & va1 & p0q2 & vclk & ~hsync // 0 in this output
 	  | ~snddma & vclk); // ... indicates sound cycle
-      reslin <= @(posedge sysclk) // try to generate line 370
+      // TODO FIXME HACK: Previously ~viapb6 but negated for testing.
+      reslin <= // try to generate line 370
 	~(l28
 	  | ~vsync
 	  | hsync
-	  | ~viapb6
-	  | ~vclk);      
-      resnyb <= @(posedge sysclk)
-      	~(vclk // increment VA5:VA14 in 0F and 2B
-      	  | viapb6 // ???
-      	  | va1
-      	  | va2
-      	  | ~viapb6 & va3
-      	  | hsync
-      	  | viapb6 & ~va3
-      	  | ~hsync & va3 & ~va4
-      	  | ~hsync & ~va3 & va4);
+	  | viapb6
+	  | ~vclk);
+      // N.B. Primary conceptual equation:
+      // resnyb <=
+      //   ((~viapb6 & hsync & ~va4 & ~va3 & ~va2 & ~va1)
+      //    | (~viapb6 & ~hsync & va4 & va3 & ~va2 & ~va1));
+      // TODO FIXME HACK: Possibly incorrect interpretation of viapb6
+      // with hsync.
+      resnyb <=
+	~(vclk // increment VA5:VA14 in 0F and 2B
+	  | viapb6
+	  | va1
+	  | va2
+	  | hsync & va4
+	  | hsync & va3
+	  | ~hsync & ~va4
+	  | ~hsync & ~va3
+	  | ~va4 & va3
+	  | va4 & ~va3);
       end
    end
 endmodule
 
+/*
+This LAG doesn't work correctly, here's how it is supposed to work.
+
+1. Count video addresses to 32.  During this count, generate resnyb
+   every time we need a carry.
+
+2. Once we reach 32, that's 512 pixels, one scanline.  Now, we assert
+   the *HSYNC signal.  But please note, at this point we do **not**
+   generate resnyb for the carry at 32, instead we let that counter
+   wrap around to zero without a carry.
+
+3. When the *HSYNC signal is asserted, we only count to 12.  That's
+   192 pixels for horizontal blanking.  Just when we're about to reach
+   the end, we assert the *SNDDMA signal.  That's when we fetch the
+   sound sample, at the very end of horizontal blanking, not the very
+   beginning.
+
+   Finally, we assert resnyb to clear the counter and finally
+   propagate the carry to the next video address.
+
+At the very end of vertical blanking, we assert *RESLIN to clear the
+video address counter back to zero.  Until then, we keep counting
+positive to keep track of the vertical blanking time.
+
+But, to implement this... it's tricky because va5 and va6 are not
+connected to any PAL.  How do we generate the signals then?  We can
+otherwise only count to 16, we need to get to 32.
+
+Okay, I think I've got it figured out.  VIAPB6 is a little white lie,
+it's not the actual horizontal blanking signal, it's a prep signal
+before the horizontal blanking actually occurs.  But, nevertheless, it
+is almost the same thing, 16 cycles at 16MHz rather than 12 cycles.
+For the 8 MHz CPU, it pretty much looks like the same thing.  And
+that's where we hide the additional bit of memory we need.
+
+*/
+
 // 32 active cycles for line  - UA6..UA1 = 0 to 1F
 // 1 cycle for sound/PWM                 = 2B
 // 11 cycles for retrace                 = 20 to 2A
@@ -231,11 +302,12 @@ module bmu1(simclk, n_res,
 		 | a23 & ~a21 & ~as); // for generating DTACK (not accessing ROM: A20)
       ramen <= ~(~a23 & ~a22 & ~a21 & ~as & ~ovlay // 000000
 		 | ~a23 & a22 & a21 & ~as & ovlay); // (600000 with `ovlay`)
-      io1 <= ~(0); // ???
+      io1 <= ~(0); // TODO this indicates we're >= line 28
       l28 <= ~(~l15 & ~va9 & ~va8 & va7 // reached 370 or we don't pass line 28
+	       | ~l28 & ~l15
 	       | ~l28 & ~va9
 	       | ~l28 & ~va8
-	       | ~l28 & ~va7);
+	       | ~l28 & va7);
       end
    end
 endmodule
@@ -275,15 +347,15 @@ module bmu0(simclk, n_res,
    end
 
    // Simulate registered logic.
-   always @(negedge sysclk) begin
+   always @(posedge sysclk) begin
       if (n_res) begin
-      ava14 <= @(posedge sysclk) ~(~va14 & ~va13); // + 1
-      l15 <= @(posedge sysclk)
+      ava14 <= ~(~va14 & ~va13); // + 1
+      l15 <=
 	~(~va14 & ~va13 & ~va12 & ~va11 & ~va10 // we haven't passed line 15
 	  | va14 & ~va13 & va12 & va11 & va10); // passed by 368
-      vid <= @(posedge sysclk)
+      vid <=
 	~(servid); // here we invert: blanking is in `vshft`
-      ava13 <= @(posedge sysclk) ~(va13); // + 1
+      ava13 <= ~(va13); // + 1
       end
    end
 endmodule
@@ -321,20 +393,20 @@ module tsg(simclk, n_res,
    end
 
    // Simulate registered logic.
-   always @(negedge sysclk) begin
+   always @(posedge sysclk) begin
       if (n_res) begin
       // TODO VERIFY: q6 missing?
-      q6 <= @(posedge sysclk) ~(0);
-      clkscc <= @(posedge sysclk)
+      q6 <= ~(0);
+      clkscc <=
 	~(clkscc & ~pclk & ~q4
 	  | clkscc & ~pclk & ~q3
 	  | clkscc & ~pclk & vclk
 	  | ~clkscc & pclk
 	  | ~clkscc & q4 & q3 & ~vclk); // skip one inversion every 32 cycles
-      viacb1 <= @(posedge sysclk) ~(0); // ??? /M nanda
-      pclk <= @(posedge sysclk) ~(pclk); // divide SYSCLK by 2 (8MHz)
-      q3 <= @(posedge sysclk) ~(~vclk); // `sysclk` / 16
-      q4 <= @(posedge sysclk)
+      viacb1 <= ~(0); // ??? /M nanda
+      pclk <= ~(pclk); // divide SYSCLK by 2 (8MHz)
+      q3 <= ~(~vclk); // `sysclk` / 16
+      q4 <=
 	~(q4 & q3 & ~vclk // `sysclk` / 32
 	  | ~q4 & ~q3                       // } J for generating CLKSCC
 	  | ~q4 & vclk);
@@ -421,7 +493,7 @@ module palcl(simclk, vcc, gnd, n_res, n_sysclk,
    // Incremented high-order video address lines
    wire ava14, ava13;
    // Internal video signals
-   wire n_vshft, n_snddma, n_servid;
+   wire vshft, n_snddma, n_servid;
 
    // Address multiplexer signals?
    wire s0, s1, l28, l15, n_245oe;
@@ -446,13 +518,15 @@ module palcl(simclk, vcc, gnd, n_res, n_sysclk,
    wire c8mf, c16mf, c2m, u12f_tc, ram_r_n_w_f;
    wire vmsh; // video mid-shift, connect two register chips together
 
-   wire s5; // This is just a pull-up resistor
+   // This is just a pull-up resistor, possibly connected to a RESET
+   // circuit.
+   wire s5;
 
    // L12 => va13
    // L13 => va14
    // va12 => ava13
    // va13 => ava14
-   // n_ldps => n_vshft
+   // n_ldps => vshft
    // VID/*u => s1 
    // tc => vclk
 
@@ -475,9 +549,13 @@ module palcl(simclk, vcc, gnd, n_res, n_sysclk,
    // TSG Output Enable is controlled by CAS on Macintosh Plus,
    // otherwise just go straight to ground on Macintosh 128k.
    assign tsg_oe3 = gnd;
-   // S5: Pull-up resistor.
+   // S5: Pull-up resistor.  TODO FIXME: Should this be controlled by
+   // another thing too?
    assign s5 = vcc;
 
+   // TODO FIXME: A1 - A13 are connected to a pull-up resistor bank
+   // RP1.
+
    /* N.B. The reason why phase calibration is required in the
       Macintosh 128k/512k/Plus is because the PALs do not have a RESET
       pin.  It is the logic designer's discretion to implement one
@@ -495,9 +573,9 @@ module palcl(simclk, vcc, gnd, n_res, n_sysclk,
 	      n_dmald, n_sndres, , , , , u12f_tc, vcc);
    // Dual video shift registers
    ls166 u10f(vmsh, rdq8, rdq9, rdq10, rdq11, 1'b0, c16mf, gnd,
-	      s5, rdq12, rdq13, rdq14, n_servid, rdq15, n_vshft, vcc);
+	      s5, rdq12, rdq13, rdq14, n_servid, rdq15, vshft, vcc);
    ls166 u11f(s5, rdq0, rdq1, rdq2, rdq3, 1'b0, c16mf, gnd,
-	      s5, rdq4, rdq5, rdq6, vmsh, rdq7, n_vshft, vcc);
+	      s5, rdq4, rdq5, rdq6, vmsh, rdq7, vshft, vcc);
    // Dual RAM data bus transceivers
    ls245 u9e(ram_r_n_w_f, rdq0, rdq1, rdq2, rdq3, rdq4, rdq5, rdq6, rdq7,
 	     gnd, d7, d6, d5, d4, d3, d2, d1, d0, n_245oe, vcc);
@@ -529,7 +607,7 @@ module palcl(simclk, vcc, gnd, n_res, n_sysclk,
    tsm pal0(simclk, n_res, sysclk, sysclk, pclk, s1, n_ramen, n_romen, n_as, n_uds, n_lds,
 	    gnd, tsm_oe1, casl, cash, ras, vclk, p0q2, p0q1, s0, n_dtack, vcc);
    lag pal1(simclk, n_res, sysclk, p2io1, l28, va4, p0q2, vclk, va3, va2, va1,
-	    gnd, lag_oe2, n_vshft, n_vsync, n_hsync, s1, viapb6,
+	    gnd, lag_oe2, vshft, n_vsync, n_hsync, s1, viapb6,
 	    n_snddma, reslin,
 	    resnyb, vcc);
    bmu1 pal2(simclk, n_res, va9, va8, va7, l15, va14, ovlay, a23, a22, a21, gnd,
diff --git a/hardware/fpga/bbu/test_mac128pal.v b/hardware/fpga/bbu/test_mac128pal.v
index b4ce87f..a3d48f9 100644
--- a/hardware/fpga/bbu/test_mac128pal.v
+++ b/hardware/fpga/bbu/test_mac128pal.v
@@ -103,7 +103,10 @@ module test_mac128pal();
 
    // Set simulation time limit.
    initial begin
-      #480000 $finish;
+      // #1920000 $finish;
+      // PLEASE NOTE: We must simulate LOTS of cycles in order to see
+      // what the oscilloscope trace for one video frame looks like.
+      #30720000 $finish;
    end
 
    // We can use `$display()` for printf-style messages and implement
@@ -115,7 +118,9 @@ module test_mac128pal();
 
    // Log to a VCD (Variable Change Dump) file.
    initial begin
-      $dumpfile("test_mac128pal.vcd");
+      // $dumpfile("test_mac128pal.vcd");
+      // Use LXT instead since it is more efficient.
+      $dumpfile("test_mac128pal.lxt");
       $dumpvars;
    end
 endmodule