From 56dd9574028918ba376da8d5deebb504e96a52a1 Mon Sep 17 00:00:00 2001 From: Andrew Makousky Date: Sun, 27 Dec 2020 09:19:35 -0600 Subject: [PATCH] Mac 128k PAL signals mostly working, hooray! Also start working on BBU video timing signals, and documentation improvements. --- hardware/fpga/bbu/README.md | 64 ++++---- hardware/fpga/bbu/bbu.v | 227 +++++++++++------------------ hardware/fpga/bbu/mac128pal.v | 190 +++++++++++++++++------- hardware/fpga/bbu/test_mac128pal.v | 9 +- 4 files changed, 260 insertions(+), 230 deletions(-) diff --git a/hardware/fpga/bbu/README.md b/hardware/fpga/bbu/README.md index 94b372b..5ed1e23 100644 --- a/hardware/fpga/bbu/README.md +++ b/hardware/fpga/bbu/README.md @@ -79,11 +79,10 @@ a huge number of pins, its purpose can be summarized as follows. * Act as a DRAM controller. Set the ROM/RAM control signals depending on the particular address requested, i.e. `*EN245`, `*ROMEN`, `*RAS`, `*CAS0L`, `*CAS0H`, `*CAS1L`, `*CAS1H`, `RAM R/*W`. - `*PMCYC` is apparently used to totally disable DRAM row and column - access strobes only during startup. The F257 chips are used to - select separate address portions for the DRAM row and column access - strobes. The LS245 chips are used to disable DRAM access during ROM - access. + `*PMCYC` enables the row/column address multiplexers. The F257 + chips are used to select separate address portions for the DRAM row + and column access strobes. The LS245 chips are used to disable DRAM + access during ROM access. DRAM is accessed by sending the row access strobe first, the column access strobe second. @@ -128,7 +127,24 @@ a huge number of pins, its purpose can be summarized as follows. 2. To enable fast-page mode (FPM) for fetching two 16-bit words in sequence (one "longword"). This in turn reduces the BBU's memory access overhead and therefore increases the speed of CPU - memory accesses. + memory accesses. Guide to the Macintosh family hardware, page + 401. 
+ +* Please note that the framebuffer scanning circuitry only refreshes + the DRAM rows controlled by RA0 through RA8. RA9 is only accessed + by software. That means if there is more than one megabyte of DRAM + installed, there must be a software routine to continuously scan a + contiguous 2KB buffer within the first 512KB of RAM (or an alternate + but equivalent strategy). This is the same behavior as was used in + the Macintosh Plus and earlier. Guide to the Macintosh family + hardware, page 194 + + This is a bit of a bummer, but even though I don't quite understand + how 4MB DRAM refresh works, maybe it will "just work" in the real + system. + + PLEASE NOTE. Macintosh SE/30 takes one access cycle every 15.6us + for DRAM refresh. * So, wow. Here's a list of all possible Macintosh SE RAM configurations. @@ -137,26 +153,13 @@ a huge number of pins, its purpose can be summarized as follows. (undocumented), 1MB, 2MB, 4MB. * Refresh the DRAM by periodically reading some arbitrary memory from - every available row. Unlike the Apple II, the contiguous - organization of the screen, sound, and PWM disk speed buffers does - not allow for these periodic functions to double as automatic DRAM + every available row. Similar to the Apple II, the framebuffer scan + doubles as a DRAM refresh. Except that high RAM requires software refresh. How does this need play together with the PDS card's ability to request priority access over `DTACK`? Maybe the refresh circuitry still continues to function, but without driving DTACK for the duration that the PDS card requests driving the signal. - However, one interesting trick is that the address multiplexers are - configured to access alternating DRAM rows when reading consecutive - addresses rather than all coming from a single DRAM row. 
I am not - sure of the motivation behind this, but it seems like it could have - been extended so that reading consecutive memory addresses would - provide automatic DRAM memory refresh, thus allowing the video - circuitry to double in this role without providing the drawbacks of - nonlinear video memory to software. - - Unfortunately, this scheme also complicates reusing the same DRAM - row for performance improvements. - * Scan the CRT by driving the primary digital control signals (`*VSYNC`, `*HSYNC`, `VIDOUT`). Read directly from RAM buffers as required, and use `*DTACK` to prevent the CPU from accessing RAM at @@ -201,6 +204,11 @@ The following I/O chips are connected to the BBU: Other chips that are connected to the BBU are mainly interfaced via only simple, single-pin interfaces. +Please note that PDS cards can also access DRAM, not just the CPU. +This is mainly a matter of bus arbitration, then as far as the BBU is +concerned, PDS access to DRAM should appear identical to CPU access to +DRAM. Guide to the Macintosh family hardware, page 84. + ---------- ## More explanation on pin functions @@ -268,11 +276,13 @@ only simple, single-pin interfaces. performance and lower memory access time. * `*PMCYC` is an output signal. Its primary conceptual purpose is to - define "whose turn" it is to access DRAM, the CPU or the BBU? This - could be as simple as a 1 MHz clock, since the CPU always takes a - multiple of 4 clock cycles at 8 MHz to access DRAM. The symbol is - probably short for Processor Memory CYCle. It only connects to the - PDS slot and the F257 chips. + define "whose turn" it is to access DRAM, the CPU or the BBU? In + the Macintosh Plus, this was a simple 1 MHz clock, since the CPU + always takes a multiple of 4 clock cycles at 8 MHz to access DRAM. + But the Macintosh SE uses a more sophisticated pattern to give the + CPU as large of a time share as possible to access DRAM. The symbol + is probably short for Processor Memory CYCle. 
It only connects to + the PDS slot and the F257 chips. ---------- @@ -340,7 +350,7 @@ signals. itself and the CPU is simply instructed to wait additional cycles by holding the `*DTACK` signal deasserted. -* Implement bank switching to allow access to more than 4 MB of RAM +* Implement bank switching to allow access to more than 4MB of RAM without requiring a CPU that is capable of virtual memory. The original MC68000 CPU in particular does not allow for exception-handling that repeats execution of a faulted instruction, diff --git a/hardware/fpga/bbu/bbu.v b/hardware/fpga/bbu/bbu.v index ced6f0d..5e75cb6 100644 --- a/hardware/fpga/bbu/bbu.v +++ b/hardware/fpga/bbu/bbu.v @@ -214,13 +214,11 @@ module bbu_master_ctrl // SCSI signals output wire n_scsi; input wire scsidrq; - output reg n_dack; + output wire n_dack; // PDS signals input wire n_extdtk; output reg n_earen; // ??? Purpose unknown. - // Note tristate inout ... 'bz for high impedance. 8'bz for wide. - // Full DRAM address bus snooping? I almost thought this was // required to implement some functions, but it turns out it isn't, // partial address bus snooping is good enough. Nevertheless, I'll @@ -230,11 +228,8 @@ module bbu_master_ctrl // Installed RAM size. wire [23:0] ramsz; - // TODO MOVE DOCUMENTATION: PLEASE NOTE, PDS cards can also access - // DRAM, not just the CPU. This is mainly a matter of bus - // arbitration, then as far s the BBU is concerned, PDS access to - // DRAM should appear identical to CPU access to DRAM. Guide to - // the Macintosh Family hardware, page 84. + wire n_dtack_peri; // `*DTACK` for peripherals + wire n_dtack_bbu; // Holds `*DTACK` high for BBU RAM accesses ////////////////////////////////////////////////// // Pure combinatorial logic is defined first. @@ -249,45 +244,17 @@ module bbu_master_ctrl // SCSI IRQ line attaches directly to `*IPL0`? assign n_ipl0 = ~n_ipl1 | n_viairq; + // Tri-state `*DTACK` when `*EXTDTK` is asserted. + assign n_dtack = (n_extdtk) ? 
(n_dtack_peri | n_dtack_bbu) : 'bz; + ////////////////////////////////////////////////// // Sub-modules are instantiated here. // The remainder of definitions are for sequential logic. always @(negedge n_res) begin // Initialize all output registers on RESET. - n_dack <= 1; n_earen <= 1; end - - always @(posedge c16m) begin - if (n_res) begin - // All high speed sequential logic goes here. - end - end - - always @(posedge c8m) begin - if (n_res) begin - // All CPU speed sequential logic goes here. - end - end - - always @(posedge c3_7m) begin - if (n_res) begin - // All peripheral speed sequential logic goes here. - end - end - - always @(posedge c2m) begin - if (n_res) begin - // Only DRAM operations go here. - end - end - - always @(negedge c2m) begin - if (n_res) begin - // Only DRAM operations go here. - end - end endmodule /* @@ -353,7 +320,7 @@ Write down all my questions thus far about the BBU: // Clock divider module. Generate the frequency-divided clock // signals. -module clock_div (n_res, c16m, c8m, c3_7m, c2m_e, n_pmcyc, pmcyc_pt); +module clock_div (n_res, c16m, c8m, c3_7m, c2m_e); input wire n_res; input wire c16m; output reg c8m; @@ -362,30 +329,6 @@ module clock_div (n_res, c16m, c8m, c3_7m, c2m_e, n_pmcyc, pmcyc_pt); // This is just an I/O argument placeholder. We still generate the // signal internally, though. input wire c2m_e; - output wire n_pmcyc; - // *PMCYC "pre-trigger": will the *PMCYC state be negated on the - // next cycle? - output wire pmcyc_pt; - // TODO FIXME: `*PMCYC` should not be a strict 1MHz clock, because - // during vertical blanking, all cycles (except for horizontal - // blanking sound cycles) are fair game for CPU use. PLEASE NOTE: - // According to Guide to the Macintosh family hardware, page 194, - // the process of scanning the screen buffer also refreshes the - // DRAM. But I don't quite understand how this works, wouldn't you - // need to access more addresses to refresh all the DRAM? But, - // PLEASE NOTE. 
Macintosh SE/30 takes one access cycle every - // 15.6us for DRAM refresh. - - // So, what's the secret sauce of the Macintosh SE being more - // performant in memory access? Guide to the Macintosh family - // hardware, page 401. During the BBU memory access cycle time, - // unlike earlier models that would only read one word, the BBU - // reads two 16-bit words. Yes, so it does do buffering! This - // allows the CPU to have free access to the next two cycles. So, - // the word is hard and strong now, `*PMCYC` is not a simple 1MHz - // clock, but has a much more complex timing circuit. That equates - // to a 200% memory access speedup during screen scanning in the - // Macintosh SE compared to the Macintosh Plus. /* Inside Macintosh claims that the serial clock is 3.672 MHz. Clock multiplication (via PLL) and division can be used to @@ -435,9 +378,7 @@ module clock_div (n_res, c16m, c8m, c3_7m, c2m_e, n_pmcyc, pmcyc_pt); reg c2m; reg c1m; - assign pmcyc_pt = c16m_div16_cntr[7]; // assign c2m_e = c2m; - assign n_pmcyc = c1m; always @(negedge n_res) begin // Initialize all output registers on RESET. @@ -714,13 +655,13 @@ endmodule executes for an even number of clock cycles (divisible by 2), and there is no pipelining in these early CPUs. */ -module decode_devaddr (n_res, clk, n_ramen, n_romen, n_scsi, scsidrq, +module decode_devaddr (n_res, c16m, n_ramen, n_romen, n_scsi, scsidrq, n_dack, n_sccen, n_sccrd, n_iow, n_iwm, via_cs1, n_vpa, n_berr, n_as, a23_19, a9, n_extdtk, boot_overlay, r_n_w, reg_romen, reg_ram_w, n_dtack_peri); input wire n_res; - input wire clk; + input wire c16m; output wire n_ramen; output wire n_romen; output wire n_scsi; @@ -744,7 +685,7 @@ module decode_devaddr (n_res, clk, n_ramen, n_romen, n_scsi, scsidrq, // Has an address access to the regular *ROMEN zone occurred? This // signal is used to disable the boot-time memory overlay. 
output wire reg_romen; - output wire n_dtack_peri; // *DTACK for peripherals + output wire n_dtack_peri; // `*DTACK` for peripherals wire reg_ram, reg_ram_r; wire scdma; // host requested performing a SCSI pseudo-DMA read/write @@ -826,13 +767,13 @@ module decode_devaddr (n_res, clk, n_ramen, n_romen, n_scsi, scsidrq, // `*DTACK` on high-impedance when `*EXTDTK` is asserted. // assign n_dtack_peri = - // n_extdtk ? (n_as | ((n_vpa | scdma) & n_dack)) : 'bz; + // (n_extdtk) ? (n_as | ((n_vpa | scdma) & n_dack)) : 'bz; always @(negedge n_res) begin berr_cntr <= 0; end - always @(posedge clk) begin + always @(posedge c16m) begin if (n_res) begin if (n_as) berr_cntr <= 0; @@ -879,7 +820,7 @@ endmodule // Column address strobe decode logic. Determine which column access // strobe line to assert based off of the installed RAM, high-order -// CPU address lines, and *LDS/*UDS signals. +// CPU address lines, and *UDS/*LDS signals. module dramctl_cas (n_cas, n_cas0h, n_cas0l, n_cas1h, n_cas1l, n_uds, n_lds, row2, mbram, s64kram, a17, a19, a21); @@ -904,7 +845,6 @@ endmodule // RA7/RA9 selector logic. Determine which CPU address pins should be // routed to these RAM address pins based off of the installed RAM. -// TODO FIXME: This is incorrect in light of new knowledge. module dramctl_ra7_9 (ra7, ra9, cas_n_ras, row2, mbram, s64kram, a9, a17, a19, a20, a10); output wire ra7; @@ -920,11 +860,11 @@ module dramctl_ra7_9 (ra7, ra9, cas_n_ras, row2, mbram, s64kram, = (s64kram) ? // 64K RAM SIMMs (~cas_n_ras) ? a9 : a10 : // 256K RAM SIMMs and 1MB RAM SIMMs - (~cas_n_ras) ? a17 : a9 + (~cas_n_ras) ? a9 : a17 ; assign ra9 = (mbram) ? // 1MB RAM SIMMs - (~cas_n_ras) ? a20 : a19 + (~cas_n_ras) ? a19 : a20 : // <1MB RAM SIMMs 0 // RA9 is not used ; @@ -1060,8 +1000,11 @@ module dramctl_cpu (n_res, clk, r_n_w, c2m, // At a higher level, it is used to determine whether it is the // CPU's turn to access RAM or the BBU's turn to access RAM. 
The // CPU always takes a multiple of 4 clock cycles running at 8 MHz - // to access RAM. This signal could possibly be just wired up to a - // 1 MHz clock. + // to access RAM. In the Macintosh Plus, this signal was wired up + // to a 1 MHz clock, but the Macintosh SE uses a more sophisticated + // approach. + // TODO FIXME: Implement `*PMCYC` generation logic, comes from the + // video timers module. input wire n_pmcyc; // output reg n_pmcyc; output reg n_dtack; @@ -1354,6 +1297,10 @@ endmodule // exercise because of Verilog silliness. Actually, might as well // make two modules since that is all that is needed to start: one for // video, one for DRAM. + +// TODO FIXME: We must be able to support Fast Page Mode (FPM) for +// video memory access too. But we don't do this for the sound +// buffer. module fetch_vid_addr (n_res, clk, n_as, a, vidreg, s64kram); input wire n_res; input wire clk; @@ -1399,9 +1346,21 @@ module avtimers (); input wire n_res; input wire c16m; + input wire c8m; + input wire c4m; + input wire c2m; + input wire c1m; + + input wire [23:0] vid_main_addr; // Address of main video buffer + input wire [23:0] vid_alt_addr; // Address of alternate video buffer + input wire [23:0] snddsk_main_addr; // Address of main sound/disk buffer + // Address of alternate sound/disk buffer + input wire [23:0] snddsk_alt_addr; + // Video signals input wire vidpg2; // VIDPG2 signal - output reg vidout; // VIDOUT signal + output wire vidout; // VIDOUT signal + output wire n_hsync_pt; // *HSYNC pre-trigger output reg n_hsync; // *HSYNC signal output reg n_vsync; // *VSYNC signal @@ -1423,12 +1382,10 @@ module avtimers (); // *HSYNC and *VSYNC counters are negative during blanking. 
reg [15:0] vidout_sreg; // VIDOUT shift register - reg [4:0] vidout_cntr; // VIDOUT remaining counter - reg [9:0] vid_hsync_cntr; // *HSYNC counter - reg [8:0] vid_vsync_cntr; // *VSYNC counter - - wire [23:0] vid_main_addr; // Address of main video buffer - wire [23:0] vid_alt_addr; // Address of alternate video buffer + wire [4:0] c16m_cntr; // 16 MHz sub-cycle counter + reg n_ldps; + reg slice_cntr; // Used to alter carry propagation + reg [14:0] va; // Video address counter // Sound and disk speed buffers are scanned 370 words per video // frame, and the size of both buffers together is 370 words. Or, @@ -1453,9 +1410,6 @@ module avtimers (); reg [15:0] snddsk_reg; // PCM sound sample and disk speed register - wire [23:0] snddsk_main_addr; // Address of main sound/disk buffer - wire [23:0] snddsk_alt_addr; // Address of alternate sound/disk buffer - // We must be careful that the sound circuitry does not attempt to // access RAM at the same time as the video circuitry. Because the // phases are coherent, we can simply align the sound and disk @@ -1478,26 +1432,62 @@ module avtimers (); // been used. This is going to be a one-shot countdown timer for // generating a single pulse per byte. + // The current 16 MHz cycle # can easily be determined from our + // divided clock frequencies. + assign c16m_cntr = { c1m, c2m, c4m, c8m }; + assign vidout = vidout_sreg[15]; + always @(negedge n_res) begin // Initialize all output registers on RESET. - vidout <= 0; n_hsync <= 1; n_vsync <= 1; + n_hsync <= 1; n_vsync <= 1; snd <= 0; pwm <= 0; // Initialize all internal registers on RESET. vidout_sreg <= 0; - vidout_cntr <= 0; - vid_hsync_cntr <= 0; - vid_vsync_cntr <= 0; + va <= 0; snddsk_reg <= 0; end + + // N.B. Now this is tricky. Our load pixel shifter is carefully + // timed to happen immediately after the last pixel is displayed + // and as soon as the next value is available from DRAM. 
This + // means that we actually offset the horizontal blanking signal by + // a nominal amount in comparison to the video address counter + // increments to compensate. + + // Okay, here's the trick with FPM fetches. We still need to count + // by 16 on the video address so we can time the 16-bit sound load + // at the end of the cycle correctly, but we use a double-width + // video shift register and only trigger video load half as often. + + always @(posedge c16m) begin + if (n_ldps) begin + // Fill the least significant bit with logic one so that the + // CRT beam is off during blanking. + vidout_sreg <= { vidout_sreg[14:0], 1'b1 }; + end + else + vidout_sreg <= 0; // TODO load new value. + + // Increment the video address on every 1 MHz clock cycle. + // However, on horizontal blanking, we slice the carry until the + // end of the interval. + if (c16m_cntr == 4'hf) begin + // N.B.: Remember we are counting by 16-bit words. + if (slice_cntr) + va[4:0] <= va[4:0] + 2; + else + va <= va + 2; + end + end endmodule /* TODO: Summary of what is missing and left to implement: DRAM initialization pulses, DRAM refresh, detect 2.5MB of RAM and configure address buffers accordingly, video, disk, and audio - scanout, EXTDTK yielding. + scanout. Okay, so the VERDICT on DRAM initialization pulses. We don't actually use these as we should, strictly speaking, but why does it @@ -1512,57 +1502,4 @@ endmodule 4MB RAM DRAM refresh. Then we need to do the busywork to implement the PWM and video scanout modules and we're done! */ -/* - -Now I think I see why there is the funny thing going on with the -address multiplexers for RAS/CAS. It is a required modification to -use DRAM fast-page mode since RAS and CAS are still logically -"swapped" compared to a contiguous memory layout. This swapping of -RAS and CAS is used to get DRAM refresh for free when scanning the -video framebuffer. - -Okay, so let's review in more detail. 
- -Address multiplexer row address outputs: - -A2, A3, A4, A5, A6, A7, A8, A10 - -A9 inputs directly to BBU, controls RA7. - -This is a straight match-up to DRAM row address lines. - -RA0 A2 -RA1 A3 -RA2 A4 -RA3 A5 -RA4 A6 -RA5 A7 -RA6 A8 -RA7 A9 -RA8 A10 -RA9 A19 (optional) (!) - -So, how many longwords for the video framebuffer? - -512 x 342 / 32 = 5472 longwords -In hex: 0x1560 -Number of address bits fully covered by a full scan: 12 - -Okay, so the question, does it work for DRAM refresh? Indeed it does! -Well, the number of longwords swept is great enough to cover all DRAM -rows for 4MB of RAM, but the address bit mapping appears only to work -for <=1MB of RAM. - -Please note that since we use only a single row access strobe signal -for both DRAM rows and instead use separate column access strobes to -differentiate between the rows, even if all the video memory addresses -are only in one row, we still refresh the other row as long as we -cover all the row addresses. - -RA9 looks to be trouble. But, the Unitron reverse engineering docs -almost have a solution. Set this to A17 (?) and it should "just work" -I guess. But why? - -*/ - `endif // NOT BBU_V diff --git a/hardware/fpga/bbu/mac128pal.v b/hardware/fpga/bbu/mac128pal.v index 1fb34e5..84f6726 100644 --- a/hardware/fpga/bbu/mac128pal.v +++ b/hardware/fpga/bbu/mac128pal.v @@ -82,21 +82,21 @@ module tsm(simclk, n_res, end // Simulate registered logic. 
- always @(negedge clk) begin + always @(posedge clk) begin if (n_res) begin - ras <= @(posedge clk) + ras <= ~(~pclk & q1 & s1 // video cycle | ~pclk & q1 & ~ramen & dtack // processor cycle | pclk & ~ras); // any other cycle - vclk <= @(posedge clk) + vclk <= ~(~q1 & pclk & q2 & vclk // divide by 8 (1MHz) | ~vclk & q1 | ~vclk & ~pclk | ~vclk & ~q2); - q1 <= @(posedge clk) + q1 <= ~(~pclk & q1 | pclk & ~q1); // divide `pclk` by 2 (4MHz) - q2 <= @(posedge clk) + q2 <= ~(~q1 & pclk & q2 // divide by 4 (2MHz) | ~q2 & q1 | ~q2 & ~pclk); @@ -136,63 +136,134 @@ module lag(simclk, n_res, end // Simulate registered logic. - always @(negedge sysclk) begin + always @(posedge sysclk) begin if (n_res) begin - vshft <= @(posedge sysclk) + vshft <= ~(s1 & ~vclk & snddma); // one pulse on the falling edge of `vclk` - vsync <= @(posedge sysclk) + vsync <= ~(reslin | ~vsync & ~l28); - // hsync <= @(posedge sysclk) + // hsync <= // ~(viapb6 & va4 & ~va3 & ~va2 & va1 // begins in 29 (VA5) // | /*~ ???*/resnyb // | ~hsync & viapb6); // ends in 0F - hsync <= @(posedge sysclk) - ~(~viapb6 & ~va4 & ~va3 & va2 & va1 // begins in 29 (VA5) + // hsync <= + // ~(~viapb6 & ~va4 & ~va3 & va2 & va1 // begins in 29 (VA5) + // | ~hsync & ~va4 + // | ~hsync & ~viapb6); // ends in 0F + // TODO FIXME: This is incorrect, temporary equations in order + // to get at least partial behavior for analysis. + // TODO FIXME: We trigger hsync a bit too soon at the end of the + // scanline. And, we release it too soon at the beginning. + hsync <= + ~(viapb6 & ~p0q2 & s1 & ~va1 & ~va2 & ~va3 & ~va4 | ~hsync & ~va4 - | ~hsync & ~viapb6); // ends in 0F - s1 <= @(posedge sysclk) + | ~hsync & ~va3 + | ~hsync & va2 + | ~hsync & va1); + // TODO FIXME: This is not a synthesizable PAL equation. + // TODO FIXME: This isn't generating sound buffer accesses + // during vertical blanking but it should be. 
+ s1 <= ~(~p0q2 // 0 for processor and 1 for video | ~vclk | ~vsync & hsync - | ~vsync & viapb6 // only in vertical retrace we have sound cycles - | ~viapb6 & hsync & ~va4 & ~va3 & ~va2 + | ~vsync & viapb6 // vertical retrace only has sound cycles + /* TODO INVESTIGATE: Line disabled because it drops out + pulses we'd normally expect. */ + /* | ~viapb6 & hsync & ~va4 & ~va3 & ~va2 */ | ~viapb6 & ~hsync & (~va4 | va4 & ~va3 & ~va2 | va4 & ~va3 & va2 & ~va1)); - // viapb6 <= @(posedge sysclk) + // viapb6 <= // ~(~hsync & resnyb // 1 indicates horizontal retrace (pseudo VA6) // | va1 & ~viapb6 // | va2 & ~viapb6 // | ~hsync & ~viapb6 // | resnyb & ~viapb6 // | vshft & ~viapb6); - viapb6 <= @(posedge sysclk) - ~(hsync & ~va4 & ~va3 & va2 & va1 // 1 indicates horizontal retrace (pseudo VA6) - | ~viapb6 & snddma - | ~viapb6 & vclk); - snddma <= @(posedge sysclk) - ~(viapb6 & va4 & ~va3 & va2 & va1 & p0q2 & vclk & ~hsync // 0 in this output + // viapb6 <= + // ~(hsync & ~va4 & ~va3 & va2 & va1 // 1 indicates horizontal retrace (pseudo VA6) + // | ~viapb6 & snddma + // | ~viapb6 & vclk); + // TODO FIXME: This is incorrect, temporary equations in order + // to get at least partial behavior for analysis. + viapb6 <= + ~(~hsync // 1 indicates horizontal retrace (pseudo VA6) + | ~viapb6 & p0q2 + | ~viapb6 & ~s1 + | ~viapb6 & va1 + | ~viapb6 & va2 + | ~viapb6 & va3 + | ~viapb6 & va4); + // TODO FIXME HACK: Previously viapb6 but negated for testing. + snddma <= + ~(~viapb6 & va4 & ~va3 & va2 & va1 & p0q2 & vclk & ~hsync // 0 in this output | ~snddma & vclk); // ... indicates sound cycle - reslin <= @(posedge sysclk) // try to generate line 370 + // TODO FIXME HACK: Previously ~viapb6 but negated for testing. + reslin <= // try to generate line 370 ~(l28 | ~vsync | hsync - | ~viapb6 - | ~vclk); - resnyb <= @(posedge sysclk) - ~(vclk // increment VA5:VA14 in 0F and 2B - | viapb6 // ??? 
- | va1 - | va2 - | ~viapb6 & va3 - | hsync - | viapb6 & ~va3 - | ~hsync & va3 & ~va4 - | ~hsync & ~va3 & va4); + | viapb6 + | ~vclk); + // N.B. Primary conceptual equation: + // resnyb <= + // ((~viapb6 & hsync & ~va4 & ~va3 & ~va2 & ~va1) + // | (~viapb6 & ~hsync & va4 & va3 & ~va2 & ~va1)); + // TODO FIXME HACK: Possibly incorrect interpretation of viapb6 + // with hsync. + resnyb <= + ~(vclk // increment VA5:VA14 in 0F and 2B + | viapb6 + | va1 + | va2 + | hsync & va4 + | hsync & va3 + | ~hsync & ~va4 + | ~hsync & ~va3 + | ~va4 & va3 + | va4 & ~va3); end end endmodule +/* +This LAG doesn't work correctly, here's how it is supposed to work. + +1. Count video addresses to 32. During this count, generate resnyb + every time we need a carry. + +2. Once we reach 32, that's 512 pixels, one scanline. Now, we assert + the *HSYNC signal. But please note, at this point we do **not** + generate resnyb for the carry at 32, instead we let that counter + wrap around to zero without a carry. + +3. When the *HSYNC signal is asserted, we only count to 12. That's + 192 pixels for horizontal blanking. Just when we're about to reach + the end, we assert the *SNDDMA signal. That's when we fetch the + sound sample, at the very end of horizontal blanking, not the very + beginning. + + Finally, we assert resnyb to clear the counter and finally + propagate the carry to the next video address. + +At the very end of vertical blanking, we assert *RESLIN to clear the +video address counter back to zero. Until then, we keep counting +positive to keep track of the vertical blanking time. + +But, to implement this... it's tricky because va5 and va6 are not +connected to any PAL. How do we generate the signals then? We can +otherwise only count to 16, we need to get to 32. + +Okay, I think I've got it figured out. VIAPB6 is a little white lie, +it's not the actual horizontal blanking signal, it's a prep signal +before the horizontal blanking actually occurs. 
But, nevertheless, it +is almost the same thing, 16 cycles at 16MHz rather than 12 cycles. +For the 8 MHz CPU, it pretty much looks like the same thing. And +that's where we hide the additional bit of memory we need. + +*/ + // 32 active cycles for line - UA6..UA1 = 0 to 1F // 1 cycle for sound/PWM = 2B // 11 cycles for retrace = 20 to 2A @@ -231,11 +302,12 @@ module bmu1(simclk, n_res, | a23 & ~a21 & ~as); // for generating DTACK (not accessing ROM: A20) ramen <= ~(~a23 & ~a22 & ~a21 & ~as & ~ovlay // 000000 | ~a23 & a22 & a21 & ~as & ovlay); // (600000 with `ovlay`) - io1 <= ~(0); // ??? + io1 <= ~(0); // TODO this indicates we're >= line 28 l28 <= ~(~l15 & ~va9 & ~va8 & va7 // reached 370 or we don't pass line 28 + | ~l28 & ~l15 | ~l28 & ~va9 | ~l28 & ~va8 - | ~l28 & ~va7); + | ~l28 & va7); end end endmodule @@ -275,15 +347,15 @@ module bmu0(simclk, n_res, end // Simulate registered logic. - always @(negedge sysclk) begin + always @(posedge sysclk) begin if (n_res) begin - ava14 <= @(posedge sysclk) ~(~va14 & ~va13); // + 1 - l15 <= @(posedge sysclk) + ava14 <= ~(~va14 & ~va13); // + 1 + l15 <= ~(~va14 & ~va13 & ~va12 & ~va11 & ~va10 // we haven't passed line 15 | va14 & ~va13 & va12 & va11 & va10); // passed by 368 - vid <= @(posedge sysclk) + vid <= ~(servid); // here we invert: blanking is in `vshft` - ava13 <= @(posedge sysclk) ~(va13); // + 1 + ava13 <= ~(va13); // + 1 end end endmodule @@ -321,20 +393,20 @@ module tsg(simclk, n_res, end // Simulate registered logic. - always @(negedge sysclk) begin + always @(posedge sysclk) begin if (n_res) begin // TODO VERIFY: q6 missing? - q6 <= @(posedge sysclk) ~(0); - clkscc <= @(posedge sysclk) + q6 <= ~(0); + clkscc <= ~(clkscc & ~pclk & ~q4 | clkscc & ~pclk & ~q3 | clkscc & ~pclk & vclk | ~clkscc & pclk | ~clkscc & q4 & q3 & ~vclk); // skip one inversion every 32 cycles - viacb1 <= @(posedge sysclk) ~(0); // ??? 
/M nanda - pclk <= @(posedge sysclk) ~(pclk); // divide SYSCLK by 2 (8MHz) - q3 <= @(posedge sysclk) ~(~vclk); // `sysclk` / 16 - q4 <= @(posedge sysclk) + viacb1 <= ~(0); // ??? /M nanda + pclk <= ~(pclk); // divide SYSCLK by 2 (8MHz) + q3 <= ~(~vclk); // `sysclk` / 16 + q4 <= ~(q4 & q3 & ~vclk // `sysclk` / 32 | ~q4 & ~q3 // } J for generating CLKSCC | ~q4 & vclk); @@ -421,7 +493,7 @@ module palcl(simclk, vcc, gnd, n_res, n_sysclk, // Incremented high-order video address lines wire ava14, ava13; // Internal video signals - wire n_vshft, n_snddma, n_servid; + wire vshft, n_snddma, n_servid; // Address multiplexer signals? wire s0, s1, l28, l15, n_245oe; @@ -446,13 +518,15 @@ module palcl(simclk, vcc, gnd, n_res, n_sysclk, wire c8mf, c16mf, c2m, u12f_tc, ram_r_n_w_f; wire vmsh; // video mid-shift, connect two register chips together - wire s5; // This is just a pull-up resistor + // This is just a pull-up resistor, possibly connected to a RESET + // circuit. + wire s5; // L12 => va13 // L13 => va14 // va12 => ava13 // va13 => ava14 - // n_ldps => n_vshft + // n_ldps => vshft // VID/*u => s1 // tc => vclk @@ -475,9 +549,13 @@ module palcl(simclk, vcc, gnd, n_res, n_sysclk, // TSG Output Enable is controlled by CAS on Macintosh Plus, // otherwise just go straight to ground on Macintosh 128k. assign tsg_oe3 = gnd; - // S5: Pull-up resistor. + // S5: Pull-up resistor. TODO FIXME: Should this be controlled by + // another thing too? assign s5 = vcc; + // TODO FIXME: A1 - A13 are connected to a pull-up resistors bank + // RP1. + /* N.B. The reason why phase calibration is required in the Macintosh 128k/512k/Plus is because the PALs do not have a RESET pin. 
It is the logic designer's discretion to implement one @@ -495,9 +573,9 @@ module palcl(simclk, vcc, gnd, n_res, n_sysclk, n_dmald, n_sndres, , , , , u12f_tc, vcc); // Dual video shift registers ls166 u10f(vmsh, rdq8, rdq9, rdq10, rdq11, 1'b0, c16mf, gnd, - s5, rdq12, rdq13, rdq14, n_servid, rdq15, n_vshft, vcc); + s5, rdq12, rdq13, rdq14, n_servid, rdq15, vshft, vcc); ls166 u11f(s5, rdq0, rdq1, rdq2, rdq3, 1'b0, c16mf, gnd, - s5, rdq4, rdq5, rdq6, vmsh, rdq7, n_vshft, vcc); + s5, rdq4, rdq5, rdq6, vmsh, rdq7, vshft, vcc); // Dual RAM data bus transceivers ls245 u9e(ram_r_n_w_f, rdq0, rdq1, rdq2, rdq3, rdq4, rdq5, rdq6, rdq7, gnd, d7, d6, d5, d4, d3, d2, d1, d0, n_245oe, vcc); @@ -529,7 +607,7 @@ module palcl(simclk, vcc, gnd, n_res, n_sysclk, tsm pal0(simclk, n_res, sysclk, sysclk, pclk, s1, n_ramen, n_romen, n_as, n_uds, n_lds, gnd, tsm_oe1, casl, cash, ras, vclk, p0q2, p0q1, s0, n_dtack, vcc); lag pal1(simclk, n_res, sysclk, p2io1, l28, va4, p0q2, vclk, va3, va2, va1, - gnd, lag_oe2, n_vshft, n_vsync, n_hsync, s1, viapb6, + gnd, lag_oe2, vshft, n_vsync, n_hsync, s1, viapb6, n_snddma, reslin, resnyb, vcc); bmu1 pal2(simclk, n_res, va9, va8, va7, l15, va14, ovlay, a23, a22, a21, gnd, diff --git a/hardware/fpga/bbu/test_mac128pal.v b/hardware/fpga/bbu/test_mac128pal.v index b4ce87f..a3d48f9 100644 --- a/hardware/fpga/bbu/test_mac128pal.v +++ b/hardware/fpga/bbu/test_mac128pal.v @@ -103,7 +103,10 @@ module test_mac128pal(); // Set simulation time limit. initial begin - #480000 $finish; + // #1920000 $finish; + // PLEASE NOTE: We must simulate LOTS of cycles in order to see + // what the oscilloscope trace for one video frame looks like. + #30720000 $finish; end // We can use `$display()` for printf-style messages and implement @@ -115,7 +118,9 @@ module test_mac128pal(); // Log to a VCD (Variable Change Dump) file. initial begin - $dumpfile("test_mac128pal.vcd"); + // $dumpfile("test_mac128pal.vcd"); + // Use LXT instead since it is more efficient. 
+ $dumpfile("test_mac128pal.lxt"); $dumpvars; end endmodule