From 4fa9303844996fc812c1975a3b88137a03d0999c Mon Sep 17 00:00:00 2001 From: Andrew Makousky Date: Fri, 20 Nov 2020 20:19:52 -0600 Subject: [PATCH] BBU: Commit work-in-progress, lots of corrections to be made. --- hardware/fpga/bbu/bbu.v | 157 +++++++++++++++++++++++++++++++--------- 1 file changed, 124 insertions(+), 33 deletions(-) diff --git a/hardware/fpga/bbu/bbu.v b/hardware/fpga/bbu/bbu.v index 5bf361a..63c8fe1 100644 --- a/hardware/fpga/bbu/bbu.v +++ b/hardware/fpga/bbu/bbu.v @@ -50,6 +50,10 @@ `define RAMSZ_EN_4M 7'b1000000 `define RAMSZ_ENI_4M 6 +// Please note: If we don't list the configuration in one of the +// following the tables, it's not supported by the BBU. The BBU is a +// gate array, not a microcontroller! + // The main and alternate screen buffer memory addresses are // calculated by subtracting a constant from the installed RAM size. // Deltas: main -0x5900, alt. -0xd900. @@ -62,10 +66,6 @@ // 2.5MB: main 0x27a700, alt 0x272700. // 4MB: main 0x3fa700, alt 0x3f2700. -// Please note: If we don't list the configuration in the table, it's -// not supported by the BBU. The BBU is a gate array, not a -// microcontroller! - // Table of video memory base addresses. `define vid_main_addr_128k 24'h1a700 `define vid_alt_addr_128k 24'h12700 @@ -94,10 +94,6 @@ // 2.5MB: main 0x27fd00, alt. 0x27a100. // 4MB: main 0x3ffd00, alt. 0x3fa100. -// Please note: If we don't list the configuration in the table, -// it's not supported by the BBU. The BBU is a gate array, not a -// microcontroller! - // Table of sound and disk speed memory base addresses. `define snddsk_main_addr_128k 24'h1fd00 `define snddsk_alt_addr_128k 24'h1a100 @@ -116,11 +112,6 @@ /* Top-level module for the BBU. - TODO Abbreviations legend, here since they are not noted elsewhere: - - RDQ = RAM Data Value - PMCYC = Processor Memory Cycle - Undocumented signal but assumed to exist: 64KRAM: Here we assign it to pin 21 for our own experimentation, a @@ -164,17 +155,7 @@ module bbu_master_ctrl // The Macintosh SE deprecated SNDPG2. Nevertheless... for the // sake of possibly making quasi-hardware replicas of earlier // Macintosh computers easier, we will preserve an implementation - // here anyways. As for the OVERLAY signal, we'll have to use Mini - // vMac's guess on how to implement it. At boot, the ROM - // exclusively accesses the ROM overlay addresses and the remapped - // RAM address when setting up. When it is done, it jumps into the - // standard ROM address access. As soon as the BBU detects this - // memory access (any address in the standard ROM address space), - // it switches the overlay, and it cannot be switched back except - // by a RESET signal. - - // So yes... it turns out the BBU must actually do address bus - // snooping. + // here anyways. // Essential sequential logic RESET and clock signals input wire n_res; // *RESET signal @@ -235,15 +216,22 @@ module bbu_master_ctrl // Note tristate inout ... 'bz for high impedance. 8'bz for wide. - // In order to implement the memory overlay switch, we must snoop - // the address bus. These are the registers we use to store the - // address multiplexor outputs. + // Full DRAM address bus snooping? I almost thought this was + // required to implement some functions, but it turns out it isn't, + // partial address bus snooping is good enough. Nevertheless, I'll + // preserve the implementation as it could be useful for BBU mods. reg [7:0] row_snoop; reg [7:0] col_snoop; // Installed RAM size. wire [23:0] ramsz; - // SCSI support: Handle chip select, and handle DMA. + // SCSI support: Handle chip select, and handle DMA. Important! + // Never have *DACK and *SCSI active simultaneously. Okay, so + // let's get this straight. When we would normally assert *DTACK, + // we release *SCSI and wait for SCSI.DRQ. Then when we receive + // it, we can assert *DTACK and also *DACK, with a timer to + // deassert *DACK. Important! Make sure we do not go faster than + // the minimum read/write pulse width of the SCSI chip. // Important! How to handle SCSI DMA... according to Guide to the // Macintosh family hardware, page 126, this is "pseudo-DMA" mode. @@ -255,9 +243,13 @@ module bbu_master_ctrl // `*DTACK` is tri-stated according to the manual when `*EXTDTK` // (`*EXT.DTACK`) is asserted. + // TODO: How do we know if DMA mode is true? Do we have to snoop + // the address but to get this information. + // Now, here's the golden rule: "If any access has not terminated // within 265 ms, the BBU asserts the bus error signal /BERR." - // There you go, that's how it is driven. So, another thing, + // There you go, that's how it is driven, though the condition only + // happens for PDS and SCSI accesses. So, another thing, // yes... there must be at least a nominal delay to assert `*DTACK` // to allow PDS cards to intervene by asserting `*EXTDTK` first. @@ -407,6 +399,17 @@ module clock_div (n_res, c16m, c8m, c3_7m, c2m_e, n_pmcyc, pmcyc_pt); // PLEASE NOTE. Macintosh SE/30 takes one access cycle every // 15.6us for DRAM refresh. + // So, what's the secret sauce of the Macintosh SE being more + // performant in memory access? Guide to the Macintosh family + // hardware, page 401. During the BBU memory access cycle time, + // unlike earlier models that would only read one word, the BBU + // reads two 16-bit words. Yes, so it does do buffering! This + // allows the CPU to have free access to the next two cycles. So, + // the word is hard and strong now, `*PMCYC` is not a simple 1MHz + // clock, but has a much more complex timing circuit. That equates + // to a 200% memory access speedup during screen scanning in the + // Macintosh SE compared to the Macintosh Plus. + /* Inside Macintosh claims that the serial clock is 3.672 MHz. Clock multiplication (via PLL) and division can be used to generate this from the 15.6672 MHz clock as follows: @@ -432,9 +435,9 @@ module clock_div (n_res, c16m, c8m, c3_7m, c2m_e, n_pmcyc, pmcyc_pt); And yes, even with that introduced phase/period error, the downstream hardware apparently still works just fine, thanks to the divide-by-16 in front of the SCC's internal baud rate - generator. This gives you a max baud of 230 kbits/sec, with a - phase/period error of 1/(16*16) = 0.39%. - */ + generator. This gives you a max baud of 230.4 kbits/sec, with a + phase/period error of 1/(16*16) = 0.39%. This is the same baud + as AppleTalk. */ // TODO EVALUATE: Optimize this to minimize the number of register // bits required, while still preserving ideal frequency division @@ -699,7 +702,24 @@ endmodule charge, you can do better! And indeed, that note only appears to apply to the Macintosh Plus, not the Macintosh SE, as it is listed in that section. - + + But, for the sake of Macintosh Plus recreation, please note. The + TSG PAL places one of the high-frequency phase indicator signals on + D0 of the address bus, I assume it is the 1 MHz *PMCYC signal. A + multiple address read instruction is used to read three consecutive + data values from the address bus in synchronous I/O mode (due to + using address 0xf00000), so each address read is either 10 or 20 + CPU cycles long. This will sample the phase at a few different + points. The phase readings are added together, if they are zero or + one, then we are "in-phase." Otherwise, phase readings 2 and 3 are + considered "out-of-phase" so we access a word-width address in the + SCC range to shift the high frequency timing by 128 ns (one CPU + clock cycle at 8 MHz). + + The important thing to remember is that every MC68000 instruction + executes for an even number of clock cycles (divisible by 2), and + there is no pipelining in these early CPUs. + TODO FIXME: Guide to the Macintosh family hardware, page 127. Okay, so this is how to interpret the information about the boot-time overlay for the alternate RAM location. Only a 2MB zone @@ -993,6 +1013,77 @@ module a0_to_ds (a0, n_uds, n_lds); assign n_lds = ~a0; endmodule +// TODO: So this is how the fastest CPU access state machine workss. +// *AS is started to be asserted on the rising edge of the CPU clock +// at S2, but it is not guaranteed clearly asserted until the falling +// edge of S2. Between this time, we must assert *DTACK before the +// falling edge of S4. So yes, we effectively have a maximum of two +// 16 MHz cycles to react, but better if we can get it done in 1.5 +// cycles at 16 MHz. We must assert *DTACK before 10 ns of the end, +// so better to go 30 ns before the end, i.e. half of a 16 MHz clock +// cycle. + +// So, point in hand, we need to design our logic to work very fast +// within these constraints. First, we have a cycle counter runs on +// the _falling edge_ of the clock. We only count one. Then, we have +// a "subcycle counter" which is really just combinatorial logic. We +// must only use combinatorial logic in order to be able to assert +// signals sub-cycle in a way that hopefully still works on an FPGA. +// If this really only works on an ASIC, though, our next best option +// is to use a 32 MHz PLL to satisfy the timing requirements. Also, +// it's important to understand, when using combinatorial logic to +// effectively double the clock frequency, understanding signal +// propagation delay is critical. + +// Here, we assume the rising edge of C16M is synced to the rising +// edge of C8M, otherwise this won't work! + +// There's good reason to be concerned about glitching when not using +// register-buffered outputs on a sequential clock. But, yeah, +// hardware-wise, this is the simplest way to do clock frequency +// doubling, and if it works like the 3.686 MHz clock with +// phase/period error, history be told. F.Y.I. Anecdotal evidence +// hints that the Macintosh Plus may have used this combinatorial +// logic hackery on clock signals in its PALs. But it was still +// driven by clock C16M and was a registered PAL? Maybe I better just +// look down its datasheet real good, check that it uses typical +// latching. PAL16R4. + +// Okay, here's the deal. We can get this to work reasonably without +// glitching through the means of programmable slew rate limiting +// filter circuits built into the FPGA at the pin terminals. But that +// is the necessity if we go that path. + +module dram_fast_test (c16m, c8m, n_res, n_as); + input wire c16m; + input wire c8m; + input wire n_res; + input wire n_as; + + reg state_buf; + wire state0, state1, state2, state3; + // wire state4; + + assign state0 = n_as; + assign state1 = ~n_as & ~state_buf & c16m & ~c8m; + assign state2 = ~n_as & ~state_buf & ~c16m & ~c8m; + assign state3 = ~n_as & state_buf & c16m & c8m; + // assign state4 = ~n_as & state_buf & ~c16m & c8m; + + always @(negedge n_res) begin + state_buf <= 0; + end + + always @(posedge c16m) begin + if (n_res) begin + if (~n_as) + state_buf <= 1; + else + state_buf <= 0; + end + end +endmodule + // Stateful advancement DRAM controller logic for CPU memory accesses. module dramctl_cpu (n_res, clk, r_n_w, c2m, ram_r_n_w, n_as, n_ras, n_cas,