verilog: femtorv32 RISC-V module, fixes to assembler

2026-04-19 08:27:40 +00:00 · 2025-11-05 17:41:45 -06:00
parent 69e2be6f70
commit a436ac89c2
7 changed files with 1348 additions and 28 deletions
@@ -0,0 +1,530 @@
+/*******************************************************************/
+// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
+// This version: The "Quark", the most elementary version of FemtoRV32.
+//             A single VERILOG file, compact & understandable code.
+//             (200 lines of code, 400 lines counting comments)
+//
+// Instruction set: RV32I + RDCYCLES
+//
+// Parameters:
+//  Reset address can be defined using RESET_ADDR (default is 0).
+//
+//  The ADDR_WIDTH parameter lets you define the width of the internal
+//  address bus (and address computation logic).
+//
+// Macros:
+//    optionally one may define NRV_IS_IO_ADDR(addr), that is supposed to:
+//              evaluate to 1 if addr is in mapped IO space,
+//              evaluate to 0 otherwise
+//    (additional wait states are used when in IO space).
+//    If left undefined, wait states are always used.
+//
+//    NRV_COUNTER_WIDTH may be defined to reduce the number of bits used
+//    by the ticks counter. If not defined, a 32-bits counter is generated.
+//    (reducing its width may be useful for space-constrained designs).
+//
+// Bruno Levy, Matthias Koch, 2020-2021
+/*******************************************************************/
+
+// Firmware generation flags for this processor
+`define NRV_ARCH     "rv32i"
+`define NRV_ABI      "ilp32"
+`define NRV_OPTIMIZE "-Os"
+
+module FemtoRV32(
+   input         clk,
+
+   output [31:0] mem_addr,  // address bus
+   output [31:0] mem_wdata, // data to be written
+   output  [3:0] mem_wmask, // write mask for the 4 bytes of each word
+   input  [31:0] mem_rdata, // input lines for both data and instr
+   output        mem_rstrb, // active to initiate memory read (used by IO)
+   input         mem_rbusy, // asserted if memory is busy reading value
+   input         mem_wbusy, // asserted if memory is busy writing value
+
+   input         reset      // set to 1 to reset the processor (active high)
+);
+
+   parameter RESET_ADDR       = 32'h00000000;
+   parameter ADDR_WIDTH       = 24;
+
+ /***************************************************************************/
+ // Instruction decoding.
+ /***************************************************************************/
+
+ // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
+ // Reference: Table page 104 of:
+ // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
+
+ // The destination register
+ wire [4:0] rdId = instr[11:7];
+
+ // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
+ // It is used as follows: funct3Is[val] <=> funct3 == val
+ (* onehot *)
+ wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
+
+ // The five immediate formats, see RiscV reference (link above), Fig. 2.4 p. 12
+ wire [31:0] Uimm = {    instr[31],   instr[30:12], {12{1'b0}}};
+ wire [31:0] Iimm = {{21{instr[31]}}, instr[30:20]};
+ /* verilator lint_off UNUSED */ // MSBs of SBJimms are not used by addr adder.
+ wire [31:0] Simm = {{21{instr[31]}}, instr[30:25],instr[11:7]};
+ wire [31:0] Bimm = {{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
+ wire [31:0] Jimm = {{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
+ /* verilator lint_on UNUSED */
+
+   // Base RISC-V (RV32I) has only 10 different instructions !
+   wire isLoad    =  (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
+   wire isALUimm  =  (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
+   wire isAUIPC   =  (instr[6:2] == 5'b00101); // rd <- PC + Uimm
+   wire isStore   =  (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
+   wire isALUreg  =  (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
+   wire isLUI     =  (instr[6:2] == 5'b01101); // rd <- Uimm
+   wire isBranch  =  (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
+   wire isJALR    =  (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
+   wire isJAL     =  (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
+   wire isSYSTEM  =  (instr[6:2] == 5'b11100); // rd <- cycles
+
+   wire isALU = isALUimm | isALUreg;
+
+   /***************************************************************************/
+   // The register file.
+   /***************************************************************************/
+
+   reg [31:0] rs1;                 // source operand 1, latched by the state machine
+   reg [31:0] rs2;                 // source operand 2, latched by the state machine
+   reg [31:0] registerFile [31:0]; // 32 registers of 32 bits
+
+   always @(posedge clk) begin
+     if (writeBack)     // write-back enable, derived from the state machine
+       if (rdId != 0)   // register 0 is never written
+         registerFile[rdId] <= writeBackData;
+   end
+
+   /***************************************************************************/
+   // The ALU. Does operations and tests combinatorially, except shifts.
+   /***************************************************************************/
+
+   // First ALU source, always rs1
+   wire [31:0] aluIn1 = rs1;
+
+   // Second ALU source, depends on opcode:
+   //    ALUreg, Branch:     rs2
+   //    ALUimm, Load, JALR: Iimm
+   wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
+
+   // The adder is used by both arithmetic instructions and JALR.
+   wire [31:0] aluPlus = aluIn1 + aluIn2;
+
+   // Use a single 33 bits subtract to do subtraction and all comparisons
+   // (trick borrowed from swapforth/J1)
+   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
+   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
+   wire        LTU = aluMinus[32];
+   wire        EQ  = (aluMinus[31:0] == 0);
+
+   /***************************************************************************/
+
+   // Use the same shifter both for left and right shifts by
+   // applying bit reversal
+
+   wire [31:0] shifter_in = funct3Is[1] ?
+     {aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5],
+      aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11],
+      aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17],
+      aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
+      aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29],
+      aluIn1[30], aluIn1[31]} : aluIn1;
+
+   /* verilator lint_off WIDTH */
+   wire [31:0] shifter =
+               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
+   /* verilator lint_on WIDTH */
+
+   wire [31:0] leftshift = {
+     shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4],
+     shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9],
+     shifter[10], shifter[11], shifter[12], shifter[13], shifter[14],
+     shifter[15], shifter[16], shifter[17], shifter[18], shifter[19],
+     shifter[20], shifter[21], shifter[22], shifter[23], shifter[24],
+     shifter[25], shifter[26], shifter[27], shifter[28], shifter[29],
+     shifter[30], shifter[31]};
+
+   /***************************************************************************/
+
+   // Notes:
+   // - instr[30] is 1 for SUB and 0 for ADD
+   // - for SUB, need to test also instr[5] to discriminate ADDI:
+   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
+   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
+
+   wire [31:0] aluOut =
+     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
+     (funct3Is[1]  ? leftshift                                       : 32'b0) |
+     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
+     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
+     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
+     (funct3Is[5]  ? shifter                                         : 32'b0) |
+     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
+     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;
+
+   /***************************************************************************/
+   // The predicate for conditional branches.
+   /***************************************************************************/
+
+   wire predicate =
+        funct3Is[0] &  EQ  | // BEQ
+        funct3Is[1] & !EQ  | // BNE
+        funct3Is[4] &  LT  | // BLT
+        funct3Is[5] & !LT  | // BGE
+        funct3Is[6] &  LTU | // BLTU
+        funct3Is[7] & !LTU ; // BGEU
+
+   /***************************************************************************/
+   // Program counter and branch target computation.
+   /***************************************************************************/
+
+   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
+   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
+                             // ignored (not used in RV32I base instr set).
+
+   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
+
+   // An adder used to compute branch address, JAL address and AUIPC.
+   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
+   // Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
+   wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
+                                            instr[4] ? Uimm[ADDR_WIDTH-1:0] :
+                                                       Bimm[ADDR_WIDTH-1:0] );
+
+   // A separate adder to compute the destination of load/store.
+   // testing instr[5] is equivalent to testing isStore in this context.
+   wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
+                   (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
+
+   /* verilator lint_off WIDTH */
+   // internal address registers and cycles counter may have less than
+   // 32 bits, so we deactivate width test for mem_addr and writeBackData
+
+   wire [ADDR_WIDTH-1:0] PC_new =
+      isJALR          ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
+      jumpToPCplusImm ? PCplusImm                      :
+                        PCplus4;
+
+   assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ? PC     :
+                     state[EXECUTE_bit] & ~isLoad & ~isStore        ? PC_new :
+                                                              loadstore_addr ;
+
+   /***************************************************************************/
+   // The value written back to the register file.
+   /***************************************************************************/
+
+   wire [31:0] writeBackData  =
+      (isSYSTEM            ? cycles     : 32'b0) |  // SYSTEM
+      (isLUI               ? Uimm       : 32'b0) |  // LUI
+      (isALU               ? aluOut     : 32'b0) |  // ALUreg, ALUimm
+      (isAUIPC             ? PCplusImm  : 32'b0) |  // AUIPC
+      (isJALR   | isJAL    ? PCplus4    : 32'b0) |  // JAL, JALR
+      (isLoad              ? LOAD_data  : 32'b0) ;  // Load
+
+   /* verilator lint_on WIDTH */
+
+
+   /***************************************************************************/
+   // LOAD/STORE
+   /***************************************************************************/
+
+   // All memory accesses are aligned on 32 bits boundary. For this
+   // reason, we need some circuitry that does unaligned halfword
+   // and byte load/store, based on:
+   // - funct3[1:0]:  00->byte 01->halfword 10->word
+   // - mem_addr[1:0]: indicates which byte/halfword is accessed
+
+   wire mem_byteAccess     = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
+   wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
+
+   // LOAD, in addition to funct3[1:0], LOAD depends on:
+   // - funct3[2] (instr[14]): 0->do sign expansion   1->no sign expansion
+
+   wire LOAD_sign =
+	!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
+
+   wire [31:0] LOAD_data =
+         mem_byteAccess ? {{24{LOAD_sign}},     LOAD_byte} :
+     mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
+                          mem_rdata ;
+
+   wire [15:0] LOAD_halfword =
+	       loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
+
+   wire  [7:0] LOAD_byte =
+	       loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
+
+   // STORE
+
+   assign mem_wdata[ 7: 0] = rs2[7:0];
+   assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
+   assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
+   assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0]  :
+			     loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
+
+   // The memory write mask:
+   //    1111                     if writing a word
+   //    0011 or 1100             if writing a halfword
+   //                                (depending on loadstore_addr[1])
+   //    0001, 0010, 0100 or 1000 if writing a byte
+   //                                (depending on loadstore_addr[1:0])
+
+   wire [3:0] STORE_wmask =
+	      mem_byteAccess      ?
+	            (loadstore_addr[1] ?
+		          (loadstore_addr[0] ? 4'b1000 : 4'b0100) :
+		          (loadstore_addr[0] ? 4'b0010 : 4'b0001)
+                    ) :
+	      mem_halfwordAccess ?
+	            (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
+              4'b1111;
+
+   /*************************************************************************/
+   // And, last but not least, the state machine.
+   /*************************************************************************/
+
+   localparam FETCH_INSTR_bit     = 0;
+   localparam WAIT_INSTR_bit      = 1;
+   localparam EXECUTE_bit         = 2;
+   localparam WAIT_ALU_OR_MEM_bit = 3;
+   localparam NB_STATES           = 4;
+
+   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit;
+   localparam WAIT_INSTR      = 1 << WAIT_INSTR_bit;
+   localparam EXECUTE         = 1 << EXECUTE_bit;
+   localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
+
+   (* onehot *)
+   reg [NB_STATES-1:0] state;
+
+   // The signals (internal and external) that are determined
+   // combinatorially from state and other signals.
+
+   // register write-back enable.
+   wire writeBack = ~(isBranch | isStore ) &
+                    (state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);
+
+   // The memory-read signal.
+   assign mem_rstrb = state[EXECUTE_bit] & ~isStore | state[FETCH_INSTR_bit];
+
+   // The mask for memory-write.
+   assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
+
+   wire jumpToPCplusImm = isJAL | (isBranch & predicate);
+`ifdef NRV_IS_IO_ADDR
+   wire needToWait = isLoad |
+                     isStore  & `NRV_IS_IO_ADDR(mem_addr) ;
+`else
+   wire needToWait = isLoad | isStore ;
+`endif
+
+   always @(posedge clk) begin
+      // Synchronous, active-high reset
+      if(reset) begin
+         state      <= WAIT_ALU_OR_MEM; // leaves once !mem_rbusy & !mem_wbusy
+         PC         <= RESET_ADDR[ADDR_WIDTH-1:0];
+      end else
+
+      // See note [1] at the end of this file.
+      (* parallel_case *)
+      case(1'b1)
+
+        state[WAIT_INSTR_bit]: begin
+           if(!mem_rbusy) begin // may be high when executing from SPI flash
+              rs1 <= registerFile[mem_rdata[19:15]]; // indexed straight from the fetched word
+              rs2 <= registerFile[mem_rdata[24:20]]; // indexed straight from the fetched word
+              instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
+              state <= EXECUTE;         // also the declaration of instr).
+           end
+        end
+
+        state[EXECUTE_bit]: begin
+           PC <= PC_new; // PCplus4, PCplusImm or the JALR target (see PC_new)
+           state <= needToWait ? WAIT_ALU_OR_MEM : WAIT_INSTR; // others skip FETCH_INSTR
+        end
+
+        state[WAIT_ALU_OR_MEM_bit]: begin
+           if(!mem_rbusy & !mem_wbusy) state <= FETCH_INSTR; // wait for memory to be idle
+        end
+
+        default: begin // FETCH_INSTR
+          state <= WAIT_INSTR;
+        end
+
+      endcase
+   end
+
+   /***************************************************************************/
+   // Cycle counter
+   /***************************************************************************/
+
+`ifdef NRV_COUNTER_WIDTH
+   reg [`NRV_COUNTER_WIDTH-1:0]  cycles;
+`else
+   reg [31:0]  cycles;
+`endif
+   always @(posedge clk) cycles <= cycles + 1;
+
+`ifdef BENCH
+   initial begin
+      cycles = 0;
+      registerFile[0] = 0;
+   end
+`endif
+
+endmodule
+
+/*****************************************************************************/
+// Notes:
+//
+// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
+// It is just a cleaner way of writing a series of cascaded if() statements,
+// To understand it, think about the case statement *in general* as follows:
+// case (expr)
+//       val_1: statement_1
+//       val_2: statement_2
+//   ... val_n: statement_n
+// endcase
+// The first statement_i such that expr == val_i is executed.
+// Now if expr is 1'b1:
+// case (1'b1)
+//       cond_1: statement_1
+//       cond_2: statement_2
+//   ... cond_n: statement_n
+// endcase
+// It is *exactly the same thing*, the first statement_i such that
+// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
+// in other words, such that cond_i is true)
+// More on this:
+//     https://stackoverflow.com/questions/15418636/case-statement-in-verilog
+//
+// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
+// It uses a larger number of bits (one bit per state), but often results in
+// a both more compact (fewer LUTs) and faster state machine.
+
+// https://github.com/BrunoLevy/learn-fpga/blob/master/FemtoRV/RTL/PROCESSOR/femtorv32_quark_bicycle.v
+
+
+`ifdef TOPMOD__test_FemtoRV32_top
+
+// Test top-level: one FemtoRV32 core with always-ready RAM/ROM, exposing bus signals for inspection.
+module test_FemtoRV32_top(
+  input  clk,
+  input  reset,                // passed straight to the CPU reset input
+  output [31:0] address_bus,   // copy of the CPU mem_addr
+  output [31:0] to_cpu,        // copy of mem_rdata (data read by the CPU)
+  output [31:0] from_cpu,      // copy of mem_wdata (data written by the CPU)
+  output [3:0] write_mask,     // copy of mem_wmask
+  output write_strobe,         // high when any mem_wmask bit is set
+  output [23:0] PC,            // CPU program counter (hierarchical reference)
+  output [3:0] state_bits      // CPU one-hot state register (hierarchical reference)
+);
+
+  // Memory arrays
+  reg [31:0] ram[0:4095];   // 16KB RAM (ram_sel only decodes 0x0000-0x0FFF)
+  reg [31:0] rom[0:255];    // 1KB ROM, selected in 0x1000-0x1FFF (mirrored there)
+
+  // Memory read data
+  reg [31:0] mem_rdata;
+
+  // Memory busy signals (always ready for this simple testbench)
+  wire mem_rbusy = 0;
+  wire mem_wbusy = 0;
+
+  // CPU interface wires
+  wire [31:0] mem_addr;
+  wire [31:0] mem_wdata;
+  wire [3:0]  mem_wmask;
+  wire        mem_rstrb;   // driven by the CPU, not used by this memory model
+
+  // Expose signals for debugging
+  assign address_bus = mem_addr;
+  assign from_cpu = mem_wdata;
+  assign write_mask = mem_wmask;
+  assign write_strobe = |mem_wmask;  // Any write mask bit set means write
+  assign to_cpu = mem_rdata;
+  assign PC = cpu.PC;
+  assign state_bits = cpu.state[3:0];
+
+  // Instantiate the FemtoRV32 CPU
+  FemtoRV32 #(
+    .RESET_ADDR(32'h00001000),  // Start from ROM area
+    .ADDR_WIDTH(24)
+  ) cpu (
+    .clk(clk),
+    .mem_addr(mem_addr),
+    .mem_wdata(mem_wdata),
+    .mem_wmask(mem_wmask),
+    .mem_rdata(mem_rdata),
+    .mem_rstrb(mem_rstrb),
+    .mem_rbusy(mem_rbusy),
+    .mem_wbusy(mem_wbusy),
+    .reset(reset)
+  );
+
+  // Memory address decoding
+  wire ram_sel = (mem_addr[31:12] == 20'h00000);  // 0x0000-0x0FFF: RAM
+  wire rom_sel = (mem_addr[31:12] == 20'h00001);  // 0x1000-0x1FFF: ROM
+
+  // Memory write logic: one byte lane per mem_wmask bit; only RAM is writable
+  always @(posedge clk) begin
+    if (ram_sel) begin
+      if (mem_wmask[0]) ram[mem_addr[13:2]][7:0]   <= mem_wdata[7:0];
+      if (mem_wmask[1]) ram[mem_addr[13:2]][15:8]  <= mem_wdata[15:8];
+      if (mem_wmask[2]) ram[mem_addr[13:2]][23:16] <= mem_wdata[23:16];
+      if (mem_wmask[3]) ram[mem_addr[13:2]][31:24] <= mem_wdata[31:24];
+    end
+  end
+
+  // Memory read logic (combinatorial for simplicity); unmapped addresses read as 0
+  always @(*) begin
+    if (rom_sel)
+      mem_rdata = rom[mem_addr[9:2]];  // Word-aligned ROM access
+    else if (ram_sel)
+      mem_rdata = ram[mem_addr[13:2]]; // Word-aligned RAM access
+    else
+      mem_rdata = 32'h00000000;
+  end
+
+`ifdef EXT_INLINE_ASM
+  initial begin
+    rom = '{
+      __asm
+.arch riscv
+.org 4096
+.len 256
+
+; RISC-V Fibonacci test program
+; x1 = current fib number
+; x2 = previous fib number
+; x3 = temporary
+; Address 0 in RAM will store results
+
+start:
+    addi x1, x0, 1      ; x1 = 1 (first fibonacci number)
+    addi x2, x0, 0      ; x2 = 0 (second fibonacci number)
+
+loop:
+    add x3, x1, x2      ; x3 = x1 + x2 (next fibonacci)
+    addi x2, x1, 0      ; x2 = x1 (shift previous)
+    addi x1, x3, 0      ; x1 = x3 (shift current)
+
+    sw x1, 0(x0)        ; Store result to RAM address 0
+    lw x4, 0(x0)        ; Load it back to x4
+
+    beq x0, x0, loop    ; Loop forever
+
+      __endasm
+    };
+  end
+`endif
+
+endmodule
+
+`endif
+
@@ -0,0 +1,146 @@
+
+`include "hvsync_generator.v"
+`include "femtorv32.v"
+
+// Top level: a FemtoRV32 CPU with RAM and ROM, plus a 4bpp video framebuffer scanned out of RAM.
+module frame_buffer_riscv_top(clk, reset, hsync, vsync, hpaddle, vpaddle, rgb);
+
+  input clk, reset;
+  input hpaddle, vpaddle;       // not used by this module
+  output hsync, vsync;
+  wire display_on;
+  wire [8:0] hpos;
+  wire [8:0] vpos;
+  output reg [3:0] rgb;
+
+  // Memory: 64KB RAM + 4KB ROM
+  reg [31:0] ram[0:16383];   // RAM (16384 x 32 bits = 64KB)
+  reg [31:0] rom[0:1023];   // ROM (1024 x 32 bits = 4KB)
+
+  // FemtoRV32 CPU interface signals
+  wire [31:0] mem_addr;
+  wire [31:0] mem_wdata;
+  wire [3:0]  mem_wmask;
+  reg  [31:0] mem_rdata;
+  wire        mem_rstrb;
+  reg         mem_rbusy;
+  reg         mem_wbusy;
+
+  // Instantiate FemtoRV32 CPU
+  FemtoRV32 #(
+    .RESET_ADDR(32'h00010000),  // Start execution from ROM area
+    .ADDR_WIDTH(24)              // 24-bit internal address bus
+  ) cpu (
+    .clk(clk),
+    .reset(reset),               // FemtoRV32 reset (active high based on code)
+    .mem_addr(mem_addr),
+    .mem_wdata(mem_wdata),
+    .mem_wmask(mem_wmask),
+    .mem_rdata(mem_rdata),
+    .mem_rstrb(mem_rstrb),
+    .mem_rbusy(mem_rbusy),
+    .mem_wbusy(mem_wbusy)
+  );
+
+  // Memory address decoding.
+  // RAM is decoded on the full upper address so that all 64KB are reachable
+  // and accesses at or above 0x10000 (e.g. the ROM window) never alias into RAM.
+  wire ram_sel = (mem_addr[31:16] == 16'h0000); // 0x0000-0xFFFF: RAM (64KB)
+  // Bits above 16 are not decoded; the 4KB ROM is mirrored within 0x10000-0x11FFF.
+  wire rom_sel = (mem_addr[16:13] == 4'b1000);  // 0x10000-0x11FFF: ROM (4KB)
+
+  // Memory read logic: registered read; mem_rbusy is high for the cycle that
+  // follows each read strobe, then drops while mem_rdata stays valid.
+  always @(posedge clk) begin
+    if (mem_rstrb) begin
+      mem_rbusy <= 1;
+      if (rom_sel)
+        mem_rdata <= rom[mem_addr[11:2]];  // Word-aligned ROM access
+      else if (ram_sel)
+        mem_rdata <= ram[mem_addr[15:2]];  // Word-aligned RAM access
+      else
+        mem_rdata <= 32'h00000000;         // Unmapped addresses read as 0
+    end else begin
+      mem_rbusy <= 0;
+    end
+  end
+
+  // Memory write logic (synchronous); writes outside RAM are ignored
+  always @(posedge clk) begin
+    if (ram_sel && |mem_wmask) begin
+      mem_wbusy <= 1;
+      // Byte-wise write masking
+      if (mem_wmask[0]) ram[mem_addr[15:2]][7:0]   <= mem_wdata[7:0];
+      if (mem_wmask[1]) ram[mem_addr[15:2]][15:8]  <= mem_wdata[15:8];
+      if (mem_wmask[2]) ram[mem_addr[15:2]][23:16] <= mem_wdata[23:16];
+      if (mem_wmask[3]) ram[mem_addr[15:2]][31:24] <= mem_wdata[31:24];
+    end else begin
+      mem_wbusy <= 0;
+    end
+  end
+
+  // Video sync generator
+  hvsync_generator hvsync_gen(
+    .clk(clk),
+    .reset(0),
+    .hsync(hsync),
+    .vsync(vsync),
+    .display_on(display_on),
+    .hpos(hpos),
+    .vpos(vpos)
+  );
+
+  // Video framebuffer rendering
+  reg [13:0] vindex;       // Index into framebuffer
+  reg [31:0] vshift;       // Shift register with current word to output
+
+  always @(posedge clk) begin
+    if (display_on) begin
+      // Load next word from RAM every 8 pixels (32 bits = 8 pixels at 4bpp)
+      if (hpos[2:0] == 3'b000) begin
+        vshift <= ram[vindex];  // Framebuffer word; vindex restarts at RAM word 0 each frame
+        vindex <= vindex + 1;
+      end else begin
+        vshift <= vshift >> 4;  // Shift next 4-bit pixel
+      end
+      // Output the low nibble of the shift register as the pixel color
+      rgb <= vshift[3:0];
+    end else begin
+      rgb <= 0;  // Set color to black
+      if (vsync) vindex <= 0;  // Reset vindex every frame
+    end
+  end
+
+  // Test program - simple pattern generator
+  // NOTE(review): `lui x4, 0x20` yields 0x20000 under standard LUI semantics, while
+  // its comment says 0x10000 (0x10 << 12) - confirm the intended end address.
+  // NOTE(review): `.org 0x8000` differs from RESET_ADDR (0x10000) - confirm the
+  // assembler's .org units.
+`ifdef EXT_INLINE_ASM
+  initial begin
+    rom = '{
+      __asm
+.arch riscv
+.org 0x8000
+.len 0x400
+
+; RISC-V test program - fill framebuffer with pattern
+; x1 = loop counter
+; x2 = RAM address
+; x3 = pattern value
+
+start:
+    lui x2, 0x0           ; x2 = 0x0 (framebuffer start)
+    addi x1, x0, 0        ; x1 = 0 (counter)
+    lui x4, 0x20          ; x4 = 0x10000 (0x10 << 12)
+
+loop:
+    add x3, x1, x0       ; x3 = counter value as pattern
+    sw x3, 0(x2)         ; Store pattern to framebuffer
+    addi x2, x2, 4       ; Increment address by 4 bytes
+    addi x1, x1, 1       ; Increment counter
+    blt x2, x4, loop     ; Loop if address < end
+
+    ; Infinite loop to restart
+    lui x2, 0x2          ; Reset to start
+    addi x1, x1, 1       ; Increment counter
+    jal x0, loop         ; Jump back to loop
+
+      __endasm
+    };
+  end
+`endif
+
+endmodule
+
@@ -1,7 +1,8 @@
 {
  "name":"riscv",
+  "width":32,
  "vars":{
-    "reg":{"bits":5, "toks":["zero","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10","x11","x12","x13","x14","x15"]},
+    "reg":{"bits":5, "toks":["x0","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10","x11","x12","x13","x14","x15"]},
    "brop":{"bits":3, "toks":["beq","bne","bx2","bx3","blt","bge","bltu","bgeu"]},
    "imm5":{"bits":5},
    "imm12":{"bits":12},
@@ -45,7 +45,10 @@ var VERILOG_PRESETS = [
  {id:'cpu16.v', name:'16-Bit CPU'},
  {id:'cpu_platform.v', name:'CPU Platform'},
  {id:'test2.asm', name:'16-bit ASM Game'},
-  {id:'cpu6502.v', name:'6502 CPU'},
+
+  {id:'cpu6502.v', name:'6502 CPU', category:'Extra'},
+  {id:'femtorv32.v', name:'RISC-V CPU'},
+  {id:'framebuffer_riscv.v', name:'RISC-V Frame Buffer'},

  {id:'test_pattern.ice', name:'Test Pattern', category:'Silice'},
  {id:'copperbars.ice', name:'Animated Bars'},
@@ -0,0 +1,596 @@
+import { describe, it } from "mocha";
+import assert from "assert";
+import { Assembler } from "../worker/assembler";
+
+describe('Assembler', function () {
+
+  describe('Basic Assembly', function () {
+    it('Should assemble simple 8-bit instructions', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {
+          reg: { bits: 3, toks: ['r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7'] },
+          imm8: { bits: 8 }
+        },
+        rules: [
+          { fmt: 'nop', bits: ['00000000'] },
+          { fmt: 'mov ~reg,~reg', bits: ['10', 0, 1] },
+          { fmt: 'add ~reg,~imm8', bits: ['00001', 0, 1] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('nop');
+      asm.assemble('mov r1,r2');
+      asm.assemble('add r3,$42');
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 0, 'Should have no errors');
+      assert.equal(state.output[0], 0x00, 'NOP should be 0x00');
+      assert.equal(state.output[1], 0b10001010, 'MOV r1,r2');
+      assert.equal(state.output[2], 0b00001011, 'ADD r3,imm first byte');
+      assert.equal(state.output[3], 0x42, 'ADD r3,imm second bytes');
+    });
+
+    it('Should handle labels', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {
+          imm8: { bits: 8 }
+        },
+        rules: [
+          { fmt: 'jmp ~imm8', bits: ['11110000', 0] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('jmp target');
+      asm.assemble('nop: jmp nop');
+      asm.assemble('target: jmp target');
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 0, 'Should have no errors');
+      assert.equal(state.output[0], 0xf0, 'JMP opcode');
+      assert.equal(state.output[1], 4, 'Should jump to address 4');
+      assert.equal(state.output[2], 0xf0, 'JMP opcode');
+      assert.equal(state.output[3], 2, 'Should jump to itself (address 2)');
+      assert.equal(state.output[4], 0xf0, 'JMP opcode');
+      assert.equal(state.output[5], 4, 'Should jump to itself (address 4)');
+    });
+  });
+
+  describe('PC-Relative Addressing', function () {
+    it('Should handle simple PC-relative branches', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {
+          rel8: { bits: 8, iprel: true, ipofs: 0, ipmul: 1 }
+        },
+        rules: [
+          { fmt: 'br ~rel8', bits: ['10000000', 0] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('br forward');   // offset 0
+      asm.assemble('br forward');   // offset 2
+      asm.assemble('br forward');   // offset 4
+      asm.assemble('forward: br forward'); // offset 6
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 0, 'Should have no errors');
+      assert.equal(state.output[1], 6, 'First branch offset should be 6');
+      assert.equal(state.output[3], 4, 'Second branch offset should be 4');
+      assert.equal(state.output[5], 2, 'Third branch offset should be 2');
+      assert.equal(state.output[7], 0, 'Fourth branch offset should be 0 (self)');
+    });
+
+    it('Should handle PC-relative with instruction multiplier', function () {
+      // Simulate word-addressed architecture where PC increments by 4
+      const spec = {
+        name: 'test32',
+        width: 32,
+        vars: {
+          rel13: { bits: 13, iprel: true, ipofs: 0, ipmul: 4 }
+        },
+        rules: [
+          { fmt: 'beq ~rel13', bits: ['1100011000000000000', 0] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('beq target');   // offset 0
+      asm.assemble('beq target');   // offset 4
+      asm.assemble('target: beq target'); // offset 8
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 0, 'Should have no errors');
+      // PC-relative offset = (target - current) * ipmul, with IPs counted in 32-bit words
+      // First: (2 - 0) * 4 = 8
+      // Second: (2 - 1) * 4 = 4
+      // Third: (2 - 2) * 4 = 0
+      const first = state.output[0];
+      const second = state.output[1];
+      const third = state.output[2];
+
+      // Extract the 13-bit immediate from the instruction
+      // It's in the lower 13 bits
+      const offset1 = first & 0x1fff;
+      const offset2 = second & 0x1fff;
+      const offset3 = third & 0x1fff;
+
+      assert.equal(offset1, 8, 'First branch offset should be 8');
+      assert.equal(offset2, 4, 'Second branch offset should be 4');
+      assert.equal(offset3, 0, 'Third branch offset should be 0');
+    });
+  });
+
+  describe('Bit Slicing', function () {
+    it('Should extract bit slices correctly', function () {
+      // RISC-V style branch with scrambled immediate
+      const spec = {
+        name: 'riscv',
+        width: 32,
+        vars: {
+          reg: { bits: 5, toks: ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7'] },
+          rel13: { bits: 13, iprel: true, ipofs: 0, ipmul: 1 }
+        },
+        rules: [
+          // beq rs1, rs2, offset
+          // Format: imm[12] | imm[10:5] | rs2 | rs1 | 000 | imm[4:1] | imm[11] | 1100011
+          {
+            fmt: 'beq ~reg,~reg,~rel13',
+            bits: [
+              { a: 2, b: 12, n: 1 },  // imm[12]
+              { a: 2, b: 5, n: 6 },   // imm[10:5]
+              1,                       // rs2
+              0,                       // rs1
+              '000',                   // funct3
+              { a: 2, b: 1, n: 4 },   // imm[4:1]
+              { a: 2, b: 11, n: 1 },  // imm[11]
+              '1100011'                // opcode
+            ]
+          }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('target: beq x1,x2,target'); // Self-branch with offset 0
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 0, 'Should have no errors');
+
+      const insn = state.output[0];
+      // Decode the instruction to verify bit positions
+      const opcode = insn & 0x7f;
+      const imm_11 = (insn >> 7) & 1;
+      const imm_4_1 = (insn >> 8) & 0xf;
+      const funct3 = (insn >> 12) & 0x7;
+      const rs1 = (insn >> 15) & 0x1f;
+      const rs2 = (insn >> 20) & 0x1f;
+      const imm_10_5 = (insn >> 25) & 0x3f;
+      const imm_12 = (insn >> 31) & 1;
+
+      assert.equal(opcode, 0x63, 'Opcode should be 0x63 (branch)');
+      assert.equal(funct3, 0, 'funct3 should be 0 (BEQ)');
+      assert.equal(rs1, 1, 'rs1 should be 1 (x1)');
+      assert.equal(rs2, 2, 'rs2 should be 2 (x2)');
+
+      // All immediate bits should be 0 for self-branch
+      assert.equal(imm_12, 0, 'imm[12] should be 0');
+      assert.equal(imm_11, 0, 'imm[11] should be 0');
+      assert.equal(imm_10_5, 0, 'imm[10:5] should be 0');
+      assert.equal(imm_4_1, 0, 'imm[4:1] should be 0');
+    });
+
+    it('Should handle non-zero bit slice offsets', function () {
+      const spec = {
+        name: 'riscv',
+        width: 32,
+        vars: {
+          brop: { bits: 3, toks: ["beq","bne","bx2","bx3","blt","bge","bltu","bgeu"] },
+          reg: { bits: 5, toks: ['x0', 'x1', 'x2'] },
+          rel13: { bits: 13, iprel: true, ipofs: 0, ipmul: 4 }
+        },
+        rules: [
+          {
+            fmt: '~brop ~reg,~reg,~rel13',
+            bits: [
+              { a: 3, b: 12, n: 1 },
+              { a: 3, b: 5, n: 6 },
+              2,
+              1,
+              '000',
+              { a: 3, b: 1, n: 4 },
+              { a: 3, b: 11, n: 1 },
+              '1100011'
+            ]
+          }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 4096');
+      asm.assemble('.len 1024');
+      asm.assemble('beq x1,x2,target');  // offset 0
+      asm.assemble('beq x1,x2,target');  // offset 4
+      asm.assemble('beq x1,x2,target');  // offset 8
+      asm.assemble('target: beq x1,x2,target'); // offset 12 (self)
+      asm.assemble('beq x1,x2,target'); // offset 16
+      asm.assemble('beq x1,x2,target'); // offset 20
+
+/*
+00208663
+00208463
+00208263
+00208063
+fe208ee3
+fe208ce3
+*/
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 0, 'Should have no errors');
+      assert.equal(state.output[0], 0x208663, 'insn 0');
+      assert.equal(state.output[1], 0x208463, 'insn 1');
+      assert.equal(state.output[2], 0x208263, 'insn 2');
+      assert.equal(state.output[3], 0x208063, 'insn 3');
+      assert.equal(state.output[4], 0xfe208ee3|0, 'insn 4');
+      assert.equal(state.output[5], 0xfe208ce3|0, 'insn 5');
+
+      // Check that offset 12 was correctly calculated and sliced
+      const insn0 = state.output[0];
+
+      // Reconstruct the immediate from the instruction
+      const imm_11 = (insn0 >> 7) & 1;
+      const imm_4_1 = (insn0 >> 8) & 0xf;
+      const imm_10_5 = (insn0 >> 25) & 0x3f;
+      const imm_12 = (insn0 >> 31) & 1;
+
+      //console.log('insn0: $', insn0.toString(16), 'imm_12:', imm_12, 'imm_11:', imm_11, 'imm_10_5:', imm_10_5, 'imm_4_1:', imm_4_1);
+
+      // Reconstruct the 13-bit signed offset (bit 0 is implicit 0)
+      const offset = (imm_12 << 12) | (imm_11 << 11) | (imm_10_5 << 5) | (imm_4_1 << 1);
+
+      assert.equal(offset, 12, 'Offset should be 12 bytes');
+    });
+  });
+
+  describe('Endianness', function () {
+    it('Should handle little-endian values', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {
+          imm16: { bits: 16, endian: 'little' as const }
+        },
+        rules: [
+          { fmt: 'ldi ~imm16', bits: ['10000000', 0] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('ldi $1234');
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 0, 'Should have no errors');
+      assert.equal(state.output[0], 0x80, 'Opcode');
+      assert.equal(state.output[1], 0x34, 'Low byte first (little-endian)');
+      assert.equal(state.output[2], 0x12, 'High byte second');
+    });
+
+    it('Should handle big-endian values', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {
+          imm16: { bits: 16, endian: 'big' as const }
+        },
+        rules: [
+          { fmt: 'ldi ~imm16', bits: ['10000000', 0] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('ldi $1234');
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 0, 'Should have no errors');
+      assert.equal(state.output[0], 0x80, 'Opcode');
+      assert.equal(state.output[1], 0x12, 'High byte first (big-endian)');
+      assert.equal(state.output[2], 0x34, 'Low byte second');
+    });
+  });
+
+  describe('Directives', function () {
+    it('Should handle .org directive', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {},
+        rules: [
+          { fmt: 'nop', bits: ['00000000'] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 100');
+      asm.assemble('.len 256');
+      asm.assemble('nop');
+
+      const state = asm.finish();
+      assert.equal(state.origin, 100, 'Origin should be 100');
+      assert.equal(state.ip, 101, 'IP should be at 101');
+      assert.equal(state.output[0], 0x00, 'NOP at origin');
+    });
+
+    it('Should handle .data directive', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {},
+        rules: []
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('.data 10 20 $30');
+
+      const state = asm.finish();
+      assert.equal(state.output[0], 10);
+      assert.equal(state.output[1], 20);
+      assert.equal(state.output[2], 0x30);
+    });
+
+    it('Should handle .string directive', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {},
+        rules: []
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('.string HELLO');
+
+      const state = asm.finish();
+      assert.equal(state.output[0], 'H'.charCodeAt(0));
+      assert.equal(state.output[1], 'E'.charCodeAt(0));
+      assert.equal(state.output[2], 'L'.charCodeAt(0));
+      assert.equal(state.output[3], 'L'.charCodeAt(0));
+      assert.equal(state.output[4], 'O'.charCodeAt(0));
+    });
+
+    it('Should handle .align directive', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {},
+        rules: [
+          { fmt: 'nop', bits: ['00000000'] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('nop');        // offset 0
+      asm.assemble('nop');        // offset 1
+      asm.assemble('.align 4');   // align to 4
+      asm.assemble('nop');        // offset 4
+
+      const state = asm.finish();
+      assert.equal(state.lines[2].offset, 4, 'Should align to offset 4');
+    });
+  });
+
+  describe('Error Handling', function () {
+    it('Should detect undefined labels', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {
+          imm8: { bits: 8 }
+        },
+        rules: [
+          { fmt: 'jmp ~imm8', bits: ['11110000', 0] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('jmp undefined_label');
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 1, 'Should have one error');
+      assert(state.errors[0].msg.includes('undefined_label'), 'Error should mention undefined_label');
+    });
+
+    it('Should detect value overflow', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {
+          imm4: { bits: 4 }
+        },
+        rules: [
+          { fmt: 'mov ~imm4', bits: ['1111', 0] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('mov 20'); // 20 > 15 (max 4-bit value)
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 1, 'Should have one error');
+      assert(state.errors[0].msg.includes('does not fit'), 'Error should mention overflow');
+    });
+
+    it('Should detect invalid instructions', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {},
+        rules: [
+          { fmt: 'nop', bits: ['00000000'] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('invalid_instruction');
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 1, 'Should have one error');
+      assert(state.errors[0].msg.includes('Could not decode'), 'Error should mention decode failure');
+    });
+  });
+
+  describe('32-bit Width', function () {
+    it('Should handle 32-bit instructions', function () {
+      const spec = {
+        name: 'test32',
+        width: 32,
+        vars: {
+          reg: { bits: 5, toks: ['r0', 'r1', 'r2', 'r3', 'r4'] },
+          imm12: { bits: 12 }
+        },
+        rules: [
+          // RISC-V ADDI format: imm[11:0] | rs1 | 000 | rd | 0010011
+          {
+            fmt: 'addi ~reg,~reg,~imm12',
+            bits: [2, 1, '000', 0, '0010011']
+          }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('addi r1,r2,$123');
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 0, 'Should have no errors');
+
+      const insn = state.output[0];
+      const opcode = insn & 0x7f;
+      const rd = (insn >> 7) & 0x1f;
+      const funct3 = (insn >> 12) & 0x7;
+      const rs1 = (insn >> 15) & 0x1f;
+      const imm = (insn >> 20) & 0xfff;
+
+      assert.equal(opcode, 0x13, 'Opcode should be 0x13 (OP-IMM)');
+      assert.equal(rd, 1, 'rd should be 1');
+      assert.equal(rs1, 2, 'rs1 should be 2');
+      assert.equal(funct3, 0, 'funct3 should be 0 (ADDI)');
+      assert.equal(imm, 0x123, 'Immediate should be 0x123');
+    });
+  });
+
+  describe('Symbol Definition', function () {
+    it('Should allow symbol definition with .define', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {
+          imm8: { bits: 8 }
+        },
+        rules: [
+          { fmt: 'ldi ~imm8', bits: ['10000000', 0] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('.define MYCONST 42');
+      asm.assemble('ldi MYCONST');
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 0, 'Should have no errors');
+      assert.equal(state.output[1], 42, 'Should use defined constant');
+    });
+  });
+
+  describe('Complex Fixups', function () {
+    it('Should handle multiple fixups to same location', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {
+          imm8: { bits: 8 }
+        },
+        rules: [
+          { fmt: 'jmp ~imm8', bits: ['11110000', 0] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('jmp target');
+      asm.assemble('jmp target');
+      asm.assemble('target: jmp target');
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 0, 'Should have no errors');
+      assert.equal(state.output[1], 4, 'First jump to target');
+      assert.equal(state.output[3], 4, 'Second jump to target');
+      assert.equal(state.output[5], 4, 'Third jump to itself');
+    });
+
+    it('Should handle backward references', function () {
+      const spec = {
+        name: 'test8',
+        width: 8,
+        vars: {
+          rel8: { bits: 8, iprel: true, ipofs: 0, ipmul: 1 }
+        },
+        rules: [
+          { fmt: 'br ~rel8', bits: ['10000000', 0] }
+        ]
+      };
+
+      const asm = new Assembler(spec);
+      asm.assemble('.org 0');
+      asm.assemble('.len 256');
+      asm.assemble('start: br forward');  // offset 0
+      asm.assemble('br start');           // offset 2 (backward)
+      asm.assemble('forward: br start');  // offset 4 (backward)
+
+      const state = asm.finish();
+      assert.equal(state.errors.length, 0, 'Should have no errors');
+      assert.equal(state.output[1], 4, 'Forward branch');
+
+      // Backward branches: offset = target - current
+      // For offset 2: target=0, current=2, offset=-2 (0xfe in 8-bit signed)
+      assert.equal(state.output[3] & 0xff, 0xfe, 'Backward branch from offset 2');
+
+      // For offset 4: target=0, current=4, offset=-4 (0xfc in 8-bit signed)
+      assert.equal(state.output[5] & 0xff, 0xfc, 'Backward branch from offset 4');
+    });
+  });
+});
@@ -7,7 +7,7 @@ type Symbol = {

 type AssemblerVar = {
  bits : number,
-  toks : string[],
+  toks? : string[],
  endian? : Endian,
  iprel? : boolean,
  ipofs? : number,
@@ -44,7 +44,9 @@ type AssemblerFixup = {
  iprel:boolean,
  ipofs:number,
  ipmul:number,
-  endian:Endian
+  endian:Endian,
+  rule:AssemblerRule,
+  m:string[]
 };

 type AssemblerSpec = {
@@ -135,6 +137,7 @@ export class Assembler {
    s = s.replace(/\(/g, '\\(');
    s = s.replace(/\)/g, '\\)');
    s = s.replace(/\./g, '\\.');
+    s = s.replace(/\,/g, '\\s*,\\s*'); // TODO?
    // TODO: more escapes?
    s = s.replace(/~\w+/g, (varname:string) => {
      varname = varname.substr(1);
@@ -245,12 +248,13 @@ export class Assembler {
    var oplen = 0;
    // iterate over each component of the rule output ("bits")
    for (let b of rule.bits) {
-      let n,x;
+      let nbits : number
+      let value : number;
      // is a string? then it's a bit constant
      // TODO
      if (typeof b === "string") {
-        n = b.length;
-        x = parseInt(b,2);
+        nbits = b.length;
+        value = parseInt(b,2);
      } else {
        // is it a slice {a,b,n} or just a number?
        var index = typeof b === "number" ? b : b.a;
@@ -260,44 +264,46 @@ export class Assembler {
        if (!v) {
          return {error:`Could not find matching identifier for '${m[0]}' index ${index}`};
        }
-        n = v.bits;
+        nbits = v.bits;
        var shift = 0;
        if (typeof b !== "number") {
-          n = b.n;
+          nbits = b.n;
          shift = b.b;
        }
        // is it an enumerated type? look up the index of its keyword
        if (v.toks) {
-          x = v.toks.indexOf(id);
-          if (x < 0)
+          value = v.toks.indexOf(id);
+          if (value < 0)
            return {error:"Can't use '" + id + "' here, only one of: " + v.toks.join(', ')};
        } else {
          // otherwise, parse it as a constant
-          x = this.parseConst(id, n);
+          value = this.parseConst(id, nbits);
          // is it a label? add fixup
-          if (isNaN(x)) {
+          if (isNaN(value)) {
            this.fixups.push({
              sym:id, ofs:this.ip, size:v.bits, line:this.linenum,
-              dstlen:n, dstofs:oplen, srcofs:shift,
+              dstlen:nbits, dstofs:oplen, srcofs:shift,
              endian:v.endian,
-              iprel:!!v.iprel, ipofs:(v.ipofs+0), ipmul:v.ipmul||1
+              iprel:!!v.iprel, ipofs:(v.ipofs||0), ipmul:v.ipmul||1,
+              rule, m
            });
-            x = 0;
+            //console.log(id, shift, oplen, nbits, v.bits);
+            value = 0;
          } else {
            var mask = (1<<v.bits)-1;
-            if ((x&mask) != x)
-              return {error:"Value " + x + " does not fit in " + v.bits + " bits"};
+            if ((value&mask) != value)
+              return {error:"Value " + value + " does not fit in " + v.bits + " bits"};
          }
        }
        // if little endian, we need to swap ordering
-        if (v.endian == 'little') x = this.swapEndian(x, v.bits);
+        if (v.endian == 'little') value = this.swapEndian(value, v.bits);
        // is it an array slice? slice the bits
        if (typeof b !== "number") {
-          x = (x >>> shift) & ((1 << b.n)-1);
+          value = (value >>> shift) & ((1 << b.n)-1);
        }
      }
-      opcode = (opcode << n) | x;
-      oplen += n;
+      opcode = (opcode << nbits) | value;
+      oplen += nbits;
    }
    if (oplen == 0)
      this.warning("Opcode had zero length");
@@ -385,35 +391,73 @@ export class Assembler {
  }

  applyFixup(fix: AssemblerFixup, sym: Symbol) {
-    var ofs = fix.ofs + Math.floor(fix.dstofs/this.width);
+    // Calculate the word offset where we'll apply this fixup
+    // fix.ofs is the instruction address, fix.dstofs is bit position within instruction
+    var ofs = fix.ofs + Math.floor(fix.dstofs / this.width);
+
+    // Create mask for the full symbol size (used for range checking)
    var mask = ((1<<fix.size)-1);
+
+    // Get the symbol's value (e.g., target address for a branch/jump)
    var value = this.parseConst(sym.value+"", fix.dstlen);
+
+    // Handle PC-relative addressing (branches, relative jumps)
+    // Converts absolute address to relative offset from current instruction
+    // value = (target - current_pc) * ipmul - ipofs
+    // - ipmul: multiplier for instruction units vs byte units (e.g., 4 for word-addressed)
+    // - ipofs: additional offset adjustment (e.g., for architectures with PC+offset)
    if (fix.iprel)
      value = (value - fix.ofs) * fix.ipmul - fix.ipofs;
+
+    // Range check: warn if the value exceeds the variable's full width (fix.size bits)
+    // Only checked when the extracted slice starts at bit 0 (srcofs == 0)
    if (fix.srcofs == 0 && (value > mask || value < -mask))
      this.warning("Symbol " + fix.sym + " (" + value + ") does not fit in " + fix.dstlen + " bits", fix.line);
+
    //console.log(hex(value,8), fix.srcofs, fix.dstofs, fix.dstlen);
+
+    // Extract bit slice if needed (e.g., bits [12:5] from a 13-bit immediate)
+    // srcofs is the starting bit position to extract from the value
    if (fix.srcofs > 0)
      value >>>= fix.srcofs;
+
+    // Mask to only the bits we want to insert (dstlen bits)
    value &= (1 << fix.dstlen) - 1;
+
+    // Position the value within the instruction word
+    // For 32-bit width: shift value left to align with destination bit position
+    // dstofs is counted from MSB, so we shift to put our bits in the right place
    // TODO: make it work for all widths
    if (this.width == 32) {
      var shift = 32 - fix.dstofs - fix.dstlen;
      value <<= shift;
    }
+
+    // Apply the fixup to the output
+    // TODO: check range
+    //console.log(fix, this.width);
    if (fix.size <= this.width) {
+      // Simple case: fixup fits in one word, just XOR it in
      this.outwords[ofs - this.origin] ^= value;
    } else {
+      // Complex case: multi-byte fixup (e.g., 32-bit immediate in 8-bit words)
      // swap if we want big endian (we'll apply in LSB first order)
      if (fix.endian == 'big') value = this.swapEndian(value, fix.size);
-      // apply multi-byte fixup
+
+      // Apply fixup across multiple words
      while (value) {
-        if (value & this.outwords[ofs - this.origin]) {
-          this.warning("Instruction bits overlapped: " + hex(this.outwords[ofs - this.origin],8), hex(value,8));
+        // Extract the low bits for this word
+        const v = value & ((1<<this.width)-1);
+
+        // Check for overlap (trying to set bits that are already set)
+        // TODO: check against mask
+        if (v & this.outwords[ofs - this.origin]) {
+          this.warning(`Instruction bits overlapped at bits ${fix.dstofs}:${fix.dstofs+fix.dstlen-1}: ${fix.rule.fmt} -> "${fix.sym}" ${hex(this.outwords[ofs - this.origin],8)} & ${hex(v,8)}`, fix.line);
        } else {
-          this.outwords[ofs - this.origin] ^= value & ((1<<this.width)-1);
+          this.outwords[ofs - this.origin] ^= v;
        }
+
+        // Move to next word
        value >>>= this.width;
        ofs++;
      }
@@ -95,7 +95,7 @@ function compileInlineASM(code: string, platform, options, errors, asmlines) {
                asmout.errors[i].line += firstline;
                errors.push(asmout.errors[i]);
            }
-            return "";
+            return '`error "inline assembly failed"';
        } else if (asmout.output) {
            let s = "";
            var out = asmout.output;