From c80be07f73dbc1332b7cf1445d701f69319060e3 Mon Sep 17 00:00:00 2001 From: Andy McFadden Date: Fri, 2 Nov 2018 13:49:27 -0700 Subject: [PATCH] Work around Merlin 32 instruction parsing bug The 2014-label-dp test now passes. Prior regression tests are unaffected. Also, renamed an IGenerator interface to more accurately reflect its role. (issue #37) --- SourceGen/AsmGen/AsmCc65.cs | 9 +- SourceGen/AsmGen/AsmMerlin32.cs | 28 +- SourceGen/AsmGen/AsmTass64.cs | 3 +- SourceGen/AsmGen/GenCommon.cs | 2 +- SourceGen/AsmGen/IGenerator.cs | 13 +- SourceGen/RuntimeData/Help/codegen.html | 23 +- .../Expected/2014-label-dp_Merlin32.S | 289 ++++++++++++++++++ SourceGen/Symbol.cs | 13 + 8 files changed, 356 insertions(+), 24 deletions(-) create mode 100644 SourceGen/SGTestData/Expected/2014-label-dp_Merlin32.S diff --git a/SourceGen/AsmGen/AsmCc65.cs b/SourceGen/AsmGen/AsmCc65.cs index db20bac..1819012 100644 --- a/SourceGen/AsmGen/AsmCc65.cs +++ b/SourceGen/AsmGen/AsmCc65.cs @@ -259,11 +259,12 @@ namespace SourceGen.AsmGen { } /// - /// Map the mnemonics we chose for undocumented opcodes to the cc65 mnemonics. - /// After switching to the Unintended Opcodes mnemonics there's almost no difference. + /// Map the undocumented opcodes to the cc65 mnemonics. There's almost no difference + /// vs. the Unintended Opcodes mnemonics. /// /// We don't include the double- and triple-byte NOPs here, as cc65 doesn't - /// appear to have a definition for them (as of 2.17). + /// appear to have a definition for them (as of 2.17). We also omit the alias + /// for SBC. These will all be output as hex. 
/// private static Dictionary sUndocMap = new Dictionary() { { OpName.ALR, "alr" }, // imm 0x4b @@ -288,7 +289,7 @@ namespace SourceGen.AsmGen { }; // IGenerator - public string ReplaceMnemonic(OpDef op) { + public string ModifyOpcode(int offset, OpDef op) { if ((op == OpDef.OpWDM_WDM || op == OpDef.OpBRK_StackInt) && mAsmVersion <= V2_17) { // cc65 v2.17 doesn't support WDM, and assembles BRK to opcode $05. // https://github.com/cc65/cc65/issues/715 diff --git a/SourceGen/AsmGen/AsmMerlin32.cs b/SourceGen/AsmGen/AsmMerlin32.cs index 9414c2f..0b0d2fe 100644 --- a/SourceGen/AsmGen/AsmMerlin32.cs +++ b/SourceGen/AsmGen/AsmMerlin32.cs @@ -328,12 +328,34 @@ namespace SourceGen.AsmGen { } // IGenerator - public string ReplaceMnemonic(OpDef op) { + public string ModifyOpcode(int offset, OpDef op) { if (op.IsUndocumented) { return null; - } else { - return string.Empty; } + + // The assembler works correctly if the symbol is defined as a two-digit hex + // value (e.g. "foo equ $80") but fails if it's four (e.g. "foo equ $0080"). We + // output symbols with minimal digits, but we have no control over labels when + // the code has a zero-page EQU. So if the operand is a reference to a user + // label, we need to output the instruction as hex. + if (op == OpDef.OpPEI_StackDPInd || + op == OpDef.OpSTY_DPIndexX || + op == OpDef.OpSTX_DPIndexY || + op.AddrMode == OpDef.AddressMode.DPIndLong || + op.AddrMode == OpDef.AddressMode.DPInd || + op.AddrMode == OpDef.AddressMode.DPIndexXInd) { + FormatDescriptor dfd = Project.GetAnattrib(offset).DataDescriptor; + if (dfd != null && dfd.HasSymbol) { + // It has a symbol. See if the symbol target is a label (auto or user). 
+ if (Project.SymbolTable.TryGetValue(dfd.SymbolRef.Label, out Symbol sym)) { + if (sym.IsInternalLabel) { + return null; + } + } + } + } + + return string.Empty; } // IGenerator diff --git a/SourceGen/AsmGen/AsmTass64.cs b/SourceGen/AsmGen/AsmTass64.cs index 4c11f38..d4bb7aa 100644 --- a/SourceGen/AsmGen/AsmTass64.cs +++ b/SourceGen/AsmGen/AsmTass64.cs @@ -244,7 +244,6 @@ namespace SourceGen.AsmGen { } else if (cpuDef.Type == CpuDef.CpuType.Cpu6502 && cpuDef.HasUndocumented) { cpuStr = "6502i"; } else { - // 6502 def includes undocumented ops cpuStr = "6502"; } @@ -253,7 +252,7 @@ namespace SourceGen.AsmGen { } // IGenerator - public string ReplaceMnemonic(OpDef op) { + public string ModifyOpcode(int offset, OpDef op) { if (op.IsUndocumented) { if (Project.CpuDef.Type == CpuDef.CpuType.Cpu65C02) { // none of the "LDD" stuff is handled diff --git a/SourceGen/AsmGen/GenCommon.cs b/SourceGen/AsmGen/GenCommon.cs index 996b0e1..c3345e5 100644 --- a/SourceGen/AsmGen/GenCommon.cs +++ b/SourceGen/AsmGen/GenCommon.cs @@ -172,7 +172,6 @@ namespace SourceGen.AsmGen { wdis = OpDef.GetWidthDisambiguation(instrLen, operand); } - string replMnemonic = gen.ReplaceMnemonic(op); string opcodeStr = formatter.FormatOpcode(op, wdis); string formattedOperand = null; @@ -268,6 +267,7 @@ namespace SourceGen.AsmGen { } string commentStr = formatter.FormatEolComment(eolComment); + string replMnemonic = gen.ModifyOpcode(offset, op); if (attr.Length != instrBytes) { // This instruction has another instruction inside it. Throw out what we // computed and just output as bytes. diff --git a/SourceGen/AsmGen/IGenerator.cs b/SourceGen/AsmGen/IGenerator.cs index 27fae60..6d7afbe 100644 --- a/SourceGen/AsmGen/IGenerator.cs +++ b/SourceGen/AsmGen/IGenerator.cs @@ -85,14 +85,14 @@ namespace SourceGen.AsmGen { List GenerateSource(BackgroundWorker worker); /// - /// Provides an opportunity for the assembler to replace a mnemonic with another. 
This - /// is primarily intended for undocumented ops, which don't have standard mnemonics, - /// and hence can vary between assemblers. + /// Provides an opportunity for the assembler to replace a mnemonic with another, or + /// output an instruction as hex bytes. /// + /// Opcode offset. /// Opcode to replace. /// Replacement mnemonic, an empty string if the original is fine, or - /// null if the op is not supported at all and should be emitted as hex. - string ReplaceMnemonic(OpDef op); + /// null if the op is unsupported or broken and should be emitted as hex. + string ModifyOpcode(int offset, OpDef op); /// /// Generates an opcode/operand pair for a short sequence of bytes (1-4 bytes). @@ -163,6 +163,9 @@ namespace SourceGen.AsmGen { void OutputLine(string fullLine); } + /// + /// Enumeration of quirky or buggy behavior that GenCommon needs to handle. + /// public class AssemblerQuirks { /// /// Are the arguments to MVN/MVP reversed? diff --git a/SourceGen/RuntimeData/Help/codegen.html b/SourceGen/RuntimeData/Help/codegen.html index 1052a5f..71da80d 100644 --- a/SourceGen/RuntimeData/Help/codegen.html +++ b/SourceGen/RuntimeData/Help/codegen.html @@ -148,8 +148,8 @@ code, but also needs to know how to handle the corner cases.

as case-sensitive. The --case-sensitive flag must be passed to the assembler.
  • If you set the --case-sensitive flag, all opcodes - and operands must be lower-case. Most of the flags used to show - things in upper case must be disabled.
  • + and operands must be lower-case. Most of the SourceGen options used to + show things in upper case must be disabled.
  • For 65816, selecting the bank byte is done with the back-quote ('`') rather than the caret ('^'). (There's a note in the docs to the effect that they plan to move to carets.)
  • @@ -166,7 +166,7 @@ code, but also needs to know how to handle the corner cases.

  • PC relative branches don't wrap around at bank boundaries.
  • BRK <arg> is assembled to opcode $05 rather than $00.
  • WDM is not supported.
  • -
  • Source file names must not have spaces in them on Windows.
  • +
  • Source file names may not have spaces in them on Windows.
  • Quirks:

    @@ -181,10 +181,11 @@ code, but also needs to know how to handle the corner cases.

  • Undocumented opcodes: SBX ($cb) uses the mnemonic AXS. All other opcodes match up with the "unintended opcodes" document.
  • ca65 is implemented as a single-pass assembler, so label widths - can't always be known in time. For example, if you .ORG $0000 after - the point where the labels are used, the assembler will already have - generated them as absolute values. Width disambiguation must be applied - to instructions that aren't ambiguous to multi-pass assemblers.
  • + can't always be known in time. For example, if you use some zero-page + labels, but they're defined via .ORG $0000 after the point where the + labels are used, the assembler will already have generated them as + absolute values. Width disambiguation must be applied to operands + that wouldn't be ambiguous to a multi-pass assembler.
  • The assembler is geared toward generating relocatable code with multiple segments (it is, after all, an assembler for a C compiler). A linker script is expected to be provided for anything complex. Since @@ -201,8 +202,12 @@ code, but also needs to know how to handle the corner cases.

    • PC relative branches don't wrap around at bank boundaries.
    • For some failures, an exit code of zero is returned.
    • -
    • Some indexed store instructions cause errors if the label isn't - unambiguously DP (e.g. `STX $00,X` vs. `STX $0000,X`).
    • +
    • Some DP indexed store instructions cause errors if the label isn't + unambiguously DP (e.g. STX $00,X vs. + STX $0000,X). This isn't a problem with project/platform + symbols, which are output as two-digit hex values when possible, but + causes failures when direct page locations are included in the project + and given labels.

    Quirks:

    diff --git a/SourceGen/SGTestData/Expected/2014-label-dp_Merlin32.S b/SourceGen/SGTestData/Expected/2014-label-dp_Merlin32.S new file mode 100644 index 0000000..8f44ad3 --- /dev/null +++ b/SourceGen/SGTestData/Expected/2014-label-dp_Merlin32.S @@ -0,0 +1,289 @@ +;6502bench SourceGen v1.1.0-dev1 + org $1000 + sec + xce + jsr L101F + jsr L10AB + jsr L10F2 + jsr L1106 + jsr L1109 + jsr L112C + jsr L11F9 + jsr L11FC + nop + nop + nop + brk $80 + +L101F dfb $01,$80 + cop $80 + ora $80,S + tsb L0080 + ora L0080 + asl L0080 + dfb $07,$80 + php + ora #$80 + asl A + phd + tsb: L0086 + ora: L0086 + asl: L0086 + oral L0089 + bpl L1041 +L1041 ora (L0080),y + dfb $12,$80 + ora ($80,S),y + trb L0080 + ora L0080,x + asl L0080,x + ora [L0080],y + clc + ora L0086,y + inc A + tcs + trb: L0086 + ora: L0086,x + asl: L0086,x + oral L0089,x + jsr L0086 + dfb $21,$80 + jsl L0089 + and $80,S + bit L0080 + and L0080 + rol L0080 + dfb $27,$80 + plp + and #$80 + rol A + pld + bit: L0086 + and: L0086 + rol: L0086 + andl L0089 + bmi L1089 +L1089 and (L0080),y + dfb $32,$80 + and ($80,S),y + bit L0080,x + and L0080,x + rol L0080,x + and [L0080],y + sec + and L0086,y + dec A + tsc + bit: L0086,x + and: L0086,x + rol: L0086,x + andl L0089,x + rti + +L10AB dfb $41,$80 + wdm $80 + eor $80,S + mvp $84,$83 + eor L0080 + lsr L0080 + dfb $47,$80 + pha + eor #$80 + lsr A + phk + jmp L10C2 + +L10C2 eor: L0086 + lsr: L0086 + eorl L0089 + bvc L10CE +L10CE eor (L0080),y + dfb $52,$80 + eor ($80,S),y + mvn $84,$83 + eor L0080,x + lsr L0080,x + eor [L0080],y + cli + eor L0086,y + phy + tcd + jml L10E7 + +L10E7 eor: L0086,x + lsr: L0086,x + eorl L0089,x + rts + +L10F2 dfb $61,$80 + per $0ff6 + adc $80,S + stz L0080 + adc L0080 + ror L0080 + dfb $67,$80 + pla + adc #$80 + ror A + rtl + +L1106 jmp (L0086) + +L1109 adc: L0086 + ror: L0086 + adcl L0089 + bvs L1115 +L1115 adc (L0080),y + dfb $72,$80 + adc ($80,S),y + stz L0080,x + adc L0080,x + ror L0080,x + adc [L0080],y + sei + adc L0086,y + ply + tdc + jmp 
(L0086,x) + +L112C adc: L0086,x + ror: L0086,x + adcl L0089,x + bra L1138 + +L1138 dfb $81,$80 + brl L113D + +L113D sta $80,S + sty L0080 + sta L0080 + stx L0080 + dfb $87,$80 + dey + bit #$80 + txa + phb + sty: L0086 + sta: L0086 + stx: L0086 + stal L0089 + bcc L115B +L115B sta (L0080),y + dfb $92,$80 + sta ($80,S),y + dfb $94,$80 + sta L0080,x + dfb $96,$80 + sta [L0080],y + tya + sta L0086,y + txs + txy + stz: L0086 + sta: L0086,x + stz: L0086,x + stal L0089,x + ldy #$80 + dfb $a1,$80 + ldx #$80 + lda $80,S + ldy L0080 + lda L0080 + ldx L0080 + dfb $a7,$80 + tay + lda #$80 + tax + plb + ldy: L0086 + lda: L0086 + ldx: L0086 + ldal L0089 + bcs L11A0 +L11A0 lda (L0080),y + dfb $b2,$80 + lda ($80,S),y + ldy L0080,x + lda L0080,x + ldx L0080,y + lda [L0080],y + clv + lda L0086,y + tsx + tyx + ldy: L0086,x + lda: L0086,x + ldx: L0086,y + ldal L0089,x + cpy #$80 + dfb $c1,$80 + rep #$00 + cmp $80,S + cpy L0080 + cmp L0080 + dec L0080 + dfb $c7,$80 + iny + cmp #$80 + dex + wai + cpy: L0086 + cmp: L0086 + dec: L0086 + cmpl L0089 + bne L11E5 +L11E5 cmp (L0080),y + dfb $d2,$80 + cmp ($80,S),y + dfb $d4,$80 + cmp L0080,x + dec L0080,x + cmp [L0080],y + cld + cmp L0086,y + phx + stp + +L11F9 jml [L0086] + +L11FC cmp: L0086,x + dec: L0086,x + cmpl L0089,x + cpx #$80 + dfb $e1,$80 + sep #$00 + sbc $80,S + cpx L0080 + sbc L0080 + inc L0080 + dfb $e7,$80 + inx + sbc #$80 + nop + xba + cpx: L0086 + sbc: L0086 + inc: L0086 + sbcl L0089 + beq L122A +L122A sbc (L0080),y + dfb $f2,$80 + sbc ($80,S),y + pea L0086 + sbc L0080,x + inc L0080,x + sbc [L0080],y + sed + sbc L0086,y + plx + xce + jsr (L0086,x) + sbc: L0086,x + inc: L0086,x + sbcl L0089,x + org $0080 +L0080 bit L0082 +L0082 bit L0082 + bit L0082 +L0086 bit: L0086 +L0089 ldal L0089 diff --git a/SourceGen/Symbol.cs b/SourceGen/Symbol.cs index c4f5e58..3ec391d 100644 --- a/SourceGen/Symbol.cs +++ b/SourceGen/Symbol.cs @@ -48,6 +48,18 @@ namespace SourceGen { Constant // constant value } + /// + /// Returns true if the symbol's 
type is an internal label (auto or user). Returns + /// false for external addresses and constants. + /// + public bool IsInternalLabel { + get { + return SymbolType == Type.LocalOrGlobalAddr || + SymbolType == Type.GlobalAddr || + SymbolType == Type.GlobalAddrExport; + } + } + /// /// Label sent to assembler. @@ -74,6 +86,7 @@ namespace SourceGen { /// public string SourceTypeString { get; private set; } + // No nullary constructor. private Symbol() { }