From 6ed5d51260b529fe459acc9d361c7d1fc3a5979b Mon Sep 17 00:00:00 2001
From: Karol Stasiak <karol.m.stasiak@gmail.com>
Date: Sat, 3 Mar 2018 01:21:57 +0100
Subject: [PATCH] Preliminary support for 65816, 65CE02 and HuC6280

---
 CHANGELOG.md                                  |   4 +
 README.md                                     |   4 +
 doc/abi/undocumented.md                       |  11 +-
 doc/api/command-line.md                       |  26 +-
 doc/api/target-platforms.md                   |  27 +-
 doc/lang/assembly.md                          |  31 +-
 doc/lang/literals.md                          |   4 +
 include/c64_scpu.ini                          |  21 +
 .../scala/millfork/CompilationOptions.scala   |  64 ++-
 src/main/scala/millfork/Main.scala            |  34 +-
 .../scala/millfork/OptimizationPresets.scala  |   1 +
 src/main/scala/millfork/Platform.scala        |  45 +-
 .../millfork/assembly/AssemblyLine.scala      | 317 +++++++++++---
 src/main/scala/millfork/assembly/Opcode.scala | 190 ++++++++-
 .../opt/AlwaysGoodOptimizations.scala         |  29 +-
 .../assembly/opt/CE02Optimizations.scala      |  20 +
 .../opt/ChangeIndexRegisterOptimization.scala |  23 +-
 .../assembly/opt/CmosOptimizations.scala      |  33 +-
 .../assembly/opt/CoarseFlowAnalyzer.scala     | 103 ++++-
 .../millfork/assembly/opt/FlowAnalyzer.scala  |   2 +-
 .../assembly/opt/HudsonOptimizations.scala    |  21 +
 .../millfork/assembly/opt/LoopUnrolling.scala |   8 +-
 .../assembly/opt/ReverseFlowAnalyzer.scala    |  93 ++++-
 .../opt/RuleBasedAssemblyOptimization.scala   | 143 +++++--
 .../assembly/opt/SixteenOptimizations.scala   | 195 +++++++++
 .../opt/VariableToRegisterOptimization.scala  | 390 ++++++++++++++----
 .../scala/millfork/compiler/BuiltIns.scala    | 127 ++++++
 .../compiler/ExpressionCompiler.scala         |  53 ++-
 .../millfork/compiler/StatementCompiler.scala |   2 +-
 src/main/scala/millfork/env/Thing.scala       |  39 +-
 src/main/scala/millfork/node/Node.scala       |   2 +-
 .../scala/millfork/output/Assembler.scala     | 230 +++++++++--
 .../millfork/output/InliningCalculator.scala  |   5 +-
 src/main/scala/millfork/parser/MfParser.scala |  42 +-
 .../scala/millfork/test/StackVarSuite.scala   |  14 +-
 .../scala/millfork/test/WordMathSuite.scala   |  18 +-
 .../test/emu/EmuCmosBenchmarkRun.scala        |   9 +
 .../test/emu/EmuOptimized65816Run.scala       |  21 +
 .../test/emu/EmuOptimized65CE02Run.scala      |  19 +
 .../test/emu/EmuOptimizedHudsonRun.scala      |  19 +
 src/test/scala/millfork/test/emu/EmuRun.scala |  10 +-
 .../millfork/test/emu/SymonTestRam.scala      |   3 +-
 42 files changed, 2119 insertions(+), 333 deletions(-)
 create mode 100644 include/c64_scpu.ini
 create mode 100644 src/main/scala/millfork/assembly/opt/CE02Optimizations.scala
 create mode 100644 src/main/scala/millfork/assembly/opt/HudsonOptimizations.scala
 create mode 100644 src/main/scala/millfork/assembly/opt/SixteenOptimizations.scala
 create mode 100644 src/test/scala/millfork/test/emu/EmuOptimized65816Run.scala
 create mode 100644 src/test/scala/millfork/test/emu/EmuOptimized65CE02Run.scala
 create mode 100644 src/test/scala/millfork/test/emu/EmuOptimizedHudsonRun.scala

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9939e508..4c3c3221 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 * **Breaking change!** Renamed `inline` to `macro`.
 
+* Added preliminary support for 65CE02, HuC6280 and 65816 processors.
+
 * Added new `-O1` optimization preset; old `-O1` became `-O2`, old `-O2` became `-O3` and so on.
 
 * Added support for parameters for macros written in Millfork.
@@ -18,6 +20,8 @@
 
 * Added return dispatch statements.
 
+* Added octal and quaternary literals.
+
 * Fixed several optimization bugs.
 
 * Fixed several C64 library bugs.
diff --git a/README.md b/README.md
index 819703ff..d6ef2c86 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,8 @@ For binary releases, see: https://github.com/KarolS/millfork/releases (latest: 0
 
     * Commodore 64 (the primary target)
     
+    * Commodore 64 with SuperCPU (experimental, incomplete and very buggy)
+    
     * Commodore 16 and Plus/4
     
     * Commodore 128
@@ -47,3 +49,5 @@ For binary releases, see: https://github.com/KarolS/millfork/releases (latest: 0
 * multi-part programs
 
 * more targets: Famicon/NES, BBC Micro/Electron, Oric computers, PC-Engine/Turbografx-16, Atari Lynx
+ 
+* support for 65816, targeting SuperCPU, SuperFamicom/SNES and Apple IIgs 
diff --git a/doc/abi/undocumented.md b/doc/abi/undocumented.md
index 5db05b70..8a5c1e6e 100644
--- a/doc/abi/undocumented.md
+++ b/doc/abi/undocumented.md
@@ -30,13 +30,13 @@ Millfork supports multiple mnemonics per opcode. The default one is given first:
 
 * **RRA**
 
-* **SAX**
+* **SAX**\*
 
 * **SHX**, XAS
 
-* **SHY**, SAY
+* **SHY**, SAY\*
 
-* **SBX**, AXS\*
+* **SBX**, AXS\*\*
 
 * **SRE**, LSE
 
@@ -46,7 +46,10 @@ Millfork supports multiple mnemonics per opcode. The default one is given first:
 
 * **XAA**, ANE
 
-\* AXS is also used for SAX in some assemblers, but Millfork always interprets AXS as a synonym for SBX
+\* HuC6280 has different instructions also called SAX and SAY, 
+but Millfork can distinguish between them and the NMOS illegal instructions based on the addressing mode.
+
+\*\* AXS is also used for SAX in some assemblers. Millfork interprets AXS based on the addressing mode.
 
 ## Generation
 
diff --git a/doc/api/command-line.md b/doc/api/command-line.md
index 317c92e9..baadc554 100644
--- a/doc/api/command-line.md
+++ b/doc/api/command-line.md
@@ -32,16 +32,34 @@
 ## Code generation options
 
 * `-fcmos-ops`, `-fno-cmos-ops` – Whether should emit CMOS opcodes.  
-`.ini` equivalent: `emit_cmos`. Default: yes if targeting 65C02, no otherwise.
+`.ini` equivalent: `emit_cmos`.
+Default: yes if targeting a 65C02-compatible architecture, no otherwise.
 
 * `-fillegals`, `-fno-illegals` – Whether should emit illegal (undocumented) NMOS opcodes.  
-`.ini` equivalent: `emit_illegals`. Default: no.
+`.ini` equivalent: `emit_illegals`.
+Default: no.
+
+* `-f65ce02-ops`, `-fno-65ce02-ops` – Whether should emit 65CE02 opcodes.  
+`.ini` equivalent: `emit_65ce02`. 
+Default: yes if targeting 65CE02, no otherwise.
+
+* `-fhuc6280-ops`, `-fno-huc6280-ops` – Whether should emit HuC6280 opcodes.  
+`.ini` equivalent: `emit_huc6280`. 
+Default: yes if targeting HuC6280, no otherwise.
+
+* `-fno-65816-ops`, `-femulation-65816-ops`, `-fnative-65816-ops` – Which subset of 65816 instructions to support. 
+`-fnative-65816-ops` is required to use any 16-bit operations. 
+Currently, there is not much support in the compiler for the native mode.
+`.ini` equivalent: `emit_65816`. 
+Default: native if targeting 65816, no otherwise.
 
 * `-fjmp-fix`, `-fno-jmp-fix` – Whether should prevent indirect JMP bug on page boundary.  
-`.ini` equivalent: `prevent_jmp_indirect_bug`. Default: no if targeting 65C02, yes otherwise.
+`.ini` equivalent: `prevent_jmp_indirect_bug`.
+Default: no if targeting a 65C02-compatible architecture, yes otherwise.
                     
 * `-fdecimal-mode`, `-fno-decimal-mode` – Whether decimal mode should be available.  
-`.ini` equivalent: `decimal_mode`. Default: no if targeting Ricoh, yes otherwise.
+`.ini` equivalent: `decimal_mode`.
+Default: no if targeting Ricoh, yes otherwise.
 
 * `-fvariable-overlap`, `-fno-variable-overlap` – Whether variables should overlap if their scopes do not intersect.  
 Default: yes.
diff --git a/doc/api/target-platforms.md b/doc/api/target-platforms.md
index d90bd1e5..a1a41905 100644
--- a/doc/api/target-platforms.md
+++ b/doc/api/target-platforms.md
@@ -1,6 +1,6 @@
 # Target platforms
 
-Currently, Millfork supports creating disk- or tape-based programs for Commodore and Atari 8-bit computers, 
+Currently, Millfork supports creating disk- or tape-based programs for Commodore, Apple and Atari 8-bit computers, 
 but it may be expanded to support other 6502-based platforms in the future.
 
 ## Supported platforms
@@ -9,6 +9,8 @@ The following platforms are currently supported:
 
 * `c64` – Commodore 64
 
+* `c64_scpu` – Commodore 64 with SuperCPU (very buggy)
+
 * `c16` – Commodore 16
 
 * `plus4` – Commodore Plus/4
@@ -57,15 +59,21 @@ Every platform is defined in an `.ini` file with an appropriate name.
 
 * `arch` – CPU architecture. It defines which instructions are available. Available values: 
 
-    * `nmos`
+    * `nmos` (original 6502)
     
-    * `strict` (= NMOS without illegal instructions) 
+    * `strict` (NMOS without illegal instructions) 
     
-    * `ricoh` (= NMOS without decimal mode) 
+    * `ricoh` (Ricoh 2A03/2A07, NMOS without decimal mode) 
     
-    * `strictricoh`
+    * `strictricoh` (Ricoh 2A03/2A07 without illegal instructions)
     
-    * `cmos` (= 65C02)
+    * `cmos` (WDC 65C02 or 65SC02)
+    
+    * `65ce02` (CSG 65CE02; experimental)
+    
+    * `huc6280` (Hudson HuC6280; experimental)
+    
+    * `65816` (WDC 65816/65802; experimental; currently only programs that use only 16-bit addressing are supported)
 
 * `modules` – comma-separated list of modules that will be automatically imported
 
@@ -73,13 +81,16 @@ Every platform is defined in an `.ini` file with an appropriate name.
 
     * `emit_illegals` – whether the compiler should emit illegal instructions, default `false`
     
-    * `emit_cmos` – whether the compiler should emit CMOS instructions, default is `true` on `cmos` and `false` elsewhere
+    * `emit_cmos` – whether the compiler should emit CMOS instructions, default is `true` on compatible processors and `false` elsewhere
+
+    * `emit_65816` – which 65816 instructions should the compiler emit, either `no`, `emulation` or `native` 
     
     * `decimal_mode` – whether the compiler should emit decimal instructions, default is `false` on `ricoh` and `strictricoh` and `true` elsewhere
     
     * `ro_arrays` – whether the compiler should warn upon array writes, default is `false`
     
-    * `prevent_jmp_indirect_bug` – whether the compiler should try to avoid the indirect JMP bug, default is `false` on `cmos` and `true` elsewhere
+    * `prevent_jmp_indirect_bug` – whether the compiler should try to avoid the indirect JMP bug, 
+    default is `false` on 65C02-compatible processors and `true` elsewhere
 
 #### `[allocation]` section
 
diff --git a/doc/lang/assembly.md b/doc/lang/assembly.md
index a1d88299..f499d164 100644
--- a/doc/lang/assembly.md
+++ b/doc/lang/assembly.md
@@ -11,7 +11,8 @@ There are two ways to include raw assembly code in your Millfork programs:
 Millfork inline assembly uses the same three-letter opcodes as most other 6502 assemblers.
 Indexing syntax is also the same. Only instructions available on the current CPU architecture are available.
 
-Currently, `RMBx`/`SMBx`/`BBRx`/`BBSx` are not supported yet.
+**Work in progress**: 
+Currently, `RMBx`/`SMBx`/`BBRx`/`BBSx` and some extra 65CE02/HuC6280/65816 instructions are not supported yet.
 
 Undocumented instructions are supported using various opcodes
 
@@ -136,4 +137,32 @@ it should abide to the following rules:
 
 * end non-inline assembly functions with `RTS`, `JMP` or `RTI` as appropriate
 
+* on NMOS 6502:
+
+    * don't use `XAA`, `LXA`, `AHX`, `SHX`, `SHY`, `LAS` and `TAS` instructions
+
+* on 65816:
+
+    * keep the direct page register set to $0000
+    
+    * keep the M and X flags set to 1 (8-bit registers by default, native mode) 
+    
+    * if running in the native mode, be careful with the stack pointer (you should keep it between $000100 and $0001FF)
+    
+    * do not change the data page register (keep an eye on the `PLD`, `MVN`, `MVP` instructions)
+    
+    * explicitly use 16-bit immediate operands when appropriate; the assembler doesn't track flags and assumes 8-bit immediates by default
+    
+    * use far jumps unless you're sure that the called function returns with an `RTS`  
+    
+* on 65CE02:
+
+    * keep the `B` register set to $00
+    
+    * don't change the `E` flag
+    
+* on HuC6280:
+
+    * don't use the `SET` instruction
+
 The above list is not exhaustive.
diff --git a/doc/lang/literals.md b/doc/lang/literals.md
index b0d344db..d340df8c 100644
--- a/doc/lang/literals.md
+++ b/doc/lang/literals.md
@@ -6,6 +6,10 @@ Decimal: `1`, `10`
 
 Binary: `%0101`, `0b101001`
 
+Quaternary: `0q2131`
+
+Octal: `0o172`
+
 Hexadecimal: `$D323`, `0x2a2`
 
 ## String literals
diff --git a/include/c64_scpu.ini b/include/c64_scpu.ini
new file mode 100644
index 00000000..4029e560
--- /dev/null
+++ b/include/c64_scpu.ini
@@ -0,0 +1,21 @@
+; Commodore 64 with a SuperCPU
+; assuming a program loaded from disk or tape
+
+[compilation]
+arch=65816
+modules=c64_hardware,loader_0801,c64_kernal,c64_panic,stdlib
+emit_65816=emulation
+
+[allocation]
+main_org=$80D
+zp_pointers=$C1,$C3,$FB,$FD,$39,$3B,$3D,$43,$45,$47,$4B
+himem_style=per_bank
+himem_start=after_code
+himem_end=$9FFF
+
+[output]
+style=per_bank
+format=startaddr,allocated
+extension=prg
+
+
diff --git a/src/main/scala/millfork/CompilationOptions.scala b/src/main/scala/millfork/CompilationOptions.scala
index ce80ef56..3153685b 100644
--- a/src/main/scala/millfork/CompilationOptions.scala
+++ b/src/main/scala/millfork/CompilationOptions.scala
@@ -21,17 +21,46 @@ case class CompilationOptions(platform: Platform, commandLineFlags: Map[Compilat
       ErrorReporting.warn("Decimal mode enabled for Ricoh architecture", this)
     }
   }
-  if (platform.cpu != Cmos) {
+  if (platform.cpu == Sixteen) {
+    if (flags(LargeCode)) {
+      ErrorReporting.warn("Large code model doesn't work correctly yet", this)
+    }
+  }
+  if (platform.cpu != Sixteen) {
+    if (flags(LargeCode)) {
+      ErrorReporting.error("Cannot use large code model on architectures other than 65816")
+    }
+    if (flags(ReturnWordsViaAccumulator)) {
+      ErrorReporting.error("Cannot return words via accumulator on non-65816 architecture")
+    }
+    if (flags(EmitNative65816Opcodes) || flags(EmitEmulation65816Opcodes)) {
+      ErrorReporting.error("65816 opcodes enabled for non-65816 architecture")
+    }
+  }
+  if (platform.cpu != CE02) {
+    if (flags(Emit65CE02Opcodes)) {
+      ErrorReporting.error("65CE02 opcodes enabled for non-65CE02 architecture")
+    }
+  }
+  if (flags(Emit65CE02Opcodes)) {
+    ErrorReporting.warn("65CE02 opcodes are highly experimental", this)
+  }
+  if (platform.cpu != HuC6280) {
+    if (flags(EmitHudsonOpcodes)) {
+      ErrorReporting.error("HuC6280 opcodes enabled for non-HuC6280 architecture")
+    }
+  }
+  if (!CmosCompatible(platform.cpu)) {
     if (!flags(PreventJmpIndirectBug)) {
       ErrorReporting.warn("JMP bug prevention should be enabled for non-CMOS architecture", this)
     }
     if (flags(EmitCmosOpcodes)) {
-      ErrorReporting.warn("CMOS opcodes enabled for non-CMOS architecture", this)
+      ErrorReporting.error("CMOS opcodes enabled for non-CMOS architecture")
     }
   }
   if (flags(EmitIllegals)) {
-    if (platform.cpu == Cmos) {
-      ErrorReporting.warn("Illegal opcodes enabled for CMOS architecture", this)
+    if (CmosCompatible(platform.cpu)) {
+      ErrorReporting.error("Illegal opcodes enabled for architecture that doesn't support them")
     }
     if (platform.cpu == StrictRicoh || platform.cpu == StrictMos) {
       ErrorReporting.warn("Illegal opcodes enabled for strict architecture", this)
@@ -41,7 +70,9 @@ case class CompilationOptions(platform: Platform, commandLineFlags: Map[Compilat
 
 object Cpu extends Enumeration {
 
-  val Mos, StrictMos, Ricoh, StrictRicoh, Cmos = Value
+  val Mos, StrictMos, Ricoh, StrictRicoh, Cmos, HuC6280, CE02, Sixteen = Value
+
+  val CmosCompatible = Set(Cmos, HuC6280, CE02, Sixteen)
 
   import CompilationFlag._
 
@@ -50,7 +81,10 @@ object Cpu extends Enumeration {
     case Mos => Set(DecimalMode, PreventJmpIndirectBug, VariableOverlap, CompactReturnDispatchParams)
     case Ricoh => Set(PreventJmpIndirectBug, VariableOverlap, CompactReturnDispatchParams)
     case StrictRicoh => Set(PreventJmpIndirectBug, VariableOverlap, CompactReturnDispatchParams)
-    case Cmos => Set(EmitCmosOpcodes, VariableOverlap, CompactReturnDispatchParams)
+    case Cmos => Set(DecimalMode, EmitCmosOpcodes, VariableOverlap, CompactReturnDispatchParams)
+    case HuC6280 => Set(DecimalMode, EmitCmosOpcodes, EmitHudsonOpcodes, VariableOverlap, CompactReturnDispatchParams)
+    case CE02 => Set(DecimalMode, EmitCmosOpcodes, Emit65CE02Opcodes, VariableOverlap, CompactReturnDispatchParams)
+    case Sixteen => Set(DecimalMode, EmitCmosOpcodes, EmitEmulation65816Opcodes, EmitNative65816Opcodes, ReturnWordsViaAccumulator, VariableOverlap, CompactReturnDispatchParams)
   }
 
   def fromString(name: String): Cpu.Value = name match {
@@ -59,21 +93,33 @@ object Cpu extends Enumeration {
     case "6510" => Mos
     case "strict" => StrictMos
     case "cmos" => Cmos
+    case "65sc02" => Cmos
+    case "sc02" => Cmos
     case "65c02" => Cmos
+    case "c02" => Cmos
+    case "hudson" => HuC6280
+    case "huc6280" => HuC6280
+    case "c6280" => HuC6280
+    case "6280" => HuC6280
+    case "65ce02" => CE02
+    case "ce02" => CE02
+    case "65816" => Sixteen
+    case "816" => Sixteen
     case "ricoh" => Ricoh
     case "2a03" => Ricoh
     case "2a07" => Ricoh
     case "strictricoh" => StrictRicoh
     case "strict2a03" => StrictRicoh
     case "strict2a07" => StrictRicoh
-    case _ => ErrorReporting.fatal("Unknown CPU achitecture")
+    case _ => ErrorReporting.fatal("Unknown CPU architecture: " + name)
   }
 }
 
 object CompilationFlag extends Enumeration {
   val
   // compilation options:
-  EmitIllegals, EmitCmosOpcodes, DecimalMode, ReadOnlyArrays, PreventJmpIndirectBug,
+  EmitIllegals, EmitCmosOpcodes, EmitCmosNopOpcodes, EmitHudsonOpcodes, Emit65CE02Opcodes, EmitEmulation65816Opcodes, EmitNative65816Opcodes,
+  DecimalMode, ReadOnlyArrays, PreventJmpIndirectBug, LargeCode, ReturnWordsViaAccumulator,
   // optimization options:
   DetailedFlowAnalysis, DangerousOptimizations, InlineFunctions, OptimizeForSize, OptimizeForSpeed, OptimizeForSonicSpeed,
   // memory allocation options
@@ -90,6 +136,8 @@ object CompilationFlag extends Enumeration {
   val fromString = Map(
     "emit_illegals" -> EmitIllegals,
     "emit_cmos" -> EmitCmosOpcodes,
+    "emit_65ce02" -> Emit65CE02Opcodes,
+    "emit_huc6280" -> EmitHudsonOpcodes,
     "decimal_mode" -> DecimalMode,
     "ro_arrays" -> ReadOnlyArrays,
     "ror_warn" -> RorWarning,
diff --git a/src/main/scala/millfork/Main.scala b/src/main/scala/millfork/Main.scala
index f4aea1a6..f673e2a5 100644
--- a/src/main/scala/millfork/Main.scala
+++ b/src/main/scala/millfork/Main.scala
@@ -65,7 +65,10 @@ object Main {
       "c64"
     })
     val options = CompilationOptions(platform, c.flags)
-    ErrorReporting.debug("Effective flags: " + options.flags)
+    ErrorReporting.debug("Effective flags: ")
+    options.flags.toSeq.sortBy(_._1).foreach{
+      case (f, b) => ErrorReporting.debug(f"    $f%-30s : $b%s")
+    }
 
     val output = c.outputFileName.getOrElse("a")
     val assOutput = output + ".asm"
@@ -94,7 +97,11 @@ object Main {
       case _ =>
         val extras = List(
           if (options.flag(CompilationFlag.EmitIllegals)) UndocumentedOptimizations.All else Nil,
+          if (options.flag(CompilationFlag.Emit65CE02Opcodes)) CE02Optimizations.All else Nil,
           if (options.flag(CompilationFlag.EmitCmosOpcodes)) CmosOptimizations.All else LaterOptimizations.Nmos,
+          if (options.flag(CompilationFlag.EmitHudsonOpcodes)) HudsonOptimizations.All else Nil,
+          if (options.flag(CompilationFlag.EmitEmulation65816Opcodes)) SixteenOptimizations.AllForEmulation else Nil,
+          if (options.flag(CompilationFlag.EmitNative65816Opcodes)) SixteenOptimizations.AllForNative else Nil, // native-mode rules are gated on the native flag, not the emulation one
           if (options.flag(CompilationFlag.DangerousOptimizations)) DangerousOptimizations.All else Nil,
         ).flatten
         val goodCycle = List.fill(optLevel - 2)(OptimizationPresets.Good).flatten
@@ -191,9 +198,32 @@ object Main {
     boolean("-fcmos-ops", "-fno-cmos-ops").action { (c, v) =>
       c.changeFlag(CompilationFlag.EmitCmosOpcodes, v)
     }.description("Whether should emit CMOS opcodes.")
+    boolean("-f65ce02-ops", "-fno-65ce02-ops").action { (c, v) =>
+      c.changeFlag(CompilationFlag.Emit65CE02Opcodes, v)
+    }.description("Whether should emit 65CE02 opcodes.")
+    boolean("-fhuc6280-ops", "-fno-huc6280-ops").action { (c, v) =>
+      c.changeFlag(CompilationFlag.EmitHudsonOpcodes, v)
+    }.description("Whether should emit HuC6280 opcodes.")
+    flag("-fno-65816-ops").action { c =>
+      c.changeFlag(CompilationFlag.EmitEmulation65816Opcodes, b = false)
+      c.changeFlag(CompilationFlag.EmitNative65816Opcodes, b = false)
+      c.changeFlag(CompilationFlag.ReturnWordsViaAccumulator, b = false)
+    }.description("Don't emit 65816 opcodes.")
+    flag("-femulation-65816-ops").action { c =>
+      c.changeFlag(CompilationFlag.EmitEmulation65816Opcodes, b = true)
+      c.changeFlag(CompilationFlag.EmitNative65816Opcodes, b = false)
+      c.changeFlag(CompilationFlag.ReturnWordsViaAccumulator, b = false)
+    }.description("Emit 65816 opcodes (experimental).")
+    flag("-fnative-65816-ops").action { c =>
+      c.changeFlag(CompilationFlag.EmitEmulation65816Opcodes, b = true)
+      c.changeFlag(CompilationFlag.EmitNative65816Opcodes, b = true)
+    }.description("Emit 65816 opcodes (experimental).")
+    boolean("-flarge-code", "-fsmall-code").action { (c, v) =>
+      c.changeFlag(CompilationFlag.LargeCode, v)
+    }.description("Whether should use 24-bit or 16-bit jumps to subroutines (not yet implemented).").hidden()
     boolean("-fillegals", "-fno-illegals").action { (c, v) =>
       c.changeFlag(CompilationFlag.EmitIllegals, v)
-    }.description("Whether should emit illegal (undocumented) NMOS opcodes. Required -O2 or higher to have an effect.")
+    }.description("Whether should emit illegal (undocumented) NMOS opcodes. Requires -O2 or higher to have an effect.")
     boolean("-fjmp-fix", "-fno-jmp-fix").action { (c, v) =>
       c.changeFlag(CompilationFlag.PreventJmpIndirectBug, v)
     }.description("Whether should prevent indirect JMP bug on page boundary.")
diff --git a/src/main/scala/millfork/OptimizationPresets.scala b/src/main/scala/millfork/OptimizationPresets.scala
index 8128235a..252dd52a 100644
--- a/src/main/scala/millfork/OptimizationPresets.scala
+++ b/src/main/scala/millfork/OptimizationPresets.scala
@@ -173,6 +173,7 @@ object OptimizationPresets {
     AlwaysGoodOptimizations.ReverseFlowAnalysis,
     AlwaysGoodOptimizations.SimplifiableBitOpsSequence,
     AlwaysGoodOptimizations.SimplifiableCondition,
+    AlwaysGoodOptimizations.SimplifiableStackOperation,
     AlwaysGoodOptimizations.SmarterShiftingOfWords,
     AlwaysGoodOptimizations.SmarterShiftingBytes,
     AlwaysGoodOptimizations.UnconditionalJumpRemoval,
diff --git a/src/main/scala/millfork/Platform.scala b/src/main/scala/millfork/Platform.scala
index 3e26588b..49bca854 100644
--- a/src/main/scala/millfork/Platform.scala
+++ b/src/main/scala/millfork/Platform.scala
@@ -20,6 +20,7 @@ class Platform(
                 val allocator: VariableAllocator,
                 val org: Int,
                 val fileExtension: String,
+                var defaultCodeBank: Int = 0,
               )
 
 object Platform {
@@ -55,11 +56,35 @@ object Platform {
 
     val cs = conf.getSection("compilation")
     val cpu = Cpu.fromString(cs.get(classOf[String], "arch", "strict"))
-    val flagOverrides = CompilationFlag.fromString.flatMap { case (k, f) =>
-      cs.get(classOf[String], k, "").toLowerCase match {
+    val value65816 = cs.get(classOf[String], "emit_65816", "")
+    val flagOverrides = (value65816.toLowerCase match {
+      case "" => Nil
+      case "false" | "none" | "no" | "off" | "0" =>
+        List(
+          CompilationFlag.EmitEmulation65816Opcodes -> false,
+          CompilationFlag.EmitNative65816Opcodes -> false,
+          CompilationFlag.ReturnWordsViaAccumulator -> false)
+      case "emulation" =>
+        List(
+          CompilationFlag.EmitEmulation65816Opcodes -> true,
+          CompilationFlag.EmitNative65816Opcodes -> false,
+          CompilationFlag.ReturnWordsViaAccumulator -> false)
+      case "native" =>
+        List(
+          CompilationFlag.EmitEmulation65816Opcodes -> true,
+          CompilationFlag.EmitNative65816Opcodes -> true)
+      case _ =>
+        ErrorReporting.error(s"Unsupported `emit_65816` value: $value65816")
+        Nil
+    }).toMap ++ CompilationFlag.fromString.flatMap { case (k, f) =>
+      val value = cs.get(classOf[String], k, "")
+      value.toLowerCase match {
         case "" => None
-        case "false" | "off" | "0" => Some(f -> false)
-        case "true" | "on" | "1" => Some(f -> true)
+        case "false" | "off" | "no" | "0" => Some(f -> false)
+        case "true" | "on" | "yes" | "1" => Some(f -> true)
+        case _ =>
+          ErrorReporting.error(s"Unsupported `$k` value: $value")
+          None
       }
     }
     val startingModules = cs.get(classOf[String], "modules", "").split("[, ]+").filter(_.nonEmpty).toList
@@ -104,10 +129,22 @@ object Platform {
       Integer.parseInt(s.substring(1), 16)
     } else if (s.startsWith("0x")) {
       Integer.parseInt(s.substring(2), 16)
+    } else if (s.startsWith("0X")) {
+      Integer.parseInt(s.substring(2), 16)
     } else if (s.startsWith("%")) {
       Integer.parseInt(s.substring(1), 2)
     } else if (s.startsWith("0b")) {
       Integer.parseInt(s.substring(2), 2)
+    } else if (s.startsWith("0B")) {
+      Integer.parseInt(s.substring(2), 2)
+    } else if (s.startsWith("0o")) {
+      Integer.parseInt(s.substring(2), 8)
+    } else if (s.startsWith("0O")) {
+      Integer.parseInt(s.substring(2), 8)
+    } else if (s.startsWith("0q")) {
+      Integer.parseInt(s.substring(2), 4)
+    } else if (s.startsWith("0Q")) {
+      Integer.parseInt(s.substring(2), 4)
     } else {
       s.toInt
     }
diff --git a/src/main/scala/millfork/assembly/AssemblyLine.scala b/src/main/scala/millfork/assembly/AssemblyLine.scala
index f96cf5d2..f11ab274 100644
--- a/src/main/scala/millfork/assembly/AssemblyLine.scala
+++ b/src/main/scala/millfork/assembly/AssemblyLine.scala
@@ -1,9 +1,7 @@
 package millfork.assembly
 
-import java.lang.management.MemoryType
-
+import millfork.{CompilationFlag, CompilationOptions}
 import millfork.assembly.Opcode._
-import millfork.assembly.opt.ReadsA
 import millfork.compiler.{CompilationContext, MfCompiler}
 import millfork.env._
 
@@ -11,148 +9,317 @@ import millfork.env._
 object OpcodeClasses {
 
   val ReadsAAlways = Set(
-    ADC, AND, BIT, CMP, EOR, ORA, PHA, SBC, STA, TAX, TAY,
-    SAX, SBX, ANC, DCP, ISC, RRA, RLA, SRE, SLO, LXA, XAA, AHX, TAS
+    ADC, AND, BIT, CMP, EOR, ORA, PHA, SBC, STA,
+    ADC_W, AND_W, BIT_W, CMP_W, EOR_W, ORA_W, PHA_W, SBC_W, STA_W,
+    TAX, TAY,
+    SAX, SBX, ANC, DCP, ISC, RRA, RLA, SRE, SLO, LXA, XAA, AHX, TAS,
+    TAZ, TAB,
+    HuSAX, SAY, TAM,
+    TCD, TCS, XBA,
+  )
+  val ReadsAIfImplied = Set(
+    ASL, LSR, ROL, ROR, INC, DEC,
+    DEC_W, INC_W, ROL_W, ROR_W, ASL_W, LSR_W,
+  )
+  val ReadsAHAlways = Set(
+    ADC_W, AND_W, BIT_W, CMP_W, EOR_W, ORA_W, PHA_W, SBC_W, STA_W,
+    TCD, TCS, XBA,
+  )
+  val ReadsAHIfImplied = Set(
+    DEC_W, INC_W, ROL_W, ROR_W, ASL_W, LSR_W,
   )
-  val ReadsAIfImplied = Set(ASL, LSR, ROL, ROR, INC, DEC)
   val ReadsXAlways = Set(
-    CPX, DEX, INX, STX, TXA, TXS, SBX,
-    PLX,
-    XAA, SAX, AHX, SHX, TAS
+    CPX, DEX, INX, STX,
+    CPX_W, DEX_W, INX_W, STX_W,
+    TXA, TXS, SBX,
+    PLX, PLX_W,
+    XAA, SAX, AHX, SHX, TAS,
+    HuSAX, SXY,
+    TXY,
   )
-  val ReadsYAlways = Set(CPY, DEY, INY, STY, TYA, PLY, SHY)
+  val ReadsYAlways = Set(CPY, DEY, INY, STY, TYA, PLY, SHY, SAY, SXY, TYX)
+  val ReadsIZAlways = Set(CPZ, DEZ, INZ, STZ, TZA, PLZ)
+  val ReadsM = Set(
+    ORA, AND, EOR, ADC, SBC, CMP, LDA, STA,
+    ORA_W, AND_W, EOR_W, ADC_W, SBC_W, CMP_W, LDA_W, STA_W,
+    STZ, BIT,
+    STZ_W, BIT_W,
+    PHA, PLA,
+    PHA_W, PLA_W,
+    DEC, INC, ROL, ROR, ASL, LSR,
+    DEC_W, INC_W, ROL_W, ROR_W, ASL_W, LSR_W,
+    TAX, TXA, TAY, TYA)
+  val ReadsW = Set(
+    LDX, LDY, CPX, CPY, STX, STY, INX, INY, DEX, DEY,
+    LDX_W, LDY_W, CPX_W, CPY_W, STX_W, STY_W, INX_W, INY_W, DEX_W, DEY_W,
+    PLX, PLY, PHX, PHY,
+    PLX_W, PLY_W, PHX_W, PHY_W,
+    TAX, TXA, TAY, TYA, TXY, TYX)
   val ReadsZ = Set(BNE, BEQ, PHP)
   val ReadsN = Set(BMI, BPL, PHP)
   val ReadsNOrZ = ReadsZ ++ ReadsN
   val ReadsV = Set(BVS, BVC, PHP)
-  val ReadsD = Set(PHP, ADC, SBC, RRA, ARR, ISC, DCP) // TODO: ??
+  val ReadsD = Set(PHP, ADC, SBC, RRA, ARR, ISC, ADC_W, SBC_W)
   val ReadsC = Set(
-    PHP, ADC, SBC, BCC, BCS, ROL, ROR,
-    ALR, ARR, ISC, RLA, RRA, SLO, SRE // TODO: ??
+    PHP, BCC, BCS,
+    ADC, SBC, ROL, ROR,
+    ADC_W, SBC_W, ROL_W, ROR_W,
+    ALR, ARR, ISC, RLA, RRA, SLO, SRE,
+    XCE
   )
+
   val ChangesAAlways = Set(
-    TXA, TYA, PLA,
+    TXA, TYA, PLA, PLA_W,
     ORA, AND, EOR, ADC, LDA, SBC,
+    ORA_W, AND_W, EOR_W, ADC_W, LDA_W, SBC_W,
     SLO, RLA, SRE, RRA, LAX, ISC,
-    XAA, ANC, ALR, ARR, LXA, LAS
+    XAA, ANC, ALR, ARR, LXA, LAS,
+    TZA, NEG,
+    TMA,
+    XBA, TDC,
   )
-  val ChangesAIfImplied = Set(ASL, LSR, ROL, ROR, INC, DEC)
+
+  val ChangesAIfImplied = Set(
+    ASL, LSR, ROL, ROR, INC, DEC,
+    ASL_W, LSR_W, ROL_W, ROR_W, INC_W, DEC_W,
+  )
+
+  val ChangesAHAlways = Set(
+    PLA_W,
+    ORA_W, AND_W, EOR_W, ADC_W, LDA_W, SBC_W,
+    XBA, TDC,
+  )
+
+  val ChangesAHIfImplied = Set(
+    ASL, LSR, ROL, ROR, INC, DEC,
+    ASL_W, LSR_W, ROL_W, ROR_W, INC_W, DEC_W,
+  )
+
   val ChangesX = Set(
-    DEX, INX, TAX, LDX, TSX,
+    DEX, INX, LDX,
+    DEX_W, INX_W, LDX_W,
+    TAX, TSX,
     SBX, LAX, LXA, LAS,
-    PLX,
+    PLX, PLX_W,
+    TYX, SXY,
   )
   val ChangesY = Set(
-    DEY, INY, TAY, LDY
+    DEY, INY, LDY,
+    DEY_W, INY_W, LDY_W,
+    TAY,
+    PLY, PLY_W,
+    TXY, SXY,
+  )
+  val ChangesIZ = Set(
+    DEZ, INZ, TAZ, LDZ,
   )
   val ChangesS = Set(
-    PHA, PLA, PHP, PLP, TXS,
-    PHX, PHY, PLX, PLY, TAS, LAS
+    PHA, PLA,
+    PHA_W, PLA_W,
+    PHP, PLP, TXS,
+    PHX, PHY, PLX, PLY,
+    PHX_W, PHY_W, PLX_W, PLY_W,
+    TAS, LAS,
+    PHZ,
+    PHB, PHD, PHK, PLB, PLD, RTL,
+    PEA, PEI, PER,
+    XCE, TCS, TYS,
   )
   val ChangesMemoryAlways = Set(
     STA, STY, STZ, STX,
+    STA_W, STY_W, STZ_W, STX_W,
     TRB, TSB,
     SAX, DCP, ISC,
     SLO, RLA, SRE, RRA,
-    AHX, SHY, SHX, TAS, LAS
+    AHX, SHY, SHX, TAS, LAS,
+    COP,
   )
   val ChangesMemoryIfNotImplied = Set(
-    DEC, INC, ASL, ROL, LSR, ROR
+    DEC, INC, ASL, ROL, LSR, ROR,
+    DEC_W, INC_W, ASL_W, ROL_W, LSR_W, ROR_W,
   )
   val ReadsMemoryIfNotImpliedOrImmediate = Set(
-    LDY, CPX, CPY,
+    LDY, CPX, CPY, BIT,
+    LDY_W, CPX_W, CPY_W, BIT_W,
     ORA, AND, EOR, ADC, LDA, CMP, SBC,
+    ORA_W, AND_W, EOR_W, ADC_W, LDA_W, CMP_W, SBC_W,
     ASL, ROL, LSR, ROR, LDX, DEC, INC,
+    ASL_W, ROL_W, LSR_W, ROR_W, LDX_W, DEC_W, INC_W,
     SLO, RLA, SRE, RRA, LAX, DCP, ISC,
     LAS,
-    TRB, TSB
+    TRB, TSB,
+    TRB_W, TSB_W,
   )
+
+  val AccessesWordInMemory = Set(
+    LDA_W, LDX_W, LDY_W,
+    STA_W, STX_W, STY_W,
+    CMP_W, CPX_W, CPY_W,
+    DEC_W, INC_W, ASL_W, ROL_W, LSR_W, ROR_W,
+    ORA_W, AND_W, EOR_W, ADC_W, SBC_W,
+    TSB_W, TRB_W, BIT_W,
+    PHW,
+  )
+
+  val AccessesWordInMemoryAlwaysIfNotImplied = Set(
+  )
+
   val OverwritesA = Set(
-    LDA, PLA, TXA, TYA,
-    LAX, LAS
+    LDA, PLA,
+    LDA_W, PLA_W,
+    TXA, TYA,
+    LAX, LAS,
+    TBA, TZA,
+    HuSAX, SAY,
+    TDC, TSC,
+  )
+  val OverwritesAH = Set(
+    LDA_W, PLA_W,
+    TDC, TSC,
   )
   val OverwritesX = Set(
     TAX, LDX, TSX, PLX,
-    LAX, LAS
+    LAX, LAS,
+    TYX, HuSAX, SXY,
   )
   val OverwritesY = Set(
-    TAY, LDY, PLY
+    TAY, LDY, PLY,
+    TSY, TXY, SAY, SXY,
   )
-  val OverwritesC = Set(CLC, SEC, PLP)
+  val OverwritesIZ = Set(
+    TAZ, LDZ, PLZ,
+  )
+  val OverwritesC = Set(CLC, SEC, PLP, XCE)
   val OverwritesD = Set(CLD, SED, PLP)
   val OverwritesI = Set(CLI, SEI, PLP)
   val OverwritesV = Set(CLV, PLP)
   val ConcernsAAlways = ReadsAAlways ++ ChangesAAlways
+  val ConcernsAHAlways = ReadsAHAlways ++ ChangesAHAlways
   val ConcernsAIfImplied = ReadsAIfImplied ++ ChangesAIfImplied
+  val ConcernsAHIfImplied = ReadsAHIfImplied ++ ChangesAHIfImplied
   val ConcernsXAlways = ReadsXAlways | ChangesX
   val ConcernsYAlways = ReadsYAlways | ChangesY
+  val ConcernsIZAlways = ReadsIZAlways | ChangesIZ
 
   val ChangesStack = Set(
     PHA, PLA, PHP, PLP,
     PHX, PLX, PHY, PLY,
+    PHA_W, PLA_W,
+    PHX_W, PLX_W, PHY_W, PLY_W,
     TXS,
     JSR, RTS, RTI,
     TAS, LAS,
+    PHW, PHZ, PLZ,
+    TYS, TCS,
+    RTL, BSR,
+    PHB, PHD, PHK, PLB, PLD,
+    PEA, PEI, PER,
+    XCE,
   )
 
-  val ConcernsStack = ChangesStack + TSX
+  val ConcernsStackAlways = ChangesStack ++ Set(TSX, TSY, TSC)
+  val ConcernsS = ChangesS ++ Set(TSX, TSY, TSC)
 
   val ChangesNAndZ = Set(
     ADC, AND, ASL, BIT, CMP, CPX, CPY, DEC, DEX, DEY, EOR, INC, INX, INY, LDA,
+    ADC_W, AND_W, ASL_W, BIT_W, CMP_W, CPX_W, CPY_W, DEC_W, DEX_W, DEY_W, EOR_W, INC_W, INX_W, INY_W, LDA_W,
     LDX, LDY, LSR, ORA, PLP, ROL, ROR, SBC, TAX, TAY, TXA, TYA,
+    LDX_W, LDY_W, LSR_W, ORA_W, ROL_W, ROR_W, SBC_W,
     LAX, SBX, ANC, ALR, ARR, DCP, ISC, RLA, RRA, SLO, SRE, SAX,
-    TSB, TRB // These two do not change N, but lets pretend they do for simplicity
+    TSB, TRB, // These two do not change N, but let's pretend they do for simplicity
+    TSB_W, TRB_W,
+    NEG, ASR,
+    CPZ, DEZ, INZ, LDZ,
+    REP, SEP, // People usually don't use these to change N or Z, but let's assume they do
   )
   val ChangesC = Set(
-    CLC, SEC, ADC, ASL, CMP, CPX, CPY, LSR, PLP, ROL, ROR, SBC,
-    SBX, ANC, ALR, ARR, DCP, ISC, RLA, RRA, SLO, SRE
+    PLP, CLC, SEC,
+    ADC, ASL, CMP, CPX, CPY, LSR, ROL, ROR, SBC,
+    ADC_W, ASL_W, CMP_W, CPX_W, CPY_W, LSR_W, ROL_W, ROR_W, SBC_W,
+    SBX, ANC, ALR, ARR, DCP, ISC, RLA, RRA, SLO, SRE,
+    CPZ, ASR,
+    XCE,
+    REP, SEP, // People usually don't use these to change C, but let's assume they do
   )
   val ChangesV = Set(
     ADC, BIT, PLP, SBC,
     ARR, ISC, RRA,
+    REP, SEP, // People usually don't use these to change V, but let's assume they do
   )
 
   val SupportsAbsoluteX = Set(
     ORA, AND, EOR, ADC, CMP, SBC,
+    ORA_W, AND_W, EOR_W, ADC_W, CMP_W, SBC_W,
     ASL, ROL, LSR, ROR, DEC, INC,
+    ASL_W, ROL_W, LSR_W, ROR_W, DEC_W, INC_W,
     SLO, RLA, SRE, RRA, DCP, ISC,
-    STA, LDA, LDY, STZ, SHY,
+    STA, LDA, LDY, STZ,
+    STA_W, LDA_W, LDY_W, STZ_W,
+    SHY,
   )
 
   val SupportsAbsoluteY = Set(
     ORA, AND, EOR, ADC, CMP, SBC,
+    ORA_W, AND_W, EOR_W, ADC_W, CMP_W, SBC_W,
     SLO, RLA, SRE, RRA, DCP, ISC,
     STA, LDA, LDX,
+    STA_W, LDA_W, LDX_W,
     LAX, AHX, SHX, TAS, LAS,
   )
 
   val SupportsAbsolute = Set(
     ORA, AND, EOR, ADC, STA, LDA, CMP, SBC,
+    ORA_W, AND_W, EOR_W, ADC_W, STA_W, LDA_W, CMP_W, SBC_W,
     ASL, ROL, LSR, ROR, STX, LDX, DEC, INC,
+    ASL_W, ROL_W, LSR_W, ROR_W, STX_W, LDX_W, DEC_W, INC_W,
     SLO, RLA, SRE, RRA, SAX, LAX, DCP, ISC,
     STY, LDY,
     BIT, JMP, JSR,
     STZ, TRB, TSB,
+    LDZ,
   )
 
-  val SupportsZeroPageIndirect = Set(ORA, AND, EOR, ADC, STA, LDA, CMP, SBC)
+  val SupportsIndexedZ = Set(
+    ORA, AND, EOR, ADC, STA, LDA, CMP, SBC,
+    ORA_W, AND_W, EOR_W, ADC_W, STA_W, LDA_W, CMP_W, SBC_W,
+  )
+
+  val SupportsLongIndexedZ = Set(
+    ORA, AND, EOR, ADC, STA, LDA, CMP, SBC,
+    ORA_W, AND_W, EOR_W, ADC_W, STA_W, LDA_W, CMP_W, SBC_W,
+  )
 
   val ShortConditionalBranching = Set(BEQ, BNE, BMI, BPL, BVC, BVS, BCC, BCS)
   val ShortBranching = ShortConditionalBranching + BRA
-  val AllDirectJumps = ShortBranching + JMP
+  val AllDirectJumps = ShortBranching ++ Set(JMP, BRL)
   val AllLinear = Set(
     ORA, AND, EOR,
+    ORA_W, AND_W, EOR_W,
     ADC, SBC, CMP, CPX, CPY,
+    ADC_W, SBC_W, CMP_W, CPX_W, CPY_W,
     DEC, DEX, DEY, INC, INX, INY,
+    DEC_W, DEX_W, DEY_W, INC_W, INX_W, INY_W,
     ASL, ROL, LSR, ROR,
+    ASL_W, ROL_W, LSR_W, ROR_W,
     LDA, STA, LDX, STX, LDY, STY,
+    LDA_W, STA_W, LDX_W, STX_W, LDY_W, STY_W,
     TAX, TXA, TAY, TYA, TXS, TSX,
     PLA, PLP, PHA, PHP,
-    BIT, NOP,
+    PLA_W, PHA_W,
+    BIT, BIT_W, NOP,
     CLC, SEC, CLD, SED, CLI, SEI, CLV,
     STZ, PHX, PHY, PLX, PLY, TSB, TRB,
+    STZ_W, PHX_W, PHY_W, PLX_W, PLY_W, TSB_W, TRB_W,
     SLO, RLA, SRE, RRA, SAX, LAX, DCP, ISC,
     ANC, ALR, ARR, XAA, LXA, SBX,
+    CPZ, LDZ, INZ, DEZ,
+    TAZ, TZA, TYS, TSY,
+    TBA,
+    PLZ, PHZ, PHW,
+    CLA, CLX, CLY,
+    CSH, CSL,
+    TXY, TYX, XBA,
+    PHD, PHB, PHK,
     DISCARD_AF, DISCARD_XF, DISCARD_YF)
 
   val NoopDiscardsFlags = Set(DISCARD_AF, DISCARD_XF, DISCARD_YF)
@@ -165,6 +332,11 @@ object OpcodeClasses {
 
 object AssemblyLine {
 
+  val accu8: AssemblyLine = AssemblyLine.immediate(SEP, 0x20) // SEP #$20: sets the M flag (8-bit accumulator)
+  val accu16: AssemblyLine = AssemblyLine.immediate(REP, 0x20) // REP #$20: clears the M flag (16-bit accumulator)
+  val index8: AssemblyLine = AssemblyLine.immediate(SEP, 0x10) // SEP #$10: sets the X flag, named W in State (8-bit index registers)
+  val index16: AssemblyLine = AssemblyLine.immediate(REP, 0x10) // REP #$10: clears the X flag, named W in State (16-bit index registers)
+
   def treatment(lines: List[AssemblyLine], state: State.Value): Treatment.Value =
     lines.map(_.treatment(state)).foldLeft(Treatment.Unchanged)(_ ~ _)
 
@@ -216,7 +388,12 @@ object AssemblyLine {
         case v@RelativeVariable(_, _, _, true) =>
           List(AssemblyLine.zeropage(opcode, v.toAddress + offset))
         case v: VariableInMemory => List(AssemblyLine.absolute(opcode, v.toAddress + offset))
-        case v: StackVariable => List(AssemblyLine.implied(TSX), AssemblyLine.absoluteX(opcode, v.baseOffset + offset + ctx.extraStackOffset))
+        case v: StackVariable =>
+          if (ctx.options.flag(CompilationFlag.EmitEmulation65816Opcodes)) {
+            List(AssemblyLine.stackRelative(opcode, v.baseOffset + offset + ctx.extraStackOffset))
+          } else {
+            List(AssemblyLine.implied(TSX), AssemblyLine.absoluteX(opcode, v.baseOffset + offset + ctx.extraStackOffset))
+          }
       }
     }
 
@@ -229,6 +406,10 @@ object AssemblyLine {
   def absolute(opcode: Opcode.Value, addr: Constant) =
     AssemblyLine(opcode, AddrMode.Absolute, addr)
 
+  // Picks LongAbsolute for things reported as far by `isFar`, plain Absolute otherwise; the address is the same.
+  def absoluteOrLongAbsolute(opcode: Opcode.Value, thing: ThingInMemory, options: CompilationOptions) =
+    AssemblyLine(opcode, if (thing.isFar(options)) AddrMode.LongAbsolute else AddrMode.Absolute, thing.toAddress)
+
   def absolute(opcode: Opcode.Value, thing: ThingInMemory, offset: Int = 0) =
     AssemblyLine(opcode, AddrMode.Absolute, thing.toAddress + offset)
 
@@ -258,6 +439,9 @@ object AssemblyLine {
 
   def indexedY(opcode: Opcode.Value, thing: ThingInMemory, offset: Int = 0) =
     AssemblyLine(opcode, AddrMode.IndexedY, thing.toAddress + offset)
+
+  def stackRelative(opcode: Opcode.Value, addr: Int) =
+    AssemblyLine(opcode, AddrMode.Stack, NumericConstant(addr & 0xff, 1)) // Stack mode operand; the offset is masked to its low byte
 }
 
 case class AssemblyLine(opcode: Opcode.Value, addrMode: AddrMode.Value, var parameter: Constant, elidable: Boolean = true) {
@@ -270,43 +454,65 @@ case class AssemblyLine(opcode: Opcode.Value, addrMode: AddrMode.Value, var para
 
   def reads(state: State.Value): Boolean = state match {
     case A => if (addrMode == Implied) ReadsAIfImplied(opcode) else ReadsAAlways(opcode)
-    case X => addrMode == AbsoluteX || addrMode == ZeroPageX || addrMode == IndexedX || ReadsXAlways(opcode)
-    case Y => addrMode == AbsoluteY || addrMode == ZeroPageY || addrMode == IndexedY || ReadsYAlways(opcode)
+    case AH => if (addrMode == Implied) ReadsAHIfImplied(opcode) else ReadsAHAlways(opcode)
+    case X => addrMode == AbsoluteX || addrMode == LongAbsoluteX || addrMode == ZeroPageX || addrMode == IndexedX || ReadsXAlways(opcode)
+    case Y => addrMode == AbsoluteY || addrMode == ZeroPageY || addrMode == IndexedY || addrMode == LongIndexedY || ReadsYAlways(opcode)
     case C => ReadsC(opcode)
     case D => ReadsD(opcode)
     case N => ReadsN(opcode)
     case V => ReadsV(opcode)
     case Z => ReadsZ(opcode)
+    case IZ => addrMode == IndexedZ || ReadsIZAlways(opcode)
+    case M => ReadsM(opcode)
+    case W => ReadsW(opcode)
   }
 
   def treatment(state: State.Value): Treatment.Value = opcode match {
     case LABEL => Unchanged // TODO: ???
     case NOP => Unchanged
-    case JSR | JMP | BEQ | BNE | BMI | BPL | BRK | BCC | BVC | BCS | BVS => Changed
+    case JSR | JMP | BEQ | BNE | BMI | BPL | BRK | BCC | BVC | BCS | BVS | BSR => Changed
     case CLC => if (state == C) Cleared else Unchanged
     case SEC => if (state == C) Set else Unchanged
     case CLV => if (state == V) Cleared else Unchanged
     case CLD => if (state == D) Cleared else Unchanged
     case SED => if (state == D) Set else Unchanged
+    case SEP => parameter match {
+      case NumericConstant(n, _) =>
+        if (isAffectedBySepRep(state, n)) Set else Unchanged
+      case _ => Changed
+    }
+    case REP => parameter match {
+      case NumericConstant(n, _) =>
+        if (isAffectedBySepRep(state, n)) Cleared else Unchanged
+      case _ => Changed
+    }
+    case XCE => Changed
     case _ => state match { // TODO: smart detection of constants
       case A =>
         if (ChangesAAlways(opcode) || addrMode == Implied && ChangesAIfImplied(opcode))
           Changed
         else
           Unchanged
+      case AH =>
+        if (ChangesAHAlways(opcode) || addrMode == Implied && ChangesAHIfImplied(opcode))
+          Changed
+        else
+          Unchanged
       case X => if (ChangesX(opcode)) Changed else Unchanged
       case Y => if (ChangesY(opcode)) Changed else Unchanged
+      case IZ => if (ChangesIZ(opcode)) Changed else Unchanged
       case C => if (ChangesC(opcode)) Changed else Unchanged
       case V => if (ChangesV(opcode)) Changed else Unchanged
       case N | Z => if (ChangesNAndZ(opcode)) Changed else Unchanged
-      case D => Unchanged
+      case W | M | D => Unchanged
     }
   }
 
   def sizeInBytes: Int = addrMode match {
     case Implied => 1
-    case Relative | ZeroPageX | ZeroPage | ZeroPageY | ZeroPageIndirect | IndexedX | IndexedY | Immediate => 2
-    case AbsoluteIndexedX | AbsoluteX | Absolute | AbsoluteY | Indirect => 3
+    case Relative | ZeroPageX | ZeroPage | ZeroPageY | IndexedZ | IndexedX | IndexedY | IndexedSY | Stack | LongIndexedY | LongIndexedZ | Immediate => 2
+    case AbsoluteIndexedX | AbsoluteX | Absolute | AbsoluteY | Indirect | LongRelative | WordImmediate => 3
+    case LongAbsolute | LongAbsoluteX | LongIndirect => 4
     case DoesNotExist => 0
   }
 
@@ -314,10 +520,17 @@ case class AssemblyLine(opcode: Opcode.Value, addrMode: AddrMode.Value, var para
     case Implied => 1000
     case Relative | Immediate => 2000
     case ZeroPage => 2001
-    case ZeroPageX | ZeroPageY => 2002
-    case IndexedX | IndexedY => 2003
-    case Absolute => 3000
-    case AbsoluteX | AbsoluteY | Indirect => 3001
+    case Stack | ZeroPageX | ZeroPageY => 2002
+    case IndexedX | IndexedY | IndexedZ => 2003
+    case IndexedSY | LongIndexedY | LongIndexedZ => 2004
+    case WordImmediate => 3000
+    case Absolute => 3001
+    case AbsoluteX | AbsoluteY | Indirect => 3002
+    case AbsoluteIndexedX => 3003
+    case LongAbsolute => 4000
+    case LongAbsoluteX => 4001
+    case LongIndirect => 4002
+    case TripleAbsolute => 7000
     case DoesNotExist => 1
   }
 
@@ -329,6 +542,10 @@ case class AssemblyLine(opcode: Opcode.Value, addrMode: AddrMode.Value, var para
     } else if (addrMode == DoesNotExist) {
       s"    ; $opcode"
     } else {
-      s"    $opcode ${AddrMode.addrModeToString(addrMode, parameter.toString)}"
+      val op = opcode match {
+        case HuSAX => "SAX"
+        case _ => opcode.toString
+      }
+      s"    $op ${AddrMode.addrModeToString(addrMode, parameter.toString)}"
     }
 }
diff --git a/src/main/scala/millfork/assembly/Opcode.scala b/src/main/scala/millfork/assembly/Opcode.scala
index 02c82bea..7b79cd48 100644
--- a/src/main/scala/millfork/assembly/Opcode.scala
+++ b/src/main/scala/millfork/assembly/Opcode.scala
@@ -6,7 +6,38 @@ import millfork.error.ErrorReporting
 import millfork.node.Position
 
 object State extends Enumeration {
-  val A, X, Y, Z, D, C, N, V = Value
+  val
+  // standard 6502 8-bit registers
+  A, X, Y,
+  // hi bytes of registers on 65816
+  AH, XH, YH,
+  // extra 65816 registers
+  DP, DBL, PB,
+  // extra register of both 65816 (it's called the high byte of DB) and 65CE02 (it's called B)
+  DBH,
+  // extra 65CE02 register
+  IZ,
+  // 8 extra HuC6280 MMU registers
+  MM,
+  // standard 6502 flags
+  Z, D, C, N, V,
+  // extra 65816 flags; W means X flag, not to confuse with the X register
+  E, M, W = Value
+
+  /** True when a SEP/REP with immediate mask `n` touches `state`: both instructions act on the
+    * status bits that are SET in the operand (SEP sets those flags, REP clears them). */
+  def isAffectedBySepRep(state: State.Value, n: Long): Boolean = {
+    state match {
+      case C => (n & 1) != 0
+      case Z => (n & 2) != 0
+      case D => (n & 8) != 0
+      case W | XH | YH => (n & 0x10) != 0 // XH/YH follow the index-width flag
+      case M => (n & 0x20) != 0
+      case V => (n & 0x40) != 0
+      case N => (n & 0x80) != 0
+      case _ => false
+    }
+  }
 }
 
 object Treatment extends Enumeration {
@@ -38,14 +69,92 @@ object Opcode extends Enumeration {
   SBC, SEC, SED, SEI, STA, STX, STY,
   TAX, TAY, TXA, TXS, TSX, TYA,
 
+  // illegals:
   LXA, XAA, ANC, ARR, ALR, SBX,
   LAX, SAX, RLA, RRA, SLO, SRE, DCP, ISC,
   TAS, LAS, SHX, SHY, AHX,
+
+  // 65C02:
   STZ, PHX, PHY, PLX, PLY,
   BRA, TRB, TSB, STP, WAI,
+  // BBR, BBS, RMB, SMB,
+
+  // 65CE02:
+  CPZ, LDZ, DEZ, INZ,
+  PHW,
+  // DEW, INW, ASW, ROW, // aliases for DEC_W, INC_W, ASL_W, ROL_W (?)
+  NEG, ASR,
+  TAZ, TZA, PHZ, PLZ,
+  TSY, TYS,
+  TAB, TBA,
+  // CLE, SEE,
+  BSR,
+  // MAP,
+
+  //HuC6280:
+  CLA, CLX, CLY,
+  CSH, CSL,
+  SAY, SXY, HuSAX,
+  // SET,
+  // ST0, ST1, ST2,
+  // BSR, // the same as on 65CE02
+  TAM, TMA,
+  // TAI, TIA, TDD, TIN, TII, // memcpy instructions
+  TST,
+
+  //65816:
+  BRL,
+  COP,
+  // MVN, MVP,
+  PEA, PEI, PER,
+  PHB, PHD, PHK, PLB, PLD, // there's no PLK for the same reason Intel removed POP CS from 80186
+  REP, SEP,
+  RTL,
+  TCD, TDC, TSC, TCS,
+  TXY, TYX, XBA,
+  XCE,
+  DEC_W, INC_W, ROL_W, ROR_W, ASL_W, LSR_W,
+  ORA_W, AND_W, EOR_W, ADC_W, LDA_W, STA_W, CMP_W, SBC_W, STZ_W, BIT_W, TRB_W, TSB_W,
+  LDX_W, LDY_W, STX_W, STY_W, CPX_W, CPY_W,
+  INX_W, INY_W, DEX_W, DEY_W,
+  PHA_W, PLA_W,
+  PHX_W, PHY_W, PLY_W, PLX_W,
+
   DISCARD_AF, DISCARD_XF, DISCARD_YF,
   LABEL = Value
 
+  /** 16-bit (`_W`) counterpart of an 8-bit opcode, or None when this table has no wide variant for it. */
+  def widen(opcode: Opcode.Value): Option[Opcode.Value] = opcode match {
+    case ORA => Some(ORA_W)
+    case AND => Some(AND_W)
+    case EOR => Some(EOR_W)
+    case ADC => Some(ADC_W)
+    case SBC => Some(SBC_W)
+    case CMP => Some(CMP_W)
+    case LDA => Some(LDA_W)
+    case STA => Some(STA_W)
+    case STZ => Some(STZ_W)
+    // index register operations
+    case LDX => Some(LDX_W)
+    case LDY => Some(LDY_W)
+    case STX => Some(STX_W)
+    case STY => Some(STY_W)
+    case INX => Some(INX_W)
+    case INY => Some(INY_W)
+    case DEX => Some(DEX_W)
+    case DEY => Some(DEY_W)
+    case CPX => Some(CPX_W)
+    case CPY => Some(CPY_W)
+    // increments, decrements, shifts and rotates
+    case INC => Some(INC_W)
+    case DEC => Some(DEC_W)
+    case ROL => Some(ROL_W)
+    case ROR => Some(ROR_W)
+    case ASL => Some(ASL_W)
+    case LSR => Some(LSR_W)
+    case _ => None
+  }
+
   def lookup(opcode: String, position: Option[Position]): Opcode.Value = opcode.toUpperCase(Locale.ROOT) match {
     case "ADC" => ADC
     case "AHX" => AHX
@@ -56,8 +165,10 @@ object Opcode extends Enumeration {
     case "ARR" => ARR
     case "ASL" => ASL
     case "ASO" => SLO
+    case "ASR" => ASR
+    case "ASW" => ASL_W
     case "AXA" => AHX
-    case "AXS" => SBX // TODO: could mean SAX
+    case "AXS" => SBX // could mean SAX
     case "BCC" => BCC
     case "BCS" => BCS
     case "BEQ" => BEQ
@@ -67,6 +178,8 @@ object Opcode extends Enumeration {
     case "BPL" => BPL
     case "BRA" => BRA
     case "BRK" => BRK
+    case "BRL" => BRL
+    case "BSR" => BSR
     case "BVC" => BVC
     case "BVS" => BVS
     case "CLC" => CLC
@@ -74,18 +187,24 @@ object Opcode extends Enumeration {
     case "CLI" => CLI
     case "CLV" => CLV
     case "CMP" => CMP
+    case "COP" => COP
     case "CPX" => CPX
     case "CPY" => CPY
+    case "CPZ" => CPZ
     case "DCM" => DCP
     case "DCP" => DCP
     case "DEC" => DEC
+    case "DEW" => DEC_W
     case "DEX" => DEX
     case "DEY" => DEY
+    case "DEZ" => DEZ
     case "EOR" => EOR
     case "INC" => INC
     case "INS" => ISC
+    case "INW" => INC_W
     case "INX" => INX
     case "INY" => INY
+    case "INZ" => INZ
     case "ISC" => ISC
     case "JMP" => JMP
     case "JSR" => JSR
@@ -94,33 +213,48 @@ object Opcode extends Enumeration {
     case "LDA" => LDA
     case "LDX" => LDX
     case "LDY" => LDY
+    case "LDZ" => LDZ
     case "LSE" => SRE
     case "LSR" => LSR
     case "LXA" => LXA
+    case "NEG" => NEG
     case "NOP" => NOP
     case "OAL" => LXA
     case "ORA" => ORA
+    case "PEA" => PEA
+    case "PEI" => PEI
+    case "PER" => PER
     case "PHA" => PHA
+    case "PHB" => PHB
+    case "PHD" => PHD
+    case "PHK" => PHK
     case "PHP" => PHP
+    case "PHW" => PHW
     case "PHX" => PHX
     case "PHY" => PHY
     case "PLA" => PLA
+    case "PLB" => PLB
+    case "PLD" => PLD
     case "PLP" => PLP
     case "PLX" => PLX
     case "PLY" => PLY
+    case "REP" => REP
     case "RLA" => RLA
     case "ROL" => ROL
     case "ROR" => ROR
+    case "ROW" => ROL_W // 65CE02 ROW rotates a word left, matching the ASW => ASL_W alias
     case "RRA" => RRA
     case "RTI" => RTI
+    case "RTL" => RTL
     case "RTS" => RTS
-    case "SAX" => SAX // TODO: could mean SBX
-    case "SAY" => SHY
+    case "SAX" => SAX // could mean SBX; also, HuC6280 has another SAX that means something else
+    case "SAY" => SAY // could mean SHY
     case "SBC" => SBC
     case "SBX" => SBX
     case "SEC" => SEC
     case "SED" => SED
     case "SEI" => SEI
+    case "SEP" => SEP
     case "SHX" => SHX
     case "SHY" => SHY
     case "SLO" => SLO
@@ -130,18 +264,36 @@ object Opcode extends Enumeration {
     case "STX" => STX
     case "STY" => STY
     case "STZ" => STZ
+    case "TAB" => TAB
+    case "TAM" => TAM
     case "TAS" => TAS
     case "TAX" => TAX
     case "TAY" => TAY
+    case "TAZ" => TAZ
+    case "TBA" => TBA
+    case "TCD" => TCD
+    case "TDC" => TDC
+    case "TCS" => TCS
+    case "TSC" => TSC
+    case "TMA" => TMA
     case "TRB" => TRB
     case "TSB" => TSB
     case "TSX" => TSX
+    case "TSY" => TSY
     case "TXA" => TXA
     case "TXS" => TXS
+    case "TXY" => TXY
     case "TYA" => TYA
+    case "TYS" => TYS
+    case "TYX" => TYX
+    case "TZA" => TZA
     case "WAI" => WAI
     case "XAA" => XAA
     case "XAS" => SHX
+    case "XBA" => XBA
+    case "XCE" => XCE
+
+      // TODO: add all of those
     case _ =>
       ErrorReporting.error(s"Invalid opcode `$opcode`", position)
       LABEL
@@ -152,39 +304,51 @@ object Opcode extends Enumeration {
 object AddrMode extends Enumeration {
   val Implied,
   Immediate,
+  WordImmediate,
   Relative,
+  LongRelative,
   ZeroPage,
   ZeroPageX,
   ZeroPageY,
   Absolute,
   AbsoluteX,
   AbsoluteY,
+  LongAbsolute,
+  LongAbsoluteX,
   Indirect,
+  LongIndirect,
   IndexedX,
   IndexedY,
+  IndexedSY,
+  IndexedZ,
+  Stack,
+  LongIndexedY,
+  LongIndexedZ,
   AbsoluteIndexedX,
-  ZeroPageIndirect,
+  TripleAbsolute,
   Undecided,
   DoesNotExist = Value
 
 
-  def argumentLength(a: AddrMode.Value): Int = a match {
-    case Absolute | AbsoluteX | AbsoluteY | Indirect =>
-      2
-    case _ =>
-      1
-  }
-
   def addrModeToString(am: AddrMode.Value, argument: String): String = {
     am match {
       case Implied => ""
       case Immediate => "#" + argument
+      case WordImmediate => "##" + argument
       case AbsoluteX | ZeroPageX => argument + ", X"
       case AbsoluteY | ZeroPageY => argument + ", Y"
       case IndexedX | AbsoluteIndexedX => "(" + argument + ", X)"
+      case Stack => argument + ", S"
       case IndexedY => "(" + argument + "), Y"
-      case Indirect | ZeroPageIndirect => "(" + argument + ")"
+      case IndexedSY => "(" + argument + ", S), Y"
+      case IndexedZ => "(" + argument + "), Z"
+      case Indirect => "(" + argument + ")"
+      case LongIndexedY => "[" + argument + "], Y"
+      case LongIndexedZ => "[" + argument + "], Z"
+      case LongIndirect => "[" + argument + "]"
       case ZeroPage => argument // + "\t;zp"
+      case LongAbsolute => "FAR " + argument
+      case LongAbsoluteX => "FAR " + argument + ", X"
       case _ => argument;
     }
   }
diff --git a/src/main/scala/millfork/assembly/opt/AlwaysGoodOptimizations.scala b/src/main/scala/millfork/assembly/opt/AlwaysGoodOptimizations.scala
index c5e83201..4553f2f1 100644
--- a/src/main/scala/millfork/assembly/opt/AlwaysGoodOptimizations.scala
+++ b/src/main/scala/millfork/assembly/opt/AlwaysGoodOptimizations.scala
@@ -5,7 +5,7 @@ import java.util.concurrent.atomic.AtomicInteger
 import millfork.assembly.AddrMode._
 import millfork.assembly.Opcode._
 import millfork.assembly.OpcodeClasses._
-import millfork.assembly._
+import millfork.assembly.{opt, _}
 import millfork.env._
 
 /**
@@ -745,6 +745,21 @@ object AlwaysGoodOptimizations {
     (Elidable & HasOpcodeIn(Set(CLV)) & DoesntMatterWhatItDoesWith(State.V)) ~~> (_ => Nil),
   )
 
+  val SimplifiableStackOperation = new RuleBasedAssemblyOptimization("Simplifiable stack operation",
+    needsFlowInfo = FlowInfoRequirement.BackwardFlow, // every rewrite below leaves a different X than TSX/INX/TXS, so X must be dead
+    (Elidable & HasOpcode(TSX)) ~
+      (Elidable & HasOpcode(INX)) ~
+      (Elidable & HasOpcode(TXS)) ~
+      (Elidable & HasOpcode(PHA) & DoesntMatterWhatItDoesWith(State.Z, State.N, State.X)) ~~> (_ => List(AssemblyLine.implied(TSX), AssemblyLine.absoluteX(STA, 0x101))),
+    (Elidable & HasOpcode(TSX)) ~
+      (Elidable & HasOpcode(INX)) ~
+      (Elidable & HasOpcode(TXS) & DoesntMatterWhatItDoesWith(State.Z, State.N, State.A, State.X)) ~~> (_ => List(AssemblyLine.implied(PLA))),
+    (Elidable & HasOpcode(TSX)) ~
+      (Elidable & HasOpcode(INX)) ~
+      (Elidable & HasOpcode(TXS)) ~
+      (ConcernsA & Not(ConcernsStack) & Linear & DoesntMatterWhatItDoesWith(State.Z, State.N, State.A, State.X)) ~~> (code => List(code.last, AssemblyLine.implied(PLA))),
+  )
+
   val SimplifiableBitOpsSequence = new RuleBasedAssemblyOptimization("Simplifiable sequence of bit operations",
     needsFlowInfo = FlowInfoRequirement.NoRequirement,
     (Elidable & HasOpcode(EOR) & MatchImmediate(0)) ~
@@ -909,7 +924,7 @@ object AlwaysGoodOptimizations {
     val jump = Elidable & HasOpcodeIn(Set(JMP, if (firstSet) BCS else BCC, if (zeroIfSet) BEQ else BNE)) & MatchParameter(1)
     val elseLabel = Elidable & HasOpcode(LABEL) & MatchParameter(0)
     val afterLabel = Elidable & HasOpcode(LABEL) & MatchParameter(1) & DoesntMatterWhatItDoesWith(State.C, State.N, State.V, State.Z)
-    val store = Elidable & (Not(ReadsC) & Linear | HasOpcodeIn(Set(RTS, JSR, RTI)))
+    val store = Elidable & (Not(ReadsC) & Linear | HasOpcodeIn(Set(RTS, JSR, RTI, RTL, BSR)))
     val secondReturn = (Elidable & HasOpcodeIn(Set(RTS, RTI) | NoopDiscardsFlags)).*.capture(6)
     val where = Where { ctx =>
       ctx.get[List[AssemblyLine]](4) == ctx.get[List[AssemblyLine]](5) ||
@@ -1366,7 +1381,7 @@ object AlwaysGoodOptimizations {
           first.head.parameter == second.head.parameter &&
           (first.head.addrMode == Immediate) == (second.head.addrMode == Immediate) && first.tail.zip(second.tail).forall(p => {
           p._1.opcode == p._2.opcode && p._1.parameter.quickSimplify == p._2.parameter.quickSimplify && (p._1.addrMode == Immediate) == (p._2.addrMode == Immediate)
-        }) && (for (s1 <- first; s2 <- between) yield HelperCheckers.memoryAccessDoesntOverlap(s1.addrMode, s1.parameter, s2.addrMode, s2.parameter)).forall(identity) && {
+        }) && (for (s1 <- first; s2 <- between) yield HelperCheckers.memoryAccessDoesntOverlap(s1, s2)).forall(identity) && {
           var currentD = false
           var currentCDefined = false
           var noAdditionDependency = true
@@ -1415,10 +1430,10 @@ object AlwaysGoodOptimizations {
         true
       }) ~
       (Linear & DoesNotConcernMemoryAt(3,4) & DoesNotConcernMemoryAt(3,5)).* ~
-      (Elidable & MatchParameter(6) & HasAddrModeIn(Set(ZeroPageIndirect, IndexedY))) ~~> { (code, ctx) =>
+      (Elidable & MatchParameter(6) & HasAddrModeIn(Set(IndexedZ, IndexedY))) ~~> { (code, ctx) =>
       val addr = ctx.get[Int](2)
       val last = code.last
-      code.init :+ last.copy(parameter = NumericConstant(addr, 2), addrMode = if (last.addrMode == ZeroPageIndirect) Absolute else AbsoluteY)
+      code.init :+ last.copy(parameter = NumericConstant(addr, 2), addrMode = if (last.addrMode == IndexedZ) Absolute else AbsoluteY)
     },
 
     (HasOpcode(STA) & MatchA(0) & HasAddrModeIn(Set(Absolute, ZeroPage)) & MatchParameter(4)) ~
@@ -1437,10 +1452,10 @@ object AlwaysGoodOptimizations {
         true
       }) ~
       (Linear & DoesNotConcernMemoryAt(3,4) & DoesNotConcernMemoryAt(3,5)).* ~
-      (Elidable & MatchParameter(6) & HasAddrModeIn(Set(ZeroPageIndirect, IndexedY))) ~~> { (code, ctx) =>
+      (Elidable & MatchParameter(6) & HasAddrModeIn(Set(IndexedZ, IndexedY))) ~~> { (code, ctx) =>
       val addr = ctx.get[Int](2)
       val last = code.last
-      code.init :+ last.copy(parameter = NumericConstant(addr, 2), addrMode = if (last.addrMode == ZeroPageIndirect) Absolute else AbsoluteY)
+      code.init :+ last.copy(parameter = NumericConstant(addr, 2), addrMode = if (last.addrMode == IndexedZ) Absolute else AbsoluteY)
     },
   )
 
diff --git a/src/main/scala/millfork/assembly/opt/CE02Optimizations.scala b/src/main/scala/millfork/assembly/opt/CE02Optimizations.scala
new file mode 100644
index 00000000..d8f9f02a
--- /dev/null
+++ b/src/main/scala/millfork/assembly/opt/CE02Optimizations.scala
@@ -0,0 +1,20 @@
+package millfork.assembly.opt
+
+import millfork.assembly.AddrMode._
+import millfork.assembly.AssemblyLine
+import millfork.assembly.Opcode._
+import millfork.assembly.OpcodeClasses._
+
+/**
+  * @author Karol Stasiak
+  */
+object CE02Optimizations {
+  // CMP #$80 compares the accumulator, so only an accumulator (Implied) ROR after it may become ASR A.
+  val UseAsr = new RuleBasedAssemblyOptimization("Use 65CE02 instruction ASR",
+    needsFlowInfo = FlowInfoRequirement.NoRequirement,
+    (Elidable & HasOpcode(CMP) & HasImmediate(0x80)) ~
+      (Elidable & HasOpcode(ROR) & HasAddrModeIn(Set(Implied))) ~~> (_ => List(AssemblyLine.implied(ASR))),
+  )
+
+  val All: List[AssemblyOptimization] = List(UseAsr)
+}
diff --git a/src/main/scala/millfork/assembly/opt/ChangeIndexRegisterOptimization.scala b/src/main/scala/millfork/assembly/opt/ChangeIndexRegisterOptimization.scala
index 3998e8cf..4ab61354 100644
--- a/src/main/scala/millfork/assembly/opt/ChangeIndexRegisterOptimization.scala
+++ b/src/main/scala/millfork/assembly/opt/ChangeIndexRegisterOptimization.scala
@@ -102,10 +102,15 @@ class ChangeIndexRegisterOptimization(preferX2Y: Boolean) extends AssemblyOptimi
     case AssemblyLine(_, AbsoluteY, _, _) :: xs if loaded != Some(Y) => false
     case AssemblyLine(_, ZeroPageY, _, _) :: xs if loaded != Some(Y) => false
     case AssemblyLine(_, IndexedY, _, _) :: xs if dir == Y2X || loaded != Some(Y) => false
+    case AssemblyLine(_, LongIndexedY, _, _) :: xs if dir == Y2X || loaded != Some(Y) => false
     case AssemblyLine(_, AbsoluteX, _, _) :: xs if loaded != Some(X) => false
+    case AssemblyLine(_, LongAbsoluteX, _, _) :: xs if loaded != Some(X) => false
     case AssemblyLine(_, ZeroPageX, _, _) :: xs if loaded != Some(X) => false
     case AssemblyLine(_, IndexedX, _, _) :: xs if dir == X2Y || loaded != Some(X) => false
     case AssemblyLine(_, AbsoluteIndexedX, _, _) :: xs if dir == X2Y => false
+    case AssemblyLine(SHX | SHY | AHX, _, _, _) :: xs => false
+    case AssemblyLine(TXY, _, _, e) :: xs => e && loaded == Some(X) && canOptimize(xs, dir, Some(Y))
+    case AssemblyLine(TYX, _, _, e) :: xs => e && loaded == Some(Y) && canOptimize(xs, dir, Some(X))
 
       // using a wrong index register for one instruction is fine
     case AssemblyLine(LDY | TAY, _, _, _) :: AssemblyLine(_, IndexedY, _, _) :: xs if dir == Y2X =>
@@ -121,13 +126,13 @@ class ChangeIndexRegisterOptimization(preferX2Y: Boolean) extends AssemblyOptimi
     case AssemblyLine(LDX | TAX, _, _, _) :: AssemblyLine(INX | DEX, _, _, _) :: AssemblyLine(INC | DEC | ASL | ROL | ROR | LSR | STZ, AbsoluteX | ZeroPageX, _, _) :: xs if dir == X2Y =>
       canOptimize(xs, dir, None)
 
-    case AssemblyLine(INC | DEC | ASL | ROL | ROR | LSR | STZ, AbsoluteX | ZeroPageX, _, _) :: xs if dir == X2Y => false
+    case AssemblyLine(INC | DEC | ASL | ROL | ROR | LSR | STZ | LDZ | BIT, AbsoluteX | ZeroPageX, _, _) :: xs if dir == X2Y => false
 
     case AssemblyLine(LAX, _, _, _) :: xs => false
-    case AssemblyLine(JSR, _, _, _) :: xs => false // TODO
-    case AssemblyLine(JMP, _, _, _) :: xs => canOptimize(xs, dir, None)
+    case AssemblyLine(JSR | BSR, _, _, _) :: xs => false // TODO
+    case AssemblyLine(JMP, Absolute, _, _) :: xs => canOptimize(xs, dir, None) // TODO
     case AssemblyLine(op, _, _, _) :: xs if OpcodeClasses.ShortBranching(op) => canOptimize(xs, dir, None)
-    case AssemblyLine(RTS, _, _, _) :: xs => canOptimize(xs, dir, None)
+    case AssemblyLine(RTS | RTL | BRA | BRL, _, _, _) :: xs => canOptimize(xs, dir, None)
     case AssemblyLine(LABEL, _, _, _) :: xs => canOptimize(xs, dir, None)
     case AssemblyLine(DISCARD_XF, _, _, _) :: xs => canOptimize(xs, dir, loaded.filter(_ != X))
     case AssemblyLine(DISCARD_YF, _, _, _) :: xs => canOptimize(xs, dir, loaded.filter(_ != Y))
@@ -137,9 +142,9 @@ class ChangeIndexRegisterOptimization(preferX2Y: Boolean) extends AssemblyOptimi
       (e || dir == Y2X) && canOptimize(xs, dir, Some(X))
     case AssemblyLine(TAY | LDY | PLY, _, _, e) :: xs =>
       (e || dir == X2Y) && canOptimize(xs, dir, Some(Y))
-    case AssemblyLine(TXA | STX | PHX | CPX | INX | DEX, _, _, e) :: xs =>
+    case AssemblyLine(TXA | STX | PHX | CPX | INX | DEX | HuSAX, _, _, e) :: xs =>
       (e || dir == Y2X) && loaded == Some(X) && canOptimize(xs, dir, Some(X))
-    case AssemblyLine(TYA | STY | PHY | CPY | INY | DEY, _, _, e) :: xs =>
+    case AssemblyLine(TYA | STY | PHY | CPY | INY | DEY | SAY, _, _, e) :: xs =>
       (e || dir == X2Y) && loaded == Some(Y) && canOptimize(xs, dir, Some(Y))
 
     case AssemblyLine(SAX | TXS | SBX, _, _, _) :: xs => dir == Y2X && loaded == Some(X) && canOptimize(xs, dir, Some(X))
@@ -167,17 +172,20 @@ class ChangeIndexRegisterOptimization(preferX2Y: Boolean) extends AssemblyOptimi
       :: xs => a :: i :: b :: switchX2Y(xs)
     case (x@AssemblyLine(TAX, _, _, _)) :: xs => x.copy(opcode = TAY) :: switchX2Y(xs)
     case (x@AssemblyLine(TXA, _, _, _)) :: xs => x.copy(opcode = TYA) :: switchX2Y(xs)
+    case (x@AssemblyLine(TXY | TYX, _, _, _)) :: xs => x.copy(opcode = TYX) :: switchX2Y(xs) // keep the transfer for the flags
     case (x@AssemblyLine(STX, _, _, _)) :: xs => x.copy(opcode = STY) :: switchX2Y(xs)
     case (x@AssemblyLine(LDX, _, _, _)) :: xs => x.copy(opcode = LDY) :: switchX2Y(xs)
     case (x@AssemblyLine(INX, _, _, _)) :: xs => x.copy(opcode = INY) :: switchX2Y(xs)
     case (x@AssemblyLine(DEX, _, _, _)) :: xs => x.copy(opcode = DEY) :: switchX2Y(xs)
     case (x@AssemblyLine(CPX, _, _, _)) :: xs => x.copy(opcode = CPY) :: switchX2Y(xs)
+    case (x@AssemblyLine(HuSAX, _, _, _)) :: xs => x.copy(opcode = SAY) :: switchX2Y(xs)
 
     case AssemblyLine(LAX, _, _, _) :: xs => ErrorReporting.fatal("Unexpected LAX")
     case AssemblyLine(TXS, _, _, _) :: xs => ErrorReporting.fatal("Unexpected TXS")
     case AssemblyLine(TSX, _, _, _) :: xs => ErrorReporting.fatal("Unexpected TSX")
     case AssemblyLine(SBX, _, _, _) :: xs => ErrorReporting.fatal("Unexpected SBX")
     case AssemblyLine(SAX, _, _, _) :: xs => ErrorReporting.fatal("Unexpected SAX")
+    case AssemblyLine(SXY, _, _, _) :: xs => ErrorReporting.fatal("Unexpected SXY")
 
     case (x@AssemblyLine(_, AbsoluteX, _, _)) :: xs => x.copy(addrMode = AbsoluteY) :: switchX2Y(xs)
     case (x@AssemblyLine(_, ZeroPageX, _, _)) :: xs => x.copy(addrMode = ZeroPageY) :: switchX2Y(xs)
@@ -197,11 +205,14 @@ class ChangeIndexRegisterOptimization(preferX2Y: Boolean) extends AssemblyOptimi
       :: xs => code.take(3) ++ switchY2X(xs)
     case (x@AssemblyLine(TAY, _, _, _)) :: xs => x.copy(opcode = TAX) :: switchY2X(xs)
     case (x@AssemblyLine(TYA, _, _, _)) :: xs => x.copy(opcode = TXA) :: switchY2X(xs)
+    case (x@AssemblyLine(TYX | TXY, _, _, _)) :: xs => x.copy(opcode = TXY) :: switchY2X(xs) // keep the transfer for the flags
     case (x@AssemblyLine(STY, _, _, _)) :: xs => x.copy(opcode = STX) :: switchY2X(xs)
     case (x@AssemblyLine(LDY, _, _, _)) :: xs => x.copy(opcode = LDX) :: switchY2X(xs)
     case (x@AssemblyLine(INY, _, _, _)) :: xs => x.copy(opcode = INX) :: switchY2X(xs)
     case (x@AssemblyLine(DEY, _, _, _)) :: xs => x.copy(opcode = DEX) :: switchY2X(xs)
     case (x@AssemblyLine(CPY, _, _, _)) :: xs => x.copy(opcode = CPX) :: switchY2X(xs)
+    case (x@AssemblyLine(SAY, _, _, _)) :: xs => x.copy(opcode = HuSAX) :: switchY2X(xs)
+    case AssemblyLine(SXY, _, _, _) :: xs => ErrorReporting.fatal("Unexpected SXY")
 
     case (x@AssemblyLine(_, AbsoluteY, _, _)) :: xs => x.copy(addrMode = AbsoluteX) :: switchY2X(xs)
     case (x@AssemblyLine(_, ZeroPageY, _, _)) :: xs => x.copy(addrMode = ZeroPageX) :: switchY2X(xs)
diff --git a/src/main/scala/millfork/assembly/opt/CmosOptimizations.scala b/src/main/scala/millfork/assembly/opt/CmosOptimizations.scala
index 94434c88..1b40030b 100644
--- a/src/main/scala/millfork/assembly/opt/CmosOptimizations.scala
+++ b/src/main/scala/millfork/assembly/opt/CmosOptimizations.scala
@@ -1,10 +1,10 @@
 package millfork.assembly.opt
 
-import millfork.assembly.{AssemblyLine, Opcode}
+import millfork.assembly.{AssemblyLine, Opcode, State}
 import millfork.assembly.Opcode._
 import millfork.assembly.AddrMode._
 import millfork.assembly.OpcodeClasses._
-import millfork.env.{Constant, NormalFunction}
+import millfork.env._
 
 /**
   * @author Karol Stasiak
@@ -15,22 +15,39 @@ object CmosOptimizations {
 
   val ZeroStoreAsStz = new RuleBasedAssemblyOptimization("Zero store",
     needsFlowInfo = FlowInfoRequirement.ForwardFlow,
-    (HasA(0) & HasOpcode(STA) & Elidable & HasAddrModeIn(StzAddrModes)) ~~> {code =>
+    (HasA(0) & HasZ(0) & HasOpcode(STA) & Elidable & HasAddrModeIn(StzAddrModes)) ~~> {code =>
       code.head.copy(opcode = STZ) :: Nil
     },
-    (HasX(0) & HasOpcode(STX) & Elidable & HasAddrModeIn(StzAddrModes)) ~~> {code =>
+    (HasX(0) & HasZ(0) & HasOpcode(STX) & Elidable & HasAddrModeIn(StzAddrModes)) ~~> {code =>
       code.head.copy(opcode = STZ) :: Nil
     },
-    (HasY(0) & HasOpcode(STY) & Elidable & HasAddrModeIn(StzAddrModes)) ~~> {code =>
+    (HasY(0) & HasZ(0) & HasOpcode(STY) & Elidable & HasAddrModeIn(StzAddrModes)) ~~> {code =>
       code.head.copy(opcode = STZ) :: Nil
     },
   )
 
+  val SimplerBitFlipping = new RuleBasedAssemblyOptimization("Simpler bit flipping", // rewrites load/mask/store triples on one address into an immediate load followed by TSB or TRB
+    needsFlowInfo = FlowInfoRequirement.BackwardFlow,
+    (Elidable & HasOpcode(LDA) & HasAddrModeIn(Set(Absolute, ZeroPage)) & MatchParameter(0)) ~ // rule 1: LDA addr
+      (Elidable & HasOpcode(ORA) & MatchImmediate(1)) ~ // ORA #mask
+      (Elidable & HasOpcode(STA) & HasAddrModeIn(Set(Absolute, ZeroPage)) & MatchParameter(0) & DoesntMatterWhatItDoesWith(State.A, State.Z, State.N)) ~~> { code => // STA to the same addr; only fires when A, Z and N are not needed afterwards
+      List(code(1).copy(opcode = Opcode.LDA), code.head.copy(opcode = TSB)) // result: LDA #mask, then TSB addr
+    },
+    (Elidable & HasOpcode(LDA) & HasAddrModeIn(Set(Absolute, ZeroPage)) & MatchParameter(0)) ~ // rule 2: LDA addr
+      (Elidable & HasOpcode(AND) & MatchImmediate(1)) ~ // AND #mask
+      (Elidable & HasOpcode(STA) & HasAddrModeIn(Set(Absolute, ZeroPage)) & MatchParameter(0) & DoesntMatterWhatItDoesWith(State.A, State.Z, State.N)) ~~> { code => // STA to the same addr, under the same A/Z/N condition as rule 1
+      List(
+        code(1).copy(opcode = Opcode.LDA, parameter = CompoundConstant(MathOperator.Exor, NumericConstant(255, 1), code(1).parameter)), // LDA #(255 EXOR mask), i.e. the bits that have to be cleared
+        code.head.copy(opcode = TRB)) // then TRB addr
+    },
+  )
+
   val OptimizeZeroIndex = new RuleBasedAssemblyOptimization("Optimizing zero index",
     needsFlowInfo = FlowInfoRequirement.ForwardFlow,
-    (Elidable & HasY(0) & HasAddrMode(IndexedY) & HasOpcodeIn(SupportsZeroPageIndirect)) ~~> (code => code.map(_.copy(addrMode = ZeroPageIndirect))),
-    (Elidable & HasX(0) & HasAddrMode(IndexedX) & HasOpcodeIn(SupportsZeroPageIndirect)) ~~> (code => code.map(_.copy(addrMode = ZeroPageIndirect))),
+    (Elidable & HasY(0) & HasZ(0) & HasAddrMode(IndexedY) & HasOpcodeIn(SupportsIndexedZ)) ~~> (code => code.map(_.copy(addrMode = IndexedZ))),
+    (Elidable & HasX(0) & HasZ(0) & HasAddrMode(IndexedX) & HasOpcodeIn(SupportsIndexedZ)) ~~> (code => code.map(_.copy(addrMode = IndexedZ))),
+    (Elidable & HasX(0) & HasZ(0) & HasAddrMode(AbsoluteIndexedX) & HasOpcode(JMP)) ~~> (code => code.map(_.copy(addrMode = Indirect))),
   )
 
-  val All: List[AssemblyOptimization] = List(OptimizeZeroIndex, ZeroStoreAsStz)
+  val All: List[AssemblyOptimization] = List(OptimizeZeroIndex, SimplerBitFlipping, ZeroStoreAsStz)
 }
diff --git a/src/main/scala/millfork/assembly/opt/CoarseFlowAnalyzer.scala b/src/main/scala/millfork/assembly/opt/CoarseFlowAnalyzer.scala
index d283ed66..93a37d11 100644
--- a/src/main/scala/millfork/assembly/opt/CoarseFlowAnalyzer.scala
+++ b/src/main/scala/millfork/assembly/opt/CoarseFlowAnalyzer.scala
@@ -1,5 +1,6 @@
 package millfork.assembly.opt
 
+import millfork.{CompilationFlag, CompilationOptions}
 import millfork.assembly.{AssemblyLine, OpcodeClasses, State}
 import millfork.env.{Label, MemoryAddressConstant, NormalFunction, NumericConstant}
 
@@ -27,6 +28,12 @@ sealed trait Status[T] {
 
 object Status {
 
+  implicit class BoolStatusOps(val inner: Status[Boolean]) extends AnyVal { // extension methods for statuses of boolean (flag) values
+    def withHiddenHi: Status[Boolean] = inner match { // weakens a flag status: a definite `false` is kept, every other status degrades to AnyStatus; applied in this file to Z results of the `_W` opcodes, whose tracked value is masked to 8 bits
+      case SingleStatus(false) => inner
+      case _ => AnyStatus()
+    }
+  }
   implicit class IntStatusOps(val inner: Status[Int]) extends AnyVal {
     def map[T](f: Int => T): Status[T] = inner match {
       case SingleStatus(x) => SingleStatus(f(x))
@@ -88,11 +95,14 @@ case class AnyStatus[T]() extends Status[T] {
 case class CpuStatus(a: Status[Int] = UnknownStatus(),
                      x: Status[Int] = UnknownStatus(),
                      y: Status[Int] = UnknownStatus(),
+                     iz: Status[Int] = UnknownStatus(),
                      z: Status[Boolean] = UnknownStatus(),
                      n: Status[Boolean] = UnknownStatus(),
                      c: Status[Boolean] = UnknownStatus(),
                      v: Status[Boolean] = UnknownStatus(),
                      d: Status[Boolean] = UnknownStatus(),
+                     m: Status[Boolean] = UnknownStatus(),
+                     w: Status[Boolean] = UnknownStatus()
                     ) {
 
   override def toString: String = s"A=$a,X=$x,Y=$y,Z=$z,N=$n,C=$c,V=$v,D=$d"
@@ -107,47 +117,65 @@ case class CpuStatus(a: Status[Int] = UnknownStatus(),
     a = this.a ~ that.a,
     x = this.x ~ that.x,
     y = this.y ~ that.y,
+    iz = this.iz ~ that.iz,
     z = this.z ~ that.z,
     n = this.n ~ that.n,
     c = this.c ~ that.c,
     v = this.v ~ that.v,
     d = this.d ~ that.d,
+    m = this.m ~ that.m,
+    w = this.w ~ that.w,
   )
 
   def hasClear(state: State.Value): Boolean = state match {
     case State.A => a.contains(0)
     case State.X => x.contains(0)
     case State.Y => y.contains(0)
+    case State.IZ => iz.contains(0)
     case State.Z => z.contains(false)
     case State.N => n.contains(false)
     case State.C => c.contains(false)
     case State.V => v.contains(false)
     case State.D => d.contains(false)
+    case State.M => m.contains(false)
+    case State.W => w.contains(false)
+    case _ => false
   }
 
   def hasSet(state: State.Value): Boolean = state match {
     case State.A => false
     case State.X => false
     case State.Y => false
+    case State.IZ => false
     case State.Z => z.contains(true)
     case State.N => n.contains(true)
     case State.C => c.contains(true)
     case State.V => v.contains(true)
     case State.D => d.contains(true)
+    case State.M => m.contains(true)
+    case State.W => w.contains(true)
+    case _ => false
   }
 }
 
 object CoarseFlowAnalyzer {
   //noinspection RedundantNewCaseClass
-  def analyze(f: NormalFunction, code: List[AssemblyLine]): List[CpuStatus] = {
-    val flagArray = Array.fill[CpuStatus](code.length)(CpuStatus())
+  def analyze(f: NormalFunction, code: List[AssemblyLine], compilationOptions: CompilationOptions): List[CpuStatus] = {
+    val emptyIz: Status[Int] = if (compilationOptions.flag(CompilationFlag.Emit65CE02Opcodes)) UnknownStatus() else SingleStatus(0)
+    val emptyStatus = CpuStatus(iz = emptyIz)
+    val flagArray = Array.fill[CpuStatus](code.length)(emptyStatus)
     val codeArray = code.toArray
-    val initialStatus = new CpuStatus(d = SingleStatus(false))
+    val initialStatus = new CpuStatus(
+      d = SingleStatus(false),
+      m = SingleStatus(true),
+      w = SingleStatus(true),
+      iz = emptyIz
+    )
 
     var changed = true
     while (changed) {
       changed = false
-      var currentStatus: CpuStatus = if (f.interrupt) CpuStatus() else initialStatus
+      var currentStatus: CpuStatus = if (f.interrupt) emptyStatus else initialStatus
       for (i <- codeArray.indices) {
         import millfork.assembly.Opcode._
         import millfork.assembly.AddrMode._
@@ -161,7 +189,7 @@ object CoarseFlowAnalyzer {
             currentStatus = codeArray.indices.flatMap(j => codeArray(j) match {
               case AssemblyLine(_, _, MemoryAddressConstant(Label(L)), _) => Some(flagArray(j))
               case _ => None
-            }).fold(CpuStatus())(_ ~ _)
+            }).fold(emptyStatus)(_ ~ _)
 
           case AssemblyLine(BCC, _, _, _) =>
             currentStatus = currentStatus.copy(c = currentStatus.c ~ SingleStatus(true))
@@ -191,6 +219,25 @@ object CoarseFlowAnalyzer {
           case AssemblyLine(CLV, _, _, _) =>
             currentStatus = currentStatus.copy(v = SingleStatus(false))
 
+          case AssemblyLine(REP, Immediate, NumericConstant(nn, _), _) =>
+            if ((nn & 1) != 0) currentStatus = currentStatus.copy(c = SingleStatus(false))
+            if ((nn & 2) != 0) currentStatus = currentStatus.copy(z = SingleStatus(false))
+            if ((nn & 8) != 0) currentStatus = currentStatus.copy(d = SingleStatus(false))
+            if ((nn & 0x10) != 0) currentStatus = currentStatus.copy(w = SingleStatus(false))
+            if ((nn & 0x20) != 0) currentStatus = currentStatus.copy(m = SingleStatus(false))
+            if ((nn & 0x40) != 0) currentStatus = currentStatus.copy(v = SingleStatus(false))
+            if ((nn & 0x80) != 0) currentStatus = currentStatus.copy(n = SingleStatus(false))
+          case AssemblyLine(SEP, Immediate, NumericConstant(nn, _), _) =>
+            if ((nn & 1) != 0) currentStatus = currentStatus.copy(c = SingleStatus(true))
+            if ((nn & 2) != 0) currentStatus = currentStatus.copy(z = SingleStatus(true))
+            if ((nn & 8) != 0) currentStatus = currentStatus.copy(d = SingleStatus(true))
+            if ((nn & 0x10) != 0) currentStatus = currentStatus.copy(w = SingleStatus(true))
+            if ((nn & 0x20) != 0) currentStatus = currentStatus.copy(m = SingleStatus(true))
+            if ((nn & 0x40) != 0) currentStatus = currentStatus.copy(v = SingleStatus(true))
+            if ((nn & 0x80) != 0) currentStatus = currentStatus.copy(n = SingleStatus(true))
+          case AssemblyLine(XCE, _, _, _) => // XCE exchanges carry with the emulation bit; entering emulation mode forces both register-width flags
+            currentStatus = currentStatus.copy(c = AnyStatus(), m = AnyStatus(), w = AnyStatus(), x = AnyStatus()) // invalidate the index-width flag `w` as well as `m`; the X register stays invalidated as before (conservative)
+
           case AssemblyLine(JSR, _, _, _) =>
             currentStatus = initialStatus
 
@@ -203,9 +250,9 @@ object CoarseFlowAnalyzer {
           case AssemblyLine(LDA, Immediate, NumericConstant(nn, _), _) =>
             val n = nn.toInt & 0xff
             currentStatus = currentStatus.nz(n).copy(a = SingleStatus(n))
-          case AssemblyLine(LAX, Immediate, NumericConstant(nn, _), _) =>
+          case AssemblyLine(LDZ, Immediate, NumericConstant(nn, _), _) =>
             val n = nn.toInt & 0xff
-            currentStatus = currentStatus.nz(n).copy(a = SingleStatus(n), x = SingleStatus(n))
+            currentStatus = currentStatus.nz(n).copy(iz = SingleStatus(n))
 
           case AssemblyLine(ADC, Immediate, NumericConstant(nn, _), _) =>
             val n = nn.toInt
@@ -231,6 +278,21 @@ object CoarseFlowAnalyzer {
               c = currentStatus.a.map(i => (i & n & 1) == 0),
               a = currentStatus.a.map(i => (i & n & 0xff) >> 1))
 
+
+          case AssemblyLine(ADC_W, WordImmediate, NumericConstant(nn, _), _) =>
+            val n = nn.toInt & 0xff
+            val newA = currentStatus.a.adc(n, currentStatus.c, currentStatus.d)
+            currentStatus = currentStatus.copy(n = AnyStatus(), z = newA.z().withHiddenHi, a = newA, c = AnyStatus(), v = AnyStatus())
+          case AssemblyLine(EOR_W, WordImmediate, NumericConstant(nn, _), _) =>
+            val n = nn.toInt & 0xff
+            currentStatus = currentStatus.copy(n = AnyStatus(), z = currentStatus.a.z(_ ^ n).withHiddenHi, a = currentStatus.a.map(_ ^ n))
+          case AssemblyLine(AND_W, WordImmediate, NumericConstant(nn, _), _) =>
+            val n = nn.toInt & 0xff
+            currentStatus = currentStatus.copy(n = AnyStatus(), z = currentStatus.a.z(_ & n).withHiddenHi, a = currentStatus.a.map(_ & n))
+          case AssemblyLine(ORA_W, WordImmediate, NumericConstant(nn, _), _) =>
+            val n = nn.toInt & 0xff
+            currentStatus = currentStatus.copy(n = AnyStatus(), z = currentStatus.a.z(_ | n).withHiddenHi, a = currentStatus.a.map(_ | n))
+
           case AssemblyLine(INX, Implied, _, _) =>
             currentStatus = currentStatus.copy(n = currentStatus.x.n(_ + 1), z = currentStatus.x.z(_ + 1), x = currentStatus.x.map(v => (v + 1) & 0xff))
           case AssemblyLine(DEX, Implied, _, _) =>
@@ -243,6 +305,22 @@ object CoarseFlowAnalyzer {
             currentStatus = currentStatus.copy(n = currentStatus.a.n(_ + 1), z = currentStatus.a.z(_ + 1), a = currentStatus.a.map(v => (v + 1) & 0xff))
           case AssemblyLine(DEC, Implied, _, _) =>
             currentStatus = currentStatus.copy(n = currentStatus.a.n(_ - 1), z = currentStatus.a.z(_ - 1), a = currentStatus.a.map(v => (v - 1) & 0xff))
+          case AssemblyLine(NEG, Implied, _, _) =>
+            currentStatus = currentStatus.copy(n = currentStatus.a.n(256 - _), z = currentStatus.a.z(256 - _), a = currentStatus.a.map(v => (256 - v) & 0xff))
+
+          case AssemblyLine(INX_W, Implied, _, _) =>
+            currentStatus = currentStatus.copy(n = AnyStatus(), z = currentStatus.x.z(_ + 1).withHiddenHi, x = currentStatus.x.map(v => (v + 1) & 0xff))
+          case AssemblyLine(DEX_W, Implied, _, _) =>
+            currentStatus = currentStatus.copy(n = AnyStatus(), z = currentStatus.x.z(_ - 1).withHiddenHi, x = currentStatus.x.map(v => (v - 1) & 0xff))
+          case AssemblyLine(INY_W, Implied, _, _) =>
+            currentStatus = currentStatus.copy(n = AnyStatus(), z = currentStatus.y.z(_ + 1).withHiddenHi, y = currentStatus.y.map(v => (v + 1) & 0xff))
+          case AssemblyLine(DEY_W, Implied, _, _) =>
+            currentStatus = currentStatus.copy(n = AnyStatus(), z = currentStatus.y.z(_ - 1).withHiddenHi, y = currentStatus.y.map(v => (v - 1) & 0xff))
+          case AssemblyLine(INC_W, Implied, _, _) =>
+            currentStatus = currentStatus.copy(n = AnyStatus(), z = currentStatus.a.z(_ + 1).withHiddenHi, a = currentStatus.a.map(v => (v + 1) & 0xff))
+          case AssemblyLine(DEC_W, Implied, _, _) =>
+            currentStatus = currentStatus.copy(n = AnyStatus(), z = currentStatus.a.z(_ - 1).withHiddenHi, a = currentStatus.a.map(v => (v - 1) & 0xff))
+
           case AssemblyLine(TAX, _, _, _) =>
             currentStatus = currentStatus.copy(x = currentStatus.a, n = currentStatus.a.n(), z = currentStatus.a.z())
           case AssemblyLine(TXA, _, _, _) =>
@@ -251,11 +329,22 @@ object CoarseFlowAnalyzer {
             currentStatus = currentStatus.copy(y = currentStatus.a, n = currentStatus.a.n(), z = currentStatus.a.z())
           case AssemblyLine(TYA, _, _, _) =>
             currentStatus = currentStatus.copy(a = currentStatus.y, n = currentStatus.y.n(), z = currentStatus.y.z())
+          case AssemblyLine(TAZ, _, _, _) =>
+            currentStatus = currentStatus.copy(iz = currentStatus.a, n = currentStatus.a.n(), z = currentStatus.a.z())
+          case AssemblyLine(TZA, _, _, _) =>
+            currentStatus = currentStatus.copy(a = currentStatus.iz, n = currentStatus.iz.n(), z = currentStatus.iz.z())
 
           case AssemblyLine(ASL, Implied, _, _) =>
-            currentStatus = currentStatus.copy(a = currentStatus.a.map(v => (v << 1) & 0xff), n = currentStatus.a.n(_ << 1), z = currentStatus.a.z(_ << 1),c = currentStatus.a.map(a => a.&(0xff).!=(0)))
+            currentStatus = currentStatus.copy(a = currentStatus.a.map(v => (v << 1) & 0xff), n = currentStatus.a.n(_ << 1), z = currentStatus.a.z(_ << 1),c = currentStatus.a.map(a => a.&(0x80).!=(0))) // ASL shifts the old bit 7 into carry, so only that bit decides C
           case AssemblyLine(LSR, Implied, _, _) =>
             currentStatus = currentStatus.copy(a = currentStatus.a.map(a => a.>>(1).&(0x7f)), n = currentStatus.a.n(a => a.>>(1).&(0x7f)), z = currentStatus.a.z(a => a.>>(1).&(0x7f)),c = currentStatus.a.map(a => a.&(1).!=(0)))
+          case AssemblyLine(ASR, Implied, _, _) =>
+            currentStatus = currentStatus.copy(a = currentStatus.a.map(a => a.toByte.>>(1).&(0xff)), n = currentStatus.a.n(a => a.toByte.>>(1).&(0xff)), z = currentStatus.a.z(a => a.toByte.>>(1).&(0xff)),c = currentStatus.a.map(a => a.&(1).!=(0)))
+
+          case AssemblyLine(ASL_W, Implied, _, _) =>
+            currentStatus = currentStatus.copy(a = currentStatus.a.map(v => (v << 1) & 0xff), n = AnyStatus(), z = currentStatus.a.z(_ << 1).withHiddenHi, c = AnyStatus())
+          case AssemblyLine(LSR_W, Implied, _, _) =>
+            currentStatus = currentStatus.copy(a = AnyStatus(), n = AnyStatus(), z = currentStatus.a.z(a => a.>>(1).&(0x7f)).withHiddenHi, c = currentStatus.a.map(a => a.&(1).!=(0)))
 
           case AssemblyLine(opcode, addrMode, parameter, _) =>
             if (OpcodeClasses.ChangesX(opcode)) currentStatus = currentStatus.copy(x = AnyStatus())
diff --git a/src/main/scala/millfork/assembly/opt/FlowAnalyzer.scala b/src/main/scala/millfork/assembly/opt/FlowAnalyzer.scala
index f11165bf..09a4b4d9 100644
--- a/src/main/scala/millfork/assembly/opt/FlowAnalyzer.scala
+++ b/src/main/scala/millfork/assembly/opt/FlowAnalyzer.scala
@@ -39,7 +39,7 @@ object FlowAnalyzer {
         if (options.flag(CompilationFlag.DetailedFlowAnalysis)) {
           () => QuantumFlowAnalyzer.analyze(f, code).map(_.collapse)
         } else {
-          () => CoarseFlowAnalyzer.analyze(f, code)
+          () => CoarseFlowAnalyzer.analyze(f, code, options)
         }
       case FlowInfoRequirement.BackwardFlow | FlowInfoRequirement.JustLabels | FlowInfoRequirement.NoRequirement =>
         () => List.fill(code.size)(EmptyCpuStatus)
diff --git a/src/main/scala/millfork/assembly/opt/HudsonOptimizations.scala b/src/main/scala/millfork/assembly/opt/HudsonOptimizations.scala
new file mode 100644
index 00000000..38e90b43
--- /dev/null
+++ b/src/main/scala/millfork/assembly/opt/HudsonOptimizations.scala
@@ -0,0 +1,21 @@
+package millfork.assembly.opt
+
+import millfork.assembly.AssemblyLine
+import millfork.assembly.Opcode._
+import millfork.assembly.AddrMode._
+import millfork.env.NumericConstant
+
+/** Assembly rewrites for the Hudson target (the commit subject names the CPU as HuC6280).
+  * @author Karol Stasiak
+  */
+object HudsonOptimizations {
+
+  val All: List[AssemblyOptimization] = List() // no optimizations are registered in this list yet
+
+  def removeLoadZero(code: List[AssemblyLine]): List[AssemblyLine] = code.map{ // maps each line independently: immediate loads of constant 0 into A/X/Y become implied CLA/CLX/CLY, all other lines pass through unchanged
+    case AssemblyLine(LDA, Immediate, NumericConstant(0, _), true) => AssemblyLine.implied(CLA) // only lines whose fourth field is `true` are rewritten (presumably the elidable flag -- confirm)
+    case AssemblyLine(LDX, Immediate, NumericConstant(0, _), true) => AssemblyLine.implied(CLX)
+    case AssemblyLine(LDY, Immediate, NumericConstant(0, _), true) => AssemblyLine.implied(CLY) // NOTE(review): LDx #0 updates N and Z; confirm how CLA/CLX/CLY treat flags and that callers never rely on them after these loads
+    case l => l
+  }
+}
diff --git a/src/main/scala/millfork/assembly/opt/LoopUnrolling.scala b/src/main/scala/millfork/assembly/opt/LoopUnrolling.scala
index a579de64..5069c99d 100644
--- a/src/main/scala/millfork/assembly/opt/LoopUnrolling.scala
+++ b/src/main/scala/millfork/assembly/opt/LoopUnrolling.scala
@@ -90,7 +90,7 @@ object LoopUnrolling {
     (Elidable & HasOpcode(LDX) & MatchNumericImmediate(Start)).capture(Initialization) ~
       (Elidable & HasOpcode(BEQ) & MatchParameter(Skip)) ~
       (Elidable & HasOpcode(LABEL) & MatchParameter(Back)) ~
-      ((Elidable & Not(HasOpcodeIn(Set(RTS, JSR, RTI))) & Not(ChangesX)).*.capture(Body) ~
+      ((Elidable & Not(HasOpcodeIn(Set(RTS, JSR, RTI, RTL))) & Not(ChangesX)).*.capture(Body) ~
         (Elidable & HasOpcodeIn(Set(DEX, INX))).capture(Step)
         ).capture(BodyWithStep) ~
       (Elidable & HasOpcode(CPX) & MatchNumericImmediate(End)).? ~
@@ -104,7 +104,7 @@ object LoopUnrolling {
     },
     (Elidable & HasOpcode(LDX) & MatchNumericImmediate(Start)).capture(Initialization) ~
       (Elidable & HasOpcode(LABEL) & MatchParameter(Back)) ~
-      ((Elidable & Not(HasOpcodeIn(Set(RTS, JSR, RTI))) & Not(ChangesX)).*.capture(Body) ~
+      ((Elidable & Not(HasOpcodeIn(Set(RTS, JSR, RTI, RTL))) & Not(ChangesX)).*.capture(Body) ~
         (Elidable & HasOpcodeIn(Set(DEX, INX))).capture(Step)
         ).capture(BodyWithStep) ~
       (Elidable & HasOpcode(CPX) & MatchNumericImmediate(End)).? ~
@@ -118,7 +118,7 @@ object LoopUnrolling {
     (Elidable & HasOpcode(LDY) & MatchNumericImmediate(Start)).capture(Initialization) ~
       (Elidable & HasOpcode(BEQ) & MatchParameter(Skip)) ~
       (Elidable & HasOpcode(LABEL) & MatchParameter(Back)) ~
-      ((Elidable & Not(HasOpcodeIn(Set(RTS, JSR, RTI))) & Not(ChangesY)).*.capture(Body) ~
+      ((Elidable & Not(HasOpcodeIn(Set(RTS, JSR, RTI, RTL))) & Not(ChangesY)).*.capture(Body) ~
         (Elidable & HasOpcodeIn(Set(DEY, INY))).capture(Step)
         ).capture(BodyWithStep) ~
       (Elidable & HasOpcode(CPY) & MatchNumericImmediate(End)).? ~
@@ -132,7 +132,7 @@ object LoopUnrolling {
     },
     (Elidable & HasOpcode(LDY) & MatchNumericImmediate(Start)).capture(Initialization) ~
       (Elidable & HasOpcode(LABEL) & MatchParameter(Back)) ~
-      ((Elidable & Not(HasOpcodeIn(Set(RTS, JSR, RTI))) & Not(ChangesY)).*.capture(Body) ~
+      ((Elidable & Not(HasOpcodeIn(Set(RTS, JSR, RTI, RTL))) & Not(ChangesY)).*.capture(Body) ~
         (Elidable & HasOpcodeIn(Set(DEY, INY))).capture(Step)
         ).capture(BodyWithStep) ~
       (Elidable & HasOpcode(CPY) & MatchNumericImmediate(End)).? ~
diff --git a/src/main/scala/millfork/assembly/opt/ReverseFlowAnalyzer.scala b/src/main/scala/millfork/assembly/opt/ReverseFlowAnalyzer.scala
index ec690895..14b4d7ba 100644
--- a/src/main/scala/millfork/assembly/opt/ReverseFlowAnalyzer.scala
+++ b/src/main/scala/millfork/assembly/opt/ReverseFlowAnalyzer.scala
@@ -1,7 +1,9 @@
 package millfork.assembly.opt
 
+import millfork.CompilationOptions
 import millfork.assembly.{AssemblyLine, Opcode, OpcodeClasses, State}
 import millfork.env._
+import millfork.error.ErrorReporting
 import millfork.node.Register
 
 import scala.collection.immutable
@@ -33,13 +35,17 @@ case object UnknownImportance extends Importance {
 
 //noinspection RedundantNewCaseClass
 case class CpuImportance(a: Importance = UnknownImportance,
+                         ah: Importance = UnknownImportance,
                          x: Importance = UnknownImportance,
                          y: Importance = UnknownImportance,
+                         iz: Importance = UnknownImportance,
                          n: Importance = UnknownImportance,
                          z: Importance = UnknownImportance,
                          v: Importance = UnknownImportance,
                          c: Importance = UnknownImportance,
                          d: Importance = UnknownImportance,
+                         m: Importance = UnknownImportance,
+                         w: Importance = UnknownImportance,
                         ) {
   override def toString: String = s"A=$a,X=$x,Y=$y,Z=$z,N=$n,C=$c,V=$v,D=$d"
 
@@ -47,38 +53,48 @@ case class CpuImportance(a: Importance = UnknownImportance,
     a = this.a ~ that.a,
     x = this.x ~ that.x,
     y = this.y ~ that.y,
+    ah = this.ah ~ that.ah, iz = this.iz ~ that.iz, // `ah` has to be merged too: left out, it falls back to the default UnknownImportance, which isUnimportant(State.AH) reports as unimportant
     z = this.z ~ that.z,
     n = this.n ~ that.n,
     c = this.c ~ that.c,
     v = this.v ~ that.v,
     d = this.d ~ that.d,
+    m = this.m ~ that.m,
+    w = this.w ~ that.w,
   )
 
   def isUnimportant(state: State.Value): Boolean = state match {
       // UnknownImportance is usually an effect of unreachable code
     case State.A => a != Important
+    case State.AH => ah != Important
     case State.X => x != Important
     case State.Y => y != Important
+    case State.IZ => iz != Important
     case State.Z => z != Important
     case State.N => n != Important
     case State.C => c != Important
     case State.V => v != Important
     case State.D => d != Important
+    case State.M => m != Important
+    case State.W => w != Important
   }
 }
 
 object ReverseFlowAnalyzer {
 
-  val aluAdders = Set(Opcode.ADC, Opcode.SBC, Opcode.ISC, Opcode.DCP)
+  val aluAdders = Set(Opcode.ADC, Opcode.SBC, Opcode.ISC, Opcode.DCP, Opcode.ADC_W, Opcode.SBC_W)
 
   //noinspection RedundantNewCaseClass
   def analyze(f: NormalFunction, code: List[AssemblyLine]): List[CpuImportance] = {
     val importanceArray = Array.fill[CpuImportance](code.length)(new CpuImportance())
     val codeArray = code.toArray
-    val initialStatus = new CpuStatus(d = SingleStatus(false))
 
     var changed = true
-    val finalImportance = new CpuImportance(a = Important, x = Important, y = Important, c = Important, v = Important, d = Important, z = Important, n = Important)
+    val finalImportance = new CpuImportance(
+      a = Important, ah = Important,
+      x = Important, y = Important, iz = Important,
+      c = Important, v = Important, d = Important, z = Important, n = Important,
+      m = Important, w = Important)
     changed = true
     while (changed) {
       changed = false
@@ -91,7 +107,7 @@ object ReverseFlowAnalyzer {
           importanceArray(i) = currentImportance
         }
         codeArray(i) match {
-          case AssemblyLine(opcode, Relative, MemoryAddressConstant(Label(l)), _) if OpcodeClasses.ShortBranching(opcode) =>
+          case AssemblyLine(opcode, Relative | LongRelative, MemoryAddressConstant(Label(l)), _) if OpcodeClasses.ShortConditionalBranching(opcode) =>
             val L = l
             val labelIndex = codeArray.indexWhere {
               case AssemblyLine(LABEL, _, MemoryAddressConstant(Label(L)), _) => true
@@ -101,21 +117,27 @@ object ReverseFlowAnalyzer {
           case _ =>
         }
         codeArray(i) match {
-          case AssemblyLine(JSR | JMP, Absolute, MemoryAddressConstant(fun:FunctionInMemory), _) =>
+          case AssemblyLine(JSR | JMP, Absolute | LongAbsolute, MemoryAddressConstant(fun:FunctionInMemory), _) =>
             var result = new CpuImportance(
               a = Unimportant,
+              ah = Unimportant,
               x = Unimportant,
               y = Unimportant,
+              iz = Unimportant,
               z = Unimportant,
               n = Unimportant,
               c = Unimportant,
               v = Unimportant,
-              d = Important)
+              d = Important,
+              m = Important,
+              w = Important)
             fun.params match {
               case AssemblyParamSignature(params) =>
                 params.foreach(_.variable match {
                   case RegisterVariable(Register.A, _) =>
                     result = result.copy(a = Important)
+                  case RegisterVariable(Register.AW, _) =>
+                    result = result.copy(a = Important, ah = Important)
                   case RegisterVariable(Register.X, _) =>
                     result = result.copy(x = Important)
                   case RegisterVariable(Register.Y, _) =>
@@ -131,16 +153,16 @@ object ReverseFlowAnalyzer {
               case _ =>
             }
             currentImportance = result
-          case AssemblyLine(JSR | BRK, _, _, _) =>
+          case AssemblyLine(JSR | BRK | COP, _, _, _) =>
             currentImportance = finalImportance
-          case AssemblyLine(JMP | BRA, Absolute | Relative, MemoryAddressConstant(Label(l)), _) =>
+          case AssemblyLine(JMP | BRA, Absolute | Relative | LongAbsolute | LongRelative, MemoryAddressConstant(Label(l)), _) =>
             val L = l
             val labelIndex = codeArray.indexWhere {
               case AssemblyLine(LABEL, _, MemoryAddressConstant(Label(L)), _) => true
               case _ => false
             }
             currentImportance = if (labelIndex < 0) finalImportance else importanceArray(labelIndex)
-          case AssemblyLine(JMP, Indirect | AbsoluteIndexedX, _, _) =>
+          case AssemblyLine(JMP, Indirect | AbsoluteIndexedX | LongIndirect, _, _) =>
             currentImportance = finalImportance
           case AssemblyLine(BNE | BEQ, _, _, _) =>
             currentImportance = currentImportance.copy(z = Important)
@@ -148,16 +170,50 @@ object ReverseFlowAnalyzer {
             currentImportance = currentImportance.copy(n = Important)
           case AssemblyLine(SED | CLD, _, _, _) =>
             currentImportance = currentImportance.copy(d = Unimportant)
-          case AssemblyLine(RTS, _, _, _) =>
+          case AssemblyLine(RTS | RTL, _, _, _) =>
             currentImportance = finalImportance
+          case AssemblyLine(TAX, _, _, _) =>
+            currentImportance = currentImportance.copy(a = currentImportance.x ~ currentImportance.a ~ currentImportance.n ~ currentImportance.z, x = Unimportant, n = Unimportant, z = Unimportant, m = Important, w = Important)
+          case AssemblyLine(TAY, _, _, _) =>
+            currentImportance = currentImportance.copy(a = currentImportance.y ~ currentImportance.a ~ currentImportance.n ~ currentImportance.z, y = Unimportant, n = Unimportant, z = Unimportant, m = Important, w = Important)
+          case AssemblyLine(TXA, _, _, _) =>
+            currentImportance = currentImportance.copy(x = currentImportance.a ~ currentImportance.x ~ currentImportance.n ~ currentImportance.z, a = Unimportant, n = Unimportant, z = Unimportant, m = Important, w = Important)
+          case AssemblyLine(TYA, _, _, _) =>
+            currentImportance = currentImportance.copy(y = currentImportance.a ~ currentImportance.y ~ currentImportance.n ~ currentImportance.z, a = Unimportant, n = Unimportant, z = Unimportant, m = Important, w = Important)
+          case AssemblyLine(TAZ, _, _, _) =>
+            currentImportance = currentImportance.copy(a = currentImportance.iz ~ currentImportance.a ~ currentImportance.n ~ currentImportance.z, iz = Unimportant, n = Unimportant, z = Unimportant, m = Important, w = Important)
+          case AssemblyLine(TZA, _, _, _) =>
+            currentImportance = currentImportance.copy(iz = currentImportance.a ~ currentImportance.iz ~ currentImportance.n ~ currentImportance.z, a = Unimportant, n = Unimportant, z = Unimportant, m = Important, w = Important)
+          case AssemblyLine(TXY, _, _, _) =>
+            currentImportance = currentImportance.copy(x = currentImportance.y ~ currentImportance.x ~ currentImportance.n ~ currentImportance.z, y = Unimportant, n = Unimportant, z = Unimportant, m = Important, w = Important)
+          case AssemblyLine(TYX, _, _, _) =>
+            currentImportance = currentImportance.copy(y = currentImportance.x ~ currentImportance.y ~ currentImportance.n ~ currentImportance.z, x = Unimportant, n = Unimportant, z = Unimportant, m = Important, w = Important)
+          case AssemblyLine(HuSAX, _, _, _) =>
+            currentImportance = currentImportance.copy(a = currentImportance.x, x = currentImportance.a, m = Important, w = Important)
+          case AssemblyLine(SAY, _, _, _) =>
+            currentImportance = currentImportance.copy(y = currentImportance.a, a = currentImportance.y, m = Important, w = Important)
+          case AssemblyLine(SXY, _, _, _) =>
+            currentImportance = currentImportance.copy(y = currentImportance.x, x = currentImportance.y, m = Important, w = Important)
           case AssemblyLine(RTI, _, _, _) =>
-            currentImportance = new CpuImportance(a = Unimportant, x = Unimportant, y = Unimportant, z = Unimportant, n = Unimportant, c = Unimportant, v = Unimportant, d = Unimportant)
+            currentImportance = new CpuImportance(
+              a = Unimportant, ah = Unimportant,
+              x = Unimportant, y = Unimportant, iz = Unimportant,
+              z = Unimportant, n = Unimportant, c = Unimportant, v = Unimportant, d = Unimportant,
+              m = Unimportant, w = Unimportant)
           case AssemblyLine(DISCARD_XF, _, _, _) =>
             currentImportance = currentImportance.copy(x = Unimportant, n = Unimportant, z = Unimportant, c = Unimportant, v = Unimportant)
           case AssemblyLine(DISCARD_YF, _, _, _) =>
-            currentImportance = currentImportance.copy(y = Unimportant, n = Unimportant, z = Unimportant, c = Unimportant, v = Unimportant)
+            currentImportance = currentImportance.copy(y = Unimportant, iz = Unimportant, n = Unimportant, z = Unimportant, c = Unimportant, v = Unimportant)
           case AssemblyLine(DISCARD_AF, _, _, _) =>
             currentImportance = currentImportance.copy(a = Unimportant, n = Unimportant, z = Unimportant, c = Unimportant, v = Unimportant)
+          case AssemblyLine(REP | SEP, _, NumericConstant(n, _), _) =>
+            if ((n & 1) != 0) currentImportance = currentImportance.copy(c = Unimportant)
+            if ((n & 2) != 0) currentImportance = currentImportance.copy(z = Unimportant)
+            if ((n & 8) != 0) currentImportance = currentImportance.copy(d = Unimportant)
+            if ((n & 0x10) != 0) currentImportance = currentImportance.copy(w = Unimportant)
+            if ((n & 0x20) != 0) currentImportance = currentImportance.copy(m = Unimportant)
+            if ((n & 0x40) != 0) currentImportance = currentImportance.copy(v = Unimportant)
+            if ((n & 0x80) != 0) currentImportance = currentImportance.copy(n = Unimportant)
           case AssemblyLine(opcode, addrMode, _, _) =>
             val reallyIgnoreC =
               currentImportance.c == Unimportant &&
@@ -177,18 +233,27 @@ object ReverseFlowAnalyzer {
             if (OpcodeClasses.ChangesV(opcode)) currentImportance = currentImportance.copy(v = Unimportant)
             if (OpcodeClasses.ChangesNAndZ(opcode)) currentImportance = currentImportance.copy(n = Unimportant, z = Unimportant)
             if (OpcodeClasses.OverwritesA(opcode)) currentImportance = currentImportance.copy(a = Unimportant)
+            if (OpcodeClasses.OverwritesAH(opcode)) currentImportance = currentImportance.copy(ah = Unimportant)
             if (OpcodeClasses.OverwritesX(opcode)) currentImportance = currentImportance.copy(x = Unimportant)
             if (OpcodeClasses.OverwritesY(opcode)) currentImportance = currentImportance.copy(y = Unimportant)
+            if (OpcodeClasses.OverwritesIZ(opcode)) currentImportance = currentImportance.copy(iz = Unimportant)
             if (OpcodeClasses.ReadsC(opcode) && !reallyIgnoreC) currentImportance = currentImportance.copy(c = Important)
             if (OpcodeClasses.ReadsD(opcode)) currentImportance = currentImportance.copy(d = Important)
             if (OpcodeClasses.ReadsV(opcode)) currentImportance = currentImportance.copy(v = Important)
             if (OpcodeClasses.ReadsXAlways(opcode)) currentImportance = currentImportance.copy(x = Important)
             if (OpcodeClasses.ReadsYAlways(opcode)) currentImportance = currentImportance.copy(y = Important)
+            if (OpcodeClasses.ReadsIZAlways(opcode)) currentImportance = currentImportance.copy(iz = Important)
+            if (OpcodeClasses.ReadsM(opcode)) currentImportance = currentImportance.copy(m = Important)
+            if (OpcodeClasses.ReadsW(opcode)) currentImportance = currentImportance.copy(w = Important)
             if (OpcodeClasses.ReadsAAlways(opcode) && !reallyIgnoreA) currentImportance = currentImportance.copy(a = Important)
+            if (OpcodeClasses.ReadsAHAlways(opcode)) currentImportance = currentImportance.copy(ah = Important)
             if (OpcodeClasses.ReadsAIfImplied(opcode) && addrMode == Implied) currentImportance = currentImportance.copy(a = Important)
-            if (addrMode == AbsoluteX || addrMode == IndexedX || addrMode == ZeroPageX || addrMode == AbsoluteIndexedX)
+            if (OpcodeClasses.ReadsAHIfImplied(opcode) && addrMode == Implied) currentImportance = currentImportance.copy(ah = Important)
+            if (addrMode == AbsoluteX || addrMode == LongAbsoluteX || addrMode == IndexedX || addrMode == ZeroPageX || addrMode == AbsoluteIndexedX)
               currentImportance = currentImportance.copy(x = Important)
-            if (addrMode == AbsoluteY || addrMode == IndexedY || addrMode == ZeroPageY)
+            if (addrMode == IndexedZ /*|| addrMode == LongIndexedZ*/)
+              currentImportance = currentImportance.copy(iz = Important)
+            if (addrMode == AbsoluteY || addrMode == IndexedY || addrMode == ZeroPageY || addrMode == LongIndexedY || addrMode == IndexedSY)
               currentImportance = currentImportance.copy(y = Important)
         }
       }
diff --git a/src/main/scala/millfork/assembly/opt/RuleBasedAssemblyOptimization.scala b/src/main/scala/millfork/assembly/opt/RuleBasedAssemblyOptimization.scala
index d694cb18..84c23cc2 100644
--- a/src/main/scala/millfork/assembly/opt/RuleBasedAssemblyOptimization.scala
+++ b/src/main/scala/millfork/assembly/opt/RuleBasedAssemblyOptimization.scala
@@ -142,15 +142,16 @@ class AssemblyMatchingContext(val compilationOptions: CompilationOptions) {
     val labels = mutable.Set[String]()
     val jumps = mutable.Set[String]()
     get[List[AssemblyLine]](i).foreach {
-      case AssemblyLine(Opcode.RTS | Opcode.RTI | Opcode.BRK, _, _, _) =>
+      // JSR and BSR are allowed
+      case AssemblyLine(Opcode.RTS | Opcode.RTI | Opcode.RTL | Opcode.BRK, _, _, _) =>
         return false
-      case AssemblyLine(Opcode.JMP, AddrMode.Indirect, _, _) =>
+      case AssemblyLine(Opcode.JMP, AddrMode.Indirect | AddrMode.AbsoluteIndexedX | AddrMode.LongIndirect, _, _) =>
         return false
       case AssemblyLine(Opcode.LABEL, _, MemoryAddressConstant(Label(l)), _) =>
         labels += l
       case AssemblyLine(Opcode.JMP, AddrMode.Absolute, MemoryAddressConstant(Label(l)), _) =>
         jumps += l
-      case AssemblyLine(Opcode.JMP, AddrMode.Absolute, _, _) =>
+      case AssemblyLine(Opcode.JMP, AddrMode.Absolute | AddrMode.LongAbsolute, _, _) =>
         return false
       case AssemblyLine(_, AddrMode.Relative, MemoryAddressConstant(Label(l)), _) =>
         jumps += l
@@ -191,25 +192,29 @@ trait AssemblyPattern {
 }
 object HelperCheckers {
   import AddrMode._
-  private val badAddrModes = Set(IndexedX, IndexedY, ZeroPageIndirect, AbsoluteIndexedX)
-  private val goodAddrModes = Set(Implied, Immediate, Relative)
+  private val badAddrModes = Set(IndexedX, IndexedY, IndexedZ, LongIndexedY, LongIndexedZ, IndexedSY, Indirect, TripleAbsolute, Stack)
+  private val goodAddrModes = Set(Implied, Immediate, WordImmediate, Relative, LongRelative)
 
   def memoryAccessDoesntOverlap(l1: AssemblyLine, l2: AssemblyLine): Boolean = {
-    memoryAccessDoesntOverlap(l1.addrMode, l1.parameter, l2.addrMode, l2.parameter)
-  }
-
-  def memoryAccessDoesntOverlap(a1: AddrMode.Value, p1: Constant, a2: AddrMode.Value, p2: Constant): Boolean = {
+    val a1 = l1.addrMode
+    val a2 = l2.addrMode
     if (badAddrModes(a1) || badAddrModes(a2)) return false
     if (goodAddrModes(a1) || goodAddrModes(a2)) return true
+    if ((a1 == IndexedSY) != (a2 == IndexedSY)) return true // bold assertion, but usually true
+    val p1 = l1.parameter
+    val p2 = l2.parameter
+    val w1 = OpcodeClasses.AccessesWordInMemory(l1.opcode)
+    val w2 = OpcodeClasses.AccessesWordInMemory(l2.opcode)
     def handleKnownDistance(distance: Short): Boolean = {
-      val indexingAddrModes = Set(AbsoluteIndexedX, AbsoluteX, ZeroPageX, AbsoluteY, ZeroPageY)
+      // `distance` is the distance between the first byte that can be addressed by l1 (b1) and the first byte that can be addressed by l2 (b2): (b2-b1); returns true when the two accesses cannot touch the same byte
+      val indexingAddrModes = Set(AbsoluteIndexedX, AbsoluteX, ZeroPageX, AbsoluteY, ZeroPageY, LongAbsoluteX) // modes whose effective address is the base plus an index (the bounds below assume an index of 0..255)
       val a1Indexing = indexingAddrModes(a1)
       val a2Indexing = indexingAddrModes(a2)
       (a1Indexing, a2Indexing) match {
-        case (false, false) => distance != 0
-        case (true, false) => distance > 255 || distance < 0
-        case (false, true) => distance > 0 || distance < -255
-        case (true, true) => distance > 255 || distance < -255
+        case (false, false) => distance != 0 && (distance != 1 || !w1) && (distance != -1 || !w2) // fixed addresses: differ, and a word access (w1/w2) must not reach the other's byte
+        case (true, false) => (distance > 255 || distance < 0) && (distance != 256 || !w1) && (distance != -1 || !w2) // l1 covers b1..b1+255, plus b1+256 when it is a word access; the range test is parenthesised so the word guards always apply
+        case (false, true) => (distance > 0 || distance < -255) && (distance != 1 || !w1) && (distance != -256 || !w2) // l2 covers b2..b2+255, plus b2+256 when it is a word access
+        case (true, true) => (distance > 255 || distance < -255) && (distance != 256 || !w1) && (distance != -256 || !w2) // both indexed: ranges must be disjoint, one extra byte for a word access on either side
       }
     }
 
@@ -436,7 +441,7 @@ case class WhereNoMemoryAccessOverlapBetweenTwoLineLists(ix1: Int, ix2: Int) ext
   override def matchTo(ctx: AssemblyMatchingContext, code: List[(FlowInfo, AssemblyLine)]): Option[List[(FlowInfo, AssemblyLine)]] = {
     val s1s = ctx.get[List[AssemblyLine]](ix1)
     val s2s = ctx.get[List[AssemblyLine]](ix2)
-    if (s1s.forall(s1 => s2s.forall(s2 => HelperCheckers.memoryAccessDoesntOverlap(s1.addrMode, s1.parameter, s2.addrMode, s2.parameter)))) Some(code) else None
+    if (s1s.forall(s1 => s2s.forall(s2 => HelperCheckers.memoryAccessDoesntOverlap(s1, s2)))) Some(code) else None
   }
 }
 
@@ -502,6 +507,14 @@ case class HasY(value: Int) extends AssemblyLinePattern {
     flowInfo.statusBefore.y.contains(value)
 }
 
+case class HasZ(value: Int) extends AssemblyLinePattern { // line pattern on the tracked `iz` register status
+  override def validate(needsFlowInfo: FlowInfoRequirement.Value): Unit =
+    FlowInfoRequirement.assertForward(needsFlowInfo) // the match below reads statusBefore, hence the forward-flow assertion
+  // Matches when the status before the line reports `iz` as containing `value`.
+  override def matchLineTo(ctx: AssemblyMatchingContext, flowInfo: FlowInfo, line: AssemblyLine): Boolean =
+    flowInfo.statusBefore.iz.contains(value)
+}
+
 case class DoesntMatterWhatItDoesWith(states: State.Value*) extends AssemblyLinePattern {
   override def validate(needsFlowInfo: FlowInfoRequirement.Value): Unit =
     FlowInfoRequirement.assertBackward(needsFlowInfo)
@@ -520,6 +533,38 @@ case class HasSet(state: State.Value) extends AssemblyLinePattern {
     flowInfo.hasSet(state)
 }
 
+object HasAccu8 extends AssemblyLinePattern { // line pattern on the M flag
+  override def validate(needsFlowInfo: FlowInfoRequirement.Value): Unit =
+    FlowInfoRequirement.assertForward(needsFlowInfo)
+  // Matches when flow info reports the M flag as set (which this pattern's name equates with an 8-bit accumulator).
+  override def matchLineTo(ctx: AssemblyMatchingContext, flowInfo: FlowInfo, line: AssemblyLine): Boolean =
+    flowInfo.hasSet(State.M)
+}
+
+object HasAccu16 extends AssemblyLinePattern { // line pattern on the M flag
+  override def validate(needsFlowInfo: FlowInfoRequirement.Value): Unit =
+    FlowInfoRequirement.assertForward(needsFlowInfo)
+  // Matches when flow info reports the M flag as clear (which this pattern's name equates with a 16-bit accumulator).
+  override def matchLineTo(ctx: AssemblyMatchingContext, flowInfo: FlowInfo, line: AssemblyLine): Boolean =
+    flowInfo.hasClear(State.M)
+}
+
+object HasIndex8 extends AssemblyLinePattern { // line pattern on the W flag
+  override def validate(needsFlowInfo: FlowInfoRequirement.Value): Unit =
+    FlowInfoRequirement.assertForward(needsFlowInfo)
+  // Matches when flow info reports the W flag as set (which this pattern's name equates with 8-bit index registers).
+  override def matchLineTo(ctx: AssemblyMatchingContext, flowInfo: FlowInfo, line: AssemblyLine): Boolean =
+    flowInfo.hasSet(State.W)
+}
+
+object HasIndex16 extends AssemblyLinePattern { // line pattern on the W flag
+  override def validate(needsFlowInfo: FlowInfoRequirement.Value): Unit =
+    FlowInfoRequirement.assertForward(needsFlowInfo)
+  // Matches when flow info reports the W flag as clear (which this pattern's name equates with 16-bit index registers).
+  override def matchLineTo(ctx: AssemblyMatchingContext, flowInfo: FlowInfo, line: AssemblyLine): Boolean =
+    flowInfo.hasClear(State.W)
+}
+
 case class HasClear(state: State.Value) extends AssemblyLinePattern {
   override def validate(needsFlowInfo: FlowInfoRequirement.Value): Unit =
     FlowInfoRequirement.assertForward(needsFlowInfo)
@@ -598,6 +643,11 @@ case object ReadsA extends TrivialAssemblyLinePattern {
     OpcodeClasses.ReadsAAlways(line.opcode) || line.addrMode == AddrMode.Implied && OpcodeClasses.ReadsAIfImplied(line.opcode)
 }
 
+case object ReadsAH extends TrivialAssemblyLinePattern {
+  override def apply(line: AssemblyLine): Boolean =
+    OpcodeClasses.ReadsAHAlways(line.opcode) || line.addrMode == AddrMode.Implied && OpcodeClasses.ReadsAHIfImplied(line.opcode)
+}
+
 case object ReadsMemory extends TrivialAssemblyLinePattern {
   override def apply(line: AssemblyLine): Boolean =
     line.addrMode match {
@@ -632,38 +682,76 @@ case object ConcernsA extends TrivialAssemblyLinePattern {
     OpcodeClasses.ConcernsAAlways(line.opcode) || line.addrMode == AddrMode.Implied && OpcodeClasses.ConcernsAIfImplied(line.opcode)
 }
 
+case object ConcernsAH extends TrivialAssemblyLinePattern {
+  override def apply(line: AssemblyLine): Boolean =
+    OpcodeClasses.ConcernsAHAlways(line.opcode) || line.addrMode == AddrMode.Implied && OpcodeClasses.ConcernsAHIfImplied(line.opcode)
+}
+
 case object ConcernsX extends TrivialAssemblyLinePattern {
-  val XAddrModes = Set(AddrMode.AbsoluteX, AddrMode.IndexedX, AddrMode.ZeroPageX)
+  val XAddrModes = Set(AddrMode.AbsoluteX, AddrMode.AbsoluteIndexedX, AddrMode.LongAbsoluteX, AddrMode.IndexedX, AddrMode.ZeroPageX)
 
   override def apply(line: AssemblyLine): Boolean =
     OpcodeClasses.ConcernsXAlways(line.opcode) || XAddrModes(line.addrMode)
 }
 
 case object ConcernsY extends TrivialAssemblyLinePattern {
-  val YAddrModes = Set(AddrMode.AbsoluteY, AddrMode.IndexedY, AddrMode.ZeroPageY)
+  val YAddrModes = Set(AddrMode.AbsoluteY, AddrMode.IndexedSY, AddrMode.IndexedY, AddrMode.LongIndexedY, AddrMode.ZeroPageY)
 
   override def apply(line: AssemblyLine): Boolean =
     OpcodeClasses.ConcernsYAlways(line.opcode) || YAddrModes(line.addrMode)
 }
 
+case object ConcernsStack extends TrivialAssemblyLinePattern { // true for lines that involve the stack
+  val SAddrModes = Set(AddrMode.IndexedSY, AddrMode.Stack) // addressing modes counted as stack accesses
+  // A line matches if its opcode is in ConcernsStackAlways or its addressing mode is one of SAddrModes.
+  override def apply(line: AssemblyLine): Boolean =
+    OpcodeClasses.ConcernsStackAlways(line.opcode) || SAddrModes(line.addrMode)
+}
+
+case object ConcernsIZ extends TrivialAssemblyLinePattern { // true for lines that involve the IZ register
+  val IZAddrModes = Set(AddrMode.IndexedZ, AddrMode.LongIndexedZ) // addressing modes counted as using IZ
+  // A line matches if its opcode is in ConcernsIZAlways or its addressing mode is one of IZAddrModes.
+  override def apply(line: AssemblyLine): Boolean =
+    OpcodeClasses.ConcernsIZAlways(line.opcode) || IZAddrModes(line.addrMode)
+}
+
 case object ChangesA extends TrivialAssemblyLinePattern {
   override def apply(line: AssemblyLine): Boolean =
     OpcodeClasses.ChangesAAlways(line.opcode) || line.addrMode == AddrMode.Implied && OpcodeClasses.ChangesAIfImplied(line.opcode)
 }
 
+case object ChangesAH extends TrivialAssemblyLinePattern {
+  override def apply(line: AssemblyLine): Boolean =
+    OpcodeClasses.ChangesAHAlways(line.opcode) || line.addrMode == AddrMode.Implied && OpcodeClasses.ChangesAHIfImplied(line.opcode)
+}
+
+case object ChangesM extends TrivialAssemblyLinePattern { // true for lines that may change the M flag
+  override def apply(line: AssemblyLine): Boolean = line match {
+    case AssemblyLine(Opcode.SEP | Opcode.REP, AddrMode.Immediate, NumericConstant(n, _), _) => (n & 0x20) != 0 // numeric mask: decided by bit 0x20 alone
+    case AssemblyLine(Opcode.SEP | Opcode.REP | Opcode.PLP | Opcode.XCE, _, _, _) => true // SEP/REP without a numeric immediate, or PLP/XCE: always reported as changing M
+    case _ => false
+  }
+}
+case object ChangesW extends TrivialAssemblyLinePattern { // true for lines that may change the W flag
+  override def apply(line: AssemblyLine): Boolean = line match {
+    case AssemblyLine(Opcode.SEP | Opcode.REP, AddrMode.Immediate, NumericConstant(n, _), _) => (n & 0x10) != 0 // numeric mask: decided by bit 0x10 alone
+    case AssemblyLine(Opcode.SEP | Opcode.REP | Opcode.PLP | Opcode.XCE, _, _, _) => true // SEP/REP without a numeric immediate, or PLP/XCE: always reported as changing W
+    case _ => false
+  }
+}
 case object ChangesMemory extends TrivialAssemblyLinePattern {
   override def apply(line: AssemblyLine): Boolean =
     OpcodeClasses.ChangesMemoryAlways(line.opcode) || line.addrMode != AddrMode.Implied && OpcodeClasses.ChangesMemoryIfNotImplied(line.opcode)
 }
 
-case class DoesntChangeMemoryAt(addrMode1: Int, param1: Int) extends AssemblyLinePattern {
+case class DoesntChangeMemoryAt(addrMode1: Int, param1: Int, opcode: Opcode.Value = Opcode.NOP) extends AssemblyLinePattern { // addrMode1/param1 are context slots holding the reference access; opcode is the opcode assumed for it
   override def matchLineTo(ctx: AssemblyMatchingContext, flowInfo: FlowInfo, line: AssemblyLine): Boolean = {
     val p1 = ctx.get[Constant](param1)
-    val p2 = line.parameter
     val a1 = ctx.get[AddrMode.Value](addrMode1)
-    val a2 = line.addrMode
     val changesSomeMemory = OpcodeClasses.ChangesMemoryAlways(line.opcode) || line.addrMode != AddrMode.Implied && OpcodeClasses.ChangesMemoryIfNotImplied(line.opcode)
-    !changesSomeMemory || HelperCheckers.memoryAccessDoesntOverlap(a1, p1, a2, p2)
+    // TODO: the reference access is rebuilt from the captured addressing mode and parameter using `opcode` (NOP unless the rule passes one)
+    // this will break if the actual instruction was 16-bit but `opcode` does not say so (the overlap check looks at the opcode to detect word accesses)
+    !changesSomeMemory || HelperCheckers.memoryAccessDoesntOverlap(AssemblyLine(opcode, a1, p1), line)
   }
 }
 
@@ -675,10 +763,10 @@ case object ConcernsMemory extends TrivialAssemblyLinePattern {
 case class DoesNotConcernMemoryAt(addrMode1: Int, param1: Int) extends AssemblyLinePattern {
   override def matchLineTo(ctx: AssemblyMatchingContext, flowInfo: FlowInfo, line: AssemblyLine): Boolean = {
     val p1 = ctx.get[Constant](param1)
-    val p2 = line.parameter
     val a1 = ctx.get[AddrMode.Value](addrMode1)
-    val a2 = line.addrMode
-    HelperCheckers.memoryAccessDoesntOverlap(a1, p1, a2, p2)
+    // TODO: the reference access is rebuilt from the captured addressing mode and parameter with a hard-coded NOP opcode
+    // this will break if the actual instruction was 16-bit (the overlap check looks at the opcode to detect word accesses)
+    HelperCheckers.memoryAccessDoesntOverlap(AssemblyLine(Opcode.NOP, a1, p1), line)
   }
 }
 
@@ -788,8 +876,11 @@ case class MatchNumericImmediate(i: Int) extends AssemblyLinePattern {
 case class DoesntChangeIndexingInAddrMode(i: Int) extends AssemblyLinePattern {
   override def matchLineTo(ctx: AssemblyMatchingContext, flowInfo: FlowInfo, line: AssemblyLine): Boolean =
     ctx.get[AddrMode.Value](i) match {
-      case AddrMode.ZeroPageX | AddrMode.AbsoluteX | AddrMode.IndexedX | AddrMode.AbsoluteIndexedX => !OpcodeClasses.ChangesX.contains(line.opcode)
-      case AddrMode.ZeroPageY | AddrMode.AbsoluteY | AddrMode.IndexedY => !OpcodeClasses.ChangesY.contains(line.opcode)
+      case AddrMode.ZeroPageX | AddrMode.AbsoluteX | AddrMode.LongAbsoluteX | AddrMode.IndexedX | AddrMode.AbsoluteIndexedX => !OpcodeClasses.ChangesX.contains(line.opcode)
+      case AddrMode.ZeroPageY | AddrMode.AbsoluteY | AddrMode.IndexedY | AddrMode.LongIndexedY => !OpcodeClasses.ChangesY.contains(line.opcode)
+      case AddrMode.IndexedZ | AddrMode.LongIndexedZ => !OpcodeClasses.ChangesIZ.contains(line.opcode)
+      case AddrMode.Stack => !OpcodeClasses.ChangesS.contains(line.opcode)
+      case AddrMode.IndexedSY => !OpcodeClasses.ChangesS.contains(line.opcode) && !OpcodeClasses.ChangesY.contains(line.opcode)
       case _ => true
     }
 
diff --git a/src/main/scala/millfork/assembly/opt/SixteenOptimizations.scala b/src/main/scala/millfork/assembly/opt/SixteenOptimizations.scala
new file mode 100644
index 00000000..785448cf
--- /dev/null
+++ b/src/main/scala/millfork/assembly/opt/SixteenOptimizations.scala
@@ -0,0 +1,195 @@
+package millfork.assembly.opt
+import millfork.assembly.Opcode._
+import millfork.assembly.AddrMode._
+import millfork.assembly.OpcodeClasses._
+import millfork.assembly.{AssemblyLine, OpcodeClasses, State}
+import millfork.env.{Constant, NumericConstant}
+/**
+  * @author Karol Stasiak
+  */
+object SixteenOptimizations {
+  // Rule-based peephole optimizations built around XBA, REP/SEP, 16-bit accumulator loads, and the Stack / LongIndexedZ addressing modes.
+  val AccumulatorSwapping = new RuleBasedAssemblyOptimization("Accumulator swapping", // replaces a save/restore pair around one instruction with two XBA instructions
+    needsFlowInfo = FlowInfoRequirement.BothFlows,
+    (Elidable & HasOpcode(PHA) & HasAccu8 & DoesntMatterWhatItDoesWith(State.AH, State.A, State.N, State.Z)) ~
+      (Linear & Not(ConcernsStack)) ~ // exactly one intervening line (no repetition operator)
+      (Elidable & HasOpcode(PLA) & DoesntMatterWhatItDoesWith(State.AH)) ~~> { code =>
+      AssemblyLine.implied(XBA) :: (code.tail.init :+ AssemblyLine.implied(XBA))
+    },
+    (Elidable & HasOpcode(TAX) & HasAccu8 & HasIndex8 & DoesntMatterWhatItDoesWith(State.AH, State.A, State.N, State.Z)) ~
+      (Linear & Not(ConcernsX)) ~
+      (Elidable & HasOpcode(TXA) & DoesntMatterWhatItDoesWith(State.AH, State.X)) ~~> { code =>
+      AssemblyLine.implied(XBA) :: (code.tail.init :+ AssemblyLine.implied(XBA))
+    },
+    (Elidable & HasOpcode(TAY) & HasAccu8 & HasIndex8 & DoesntMatterWhatItDoesWith(State.AH, State.A, State.N, State.Z)) ~
+      (Linear & Not(ConcernsY)) ~
+      (Elidable & HasOpcode(TYA) & DoesntMatterWhatItDoesWith(State.AH, State.Y)) ~~> { code =>
+      AssemblyLine.implied(XBA) :: (code.tail.init :+ AssemblyLine.implied(XBA))
+    },
+  )
+  // Drops single bits from SEP/REP immediate masks when the corresponding flag is irrelevant afterwards or already has the requested value; an empty mask removes the instruction.
+  val RepSepWeakening = new RuleBasedAssemblyOptimization("REP/SEP weakening",
+    needsFlowInfo = FlowInfoRequirement.BothFlows,
+    (Elidable & HasOpcodeIn(Set(SEP, REP)) & HasImmediate(0)) ~~> (_ => Nil),
+    (Elidable & HasOpcodeIn(Set(SEP, REP)) & MatchNumericImmediate(0) & DoesntMatterWhatItDoesWith(State.C)) ~
+          Where(c => c.get[Int](0).&(0x1).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0xFE
+      if (i == 0) Nil else List(AssemblyLine.immediate(code.head.opcode, i))
+    },
+    (Elidable & HasOpcodeIn(Set(SEP, REP)) & MatchNumericImmediate(0) & DoesntMatterWhatItDoesWith(State.Z)) ~
+          Where(c => c.get[Int](0).&(0x2).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0xFD
+      if (i == 0) Nil else List(AssemblyLine.immediate(code.head.opcode, i))
+    },
+    (Elidable & HasOpcodeIn(Set(SEP, REP)) & MatchNumericImmediate(0) & DoesntMatterWhatItDoesWith(State.D)) ~
+          Where(c => c.get[Int](0).&(0x8).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0xF7
+      if (i == 0) Nil else List(AssemblyLine.immediate(code.head.opcode, i))
+    },
+    (Elidable & HasOpcodeIn(Set(SEP, REP)) & MatchNumericImmediate(0) & DoesntMatterWhatItDoesWith(State.W)) ~
+          Where(c => c.get[Int](0).&(0x10).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0xEF
+      if (i == 0) Nil else List(AssemblyLine.immediate(code.head.opcode, i))
+    },
+    (Elidable & HasOpcodeIn(Set(SEP, REP)) & MatchNumericImmediate(0) & DoesntMatterWhatItDoesWith(State.M)) ~
+          Where(c => c.get[Int](0).&(0x20).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0xDF
+      if (i == 0) Nil else List(AssemblyLine.immediate(code.head.opcode, i))
+    },
+    (Elidable & HasOpcodeIn(Set(SEP, REP)) & MatchNumericImmediate(0) & DoesntMatterWhatItDoesWith(State.V)) ~
+          Where(c => c.get[Int](0).&(0x40).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0xBF
+      if (i == 0) Nil else List(AssemblyLine.immediate(code.head.opcode, i))
+    },
+    (Elidable & HasOpcodeIn(Set(SEP, REP)) & MatchNumericImmediate(0) & DoesntMatterWhatItDoesWith(State.N)) ~
+          Where(c => c.get[Int](0).&(0x80).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0x7F
+      if (i == 0) Nil else List(AssemblyLine.immediate(code.head.opcode, i))
+    },
+    // The rules below use forward flow instead: a flag already known set (for SEP) or clear (for REP) is dropped from the mask. Only C, W and M are handled.
+    (Elidable & HasOpcode(SEP) & HasSet(State.C) & MatchNumericImmediate(0)) ~
+      Where(c => c.get[Int](0).&(0x1).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0xFE
+      if (i == 0) Nil else List(AssemblyLine.immediate(SEP, i))
+    },
+    (Elidable & HasOpcode(REP) & HasClear(State.C) & MatchNumericImmediate(0)) ~
+      Where(c => c.get[Int](0).&(0x1).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0xFE
+      if (i == 0) Nil else List(AssemblyLine.immediate(REP, i))
+    },
+
+    (Elidable & HasOpcode(SEP) & HasSet(State.W) & MatchNumericImmediate(0)) ~
+      Where(c => c.get[Int](0).&(0x10).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0xEF
+      if (i == 0) Nil else List(AssemblyLine.immediate(SEP, i))
+    },
+    (Elidable & HasOpcode(REP) & HasClear(State.W) & MatchNumericImmediate(0)) ~
+      Where(c => c.get[Int](0).&(0x10).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0xEF
+      if (i == 0) Nil else List(AssemblyLine.immediate(REP, i))
+    },
+
+    (Elidable & HasOpcode(SEP) & HasSet(State.M) & MatchNumericImmediate(0)) ~
+      Where(c => c.get[Int](0).&(0x20).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0xDF
+      if (i == 0) Nil else List(AssemblyLine.immediate(SEP, i))
+    },
+    (Elidable & HasOpcode(REP) & HasClear(State.M) & MatchNumericImmediate(0)) ~
+      Where(c => c.get[Int](0).&(0x20).!=(0)) ~~> { (code, ctx) =>
+      val i = ctx.get[Int](0) & 0xDF
+      if (i == 0) Nil else List(AssemblyLine.immediate(REP, i))
+    },
+  )
+
+  // Removes an LDA_W that repeats the operand of an earlier LDA_W/STA_W when nothing in between invalidates it.
+  val PointlessLoadAfterLoadOrStore = new RuleBasedAssemblyOptimization("Pointless 16-bit load after load or store",
+    needsFlowInfo = FlowInfoRequirement.NoRequirement,
+    // WordImmediate operand: only A and AH have to stay unchanged in between
+    (HasOpcodeIn(Set(LDA_W, STA_W)) & HasAddrMode(WordImmediate) & MatchParameter(1)) ~
+      (Linear & Not(ChangesA) & Not(ChangesAH)).* ~
+      (Elidable & HasOpcode(LDA_W) & HasAddrMode(WordImmediate) & MatchParameter(1)) ~~> (_.init),
+    // any other addressing mode: the indexing used by that mode and the addressed memory must stay unchanged as well
+    (HasOpcodeIn(Set(LDA_W, STA_W)) & MatchAddrMode(0) & MatchParameter(1)) ~
+      (Linear & Not(ChangesA) & Not(ChangesAH) & DoesntChangeIndexingInAddrMode(0) & DoesntChangeMemoryAt(0, 1, LDA_W)).* ~
+      (Elidable & HasOpcode(LDA_W) & MatchAddrMode(0) & MatchParameter(1)) ~~> (_.init),
+  )
+  // With Y known to be 0 and the W flag known set, switches a LongIndexedY operand to LongIndexedZ for opcodes in SupportsLongIndexedZ.
+  val OptimizeZeroIndex = new RuleBasedAssemblyOptimization("Optimizing zero index for far pointers",
+    needsFlowInfo = FlowInfoRequirement.ForwardFlow,
+    (Elidable & HasY(0) /*& HasZ(0)*/ & HasIndex8 & HasAddrMode(LongIndexedY) & HasOpcodeIn(SupportsLongIndexedZ)) ~~> (code => code.map(_.copy(addrMode = LongIndexedZ))),
+  )
+  // Opcodes that the first two OptimizeStackRelative rules rewrite to the Stack addressing mode.
+  private val SupportsStackAddressing = Set(
+    ADC, AND, EOR, ORA, LDA, STA, SBC, CMP,
+  )
+  // Rewrites TSX-based AbsoluteX accesses with a constant base in 0x100..0x1ff into the Stack addressing mode; the offset is base - 0x100 + (number of INX) - (number of DEX).
+  val OptimizeStackRelative = new RuleBasedAssemblyOptimization("Optimizing stack variables",
+    needsFlowInfo = FlowInfoRequirement.BothFlows,
+    // X does not matter after the access: the TSX/INX/DEX prefix (capture 3) is dropped
+    ((Elidable & HasOpcode(TSX)) ~
+      (Elidable & HasOpcode(INX)).*.captureLength(60) ~
+      (Elidable & HasOpcode(DEX) & DoesntMatterWhatItDoesWith(State.N, State.Z)).*.captureLength(61)).capture(3) ~
+      (Not(ConcernsX) & Not(ChangesS)).*.capture(2) ~
+      (Elidable & SupportsStackAddressing & HasAddrMode(AbsoluteX) & MatchParameter(1) & DoesntMatterWhatItDoesWith(State.X)) ~
+      Where(ctx => ctx.get[Constant](1) match {
+        case NumericConstant(x, _) => x >= 0x100 && x <= 0x1ff
+        case _ => false
+      }) ~~> { (code, ctx) =>
+      ctx.get[List[AssemblyLine]](2) ++ List(
+        code.last.copy(addrMode = Stack, parameter = (ctx.get[Constant](1) + ctx.get[Int](60) - ctx.get[Int](61)-0x100).quickSimplify)
+      )
+    },
+    // N and Z do not matter after the access: the TSX/INX/DEX prefix is kept but moved after the rewritten access
+    ((Elidable & HasOpcode(TSX)) ~
+      (Elidable & HasOpcode(INX)).*.captureLength(60) ~
+      (Elidable & HasOpcode(DEX) & DoesntMatterWhatItDoesWith(State.N, State.Z)).*.captureLength(61)).capture(3) ~
+      (Not(ConcernsX) & Not(ChangesS)).*.capture(2) ~
+      (Elidable & SupportsStackAddressing & HasAddrMode(AbsoluteX) & MatchParameter(1) & DoesntMatterWhatItDoesWith(State.N, State.Z)) ~
+      Where(ctx => ctx.get[Constant](1) match {
+        case NumericConstant(x, _) => x >= 0x100 && x <= 0x1ff
+        case _ => false
+      }) ~~> { (code, ctx) =>
+      ctx.get[List[AssemblyLine]](2) ++ List(
+        code.last.copy(addrMode = Stack, parameter = (ctx.get[Constant](1) + ctx.get[Int](60) - ctx.get[Int](61)-0x100).quickSimplify)
+      ) ++ ctx.get[List[AssemblyLine]](3)
+    },
+    // LDY is not in SupportsStackAddressing: it is emitted as LDA with Stack addressing followed by TAY, which requires A not to matter; prefix dropped
+    ((Elidable & HasOpcode(TSX)) ~
+      (Elidable & HasOpcode(INX)).*.captureLength(60) ~
+      (Elidable & HasOpcode(DEX) & DoesntMatterWhatItDoesWith(State.N, State.Z)).*.captureLength(61)).capture(3) ~
+      (Not(ConcernsX) & Not(ChangesS)).*.capture(2) ~
+      (Elidable & HasOpcode(LDY) & HasAddrMode(AbsoluteX) & MatchParameter(1) & DoesntMatterWhatItDoesWith(State.X, State.A)) ~
+      Where(ctx => ctx.get[Constant](1) match {
+        case NumericConstant(x, _) => x >= 0x100 && x <= 0x1ff
+        case _ => false
+      }) ~~> { (code, ctx) =>
+      ctx.get[List[AssemblyLine]](2) ++ List(
+        AssemblyLine(LDA, Stack,(ctx.get[Constant](1) + ctx.get[Int](60) - ctx.get[Int](61)-0x100).quickSimplify),
+        AssemblyLine.implied(TAY)
+      )
+    },
+    // same LDY rewrite, with the TSX/INX/DEX prefix kept and moved after the access
+    ((Elidable & HasOpcode(TSX)) ~
+      (Elidable & HasOpcode(INX)).*.captureLength(60) ~
+      (Elidable & HasOpcode(DEX) & DoesntMatterWhatItDoesWith(State.N, State.Z)).*.captureLength(61)).capture(3) ~
+      (Not(ConcernsX) & Not(ChangesS)).*.capture(2) ~
+      (Elidable & HasOpcode(LDY) & HasAddrMode(AbsoluteX) & MatchParameter(1) & DoesntMatterWhatItDoesWith(State.N, State.Z, State.A)) ~
+      Where(ctx => ctx.get[Constant](1) match {
+        case NumericConstant(x, _) => x >= 0x100 && x <= 0x1ff
+        case _ => false
+      }) ~~> { (code, ctx) =>
+      ctx.get[List[AssemblyLine]](2) ++ List(
+        AssemblyLine(LDA, Stack,(ctx.get[Constant](1) + ctx.get[Int](60) - ctx.get[Int](61)-0x100).quickSimplify),
+        AssemblyLine.implied(TAY)
+      ) ++ ctx.get[List[AssemblyLine]](3)
+    },
+  )
+
+  // TODO: rewrite most 8-bit optimizations that are applicable to 16-bit code
+
+  val AllForEmulation: List[AssemblyOptimization] = List(AccumulatorSwapping, OptimizeZeroIndex, RepSepWeakening, OptimizeStackRelative)
+
+  val AllForNative: List[AssemblyOptimization] = List(PointlessLoadAfterLoadOrStore)
+
+  val All: List[AssemblyOptimization] = AllForEmulation ++ AllForNative
+}
diff --git a/src/main/scala/millfork/assembly/opt/VariableToRegisterOptimization.scala b/src/main/scala/millfork/assembly/opt/VariableToRegisterOptimization.scala
index d179f72e..0b318393 100644
--- a/src/main/scala/millfork/assembly/opt/VariableToRegisterOptimization.scala
+++ b/src/main/scala/millfork/assembly/opt/VariableToRegisterOptimization.scala
@@ -15,49 +15,87 @@ import scala.collection.mutable.ListBuffer
   */
 object VariableToRegisterOptimization extends AssemblyOptimization {
 
+  case class Features( // per-compilation settings handed to canBeInlined and inlineVars
+                     blastProcessing: Boolean, // populated from CompilationFlag.OptimizeForSonicSpeed
+                     izIsAlwaysZero: Boolean, // true when CompilationFlag.Emit65CE02Opcodes is off; no Z candidates are collected then
+                     indexRegisterTransfers: Boolean, // populated from CompilationFlag.EmitEmulation65816Opcodes
+                     identityArray: Constant) // address of identity$ when the environment defines it, otherwise Constant.Zero
+
   // If any of these opcodes is present within a method,
   // then it's too hard to assign any variable to a register.
   private val opcodesThatAlwaysPrecludeXAllocation = Set(
     JSR, STX, TXA, INX, DEX, CPX,
+    LDX_W, STX_W, CPX_W, DEX_W, INX_W,
     PHX, PLX,
-    SBX, SAX, LXA, XAA, AHX, SHX, SHY, LAS, TAS)
+    SBX, SAX, LXA, XAA, AHX, SHX, SHY, LAS, TAS,
+    HuSAX, SXY, TXY, TYX, // both X<->Y transfers, matching the Y set; TYX overwrites X
+    SEP, REP,
+  )
 
   private val opcodesThatAlwaysPrecludeYAllocation = Set(
     JSR, STY, TYA, INY, DEY, CPY,
+    LDY_W, STY_W, CPY_W, DEY_W, INY_W,
     PHY, PLY,
-    AHX, SHX, SHY, LAS, TAS)
+    AHX, SHX, SHY, LAS, TAS,
+    SAY, SXY, TXY, TYX,
+    SEP, REP,
+  )
+
+  private val opcodesThatAlwaysPrecludeZAllocation = Set( // Z counterpart of the X/Y sets; LDZ and TAZ are absent because canBeInlined has dedicated cases for them
+    JSR, STZ, TZA, INZ, DEZ, CPZ,
+    PHZ, PLZ,
+    SEP, REP,
+  )
 
   private val opcodesThatAlwaysPrecludeAAllocation = Set(
     JSR, PHA, PLA,
-    ADC, SBC, ORA, EOR, AND,
+    ADC, SBC, ORA, EOR, AND, BIT,
+    ADC_W, SBC_W, ORA_W, EOR_W, AND_W, BIT_W,
     RRA, RLA, ISC, SLO, SRE,
     ALR, ARR, ANC, SBX, LXA, XAA,
-    AHX, SHX, SHY, LAS, TAS
+    AHX, SHX, SHY, LAS, TAS,
+    HuSAX, SAY,
+    TCD, TDC, TSC, TCS,
+    SEP, REP,
   )
 
   // If any of these opcodes is used on a variable
   // then it's too hard to assign that variable to a register.
   // Also, LDY prevents assigning a variable to X and LDX and LAX prevent assigning a variable to Y.
   private val opcodesThatCannotBeUsedWithIndexRegistersAsParameters = Set(
+    LDX_W, LDY_W, LDA_W,
     BIT, CPX, CPY, STY,
+    BIT_W, CPX_W, CPY_W, STY_W,
     EOR, ORA, AND, ADC, SBC, CMP,
+    EOR_W, ORA_W, AND_W, ADC_W, SBC_W, CMP_W,
     ROL, ROR, LSR, ASL, STX,
+    ROL_W, ROR_W, LSR_W, ASL_W, STX_W,
+    INC_W, DEC_W,
     SAX, SLO, SRE, ISC, DCP, RLA, RRA,
     AHX, SHY, SHX, LAS, TAS,
-    TRB, TSB)
+    TRB, TSB,
+    ASR,
+  )
 
   private val opcodesThatCannotBeUsedWithAccumulatorAsParameter = Set(
     BIT, CPX, CPY,
+    BIT_W, CPX_W, CPY_W,
     EOR, ORA, AND, ADC, SBC, CMP, STA,
+    EOR_W, ORA_W, AND_W, ADC_W, SBC_W, CMP_W, STA_W,
+    INC_W, DEC_W,
     SAX, SLO, SRE, ISC, DCP, RLA, RRA,
     AHX, SHY, SHX, LAS, TAS,
-    TRB, TSB)
+    TRB, TSB,
+    ASR,
+  )
 
   private val opcodesCommutative = Set(AND, ORA, EOR, ADC)
   private val opcodesIdentityTable = Set(AND, ORA, EOR, CMP, ADC, SBC)
 
   private val LdxAddrModes = Set(ZeroPage, Absolute, Immediate, AbsoluteY, ZeroPageY)
   private val LdyAddrModes = Set(ZeroPage, Absolute, Immediate, AbsoluteX, ZeroPageX)
+  private val LdzAddrModes = Set(Absolute, Immediate, AbsoluteX)
+  private val CpxyzAddrModes = Set(Absolute, Immediate, ZeroPage)
 
   override def name = "Allocating variables to index registers"
 
@@ -97,13 +135,20 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
     val importances = ReverseFlowAnalyzer.analyze(f, code)
     val blastProcessing = options.flag(CompilationFlag.OptimizeForSonicSpeed)
     val identityArray = f.environment.maybeGet[ThingInMemory]("identity$").map(MemoryAddressConstant).getOrElse(Constant.Zero)
+    val izIsAlwaysZero = !options.flag(CompilationFlag.Emit65CE02Opcodes)
+    val features = Features(
+      blastProcessing = blastProcessing, // reuse the values computed just above rather than querying them a second time
+      izIsAlwaysZero = izIsAlwaysZero,
+      indexRegisterTransfers = options.flag(CompilationFlag.EmitEmulation65816Opcodes),
+      identityArray = identityArray
+    )
 
     val xCandidates = variablesWithLifetimes.filter {
       case (vName, range) =>
         importances(range.start).x != Important
     }.flatMap {
       case (vName, range) =>
-        canBeInlined(Some(vName), None, blastProcessing, code.zip(importances).slice(range.start, range.end)).map { score =>
+        canBeInlined(Some(vName), None, None, features, code.zip(importances).slice(range.start, range.end)).map { score =>
           (vName, range, if (variablesWithRegisterHint(vName)) score + 16 else score)
         }
     }
@@ -113,7 +158,17 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
         importances(range.start).y != Important
     }.flatMap {
       case (vName, range) =>
-        canBeInlined(None, Some(vName), blastProcessing, code.zip(importances).slice(range.start, range.end)).map { score =>
+        canBeInlined(None, Some(vName), None, features, code.zip(importances).slice(range.start, range.end)).map { score =>
+          (vName, range, if (variablesWithRegisterHint(vName)) score + 16 else score)
+        }
+    }
+
+    val zCandidates = if (izIsAlwaysZero) Nil else variablesWithLifetimes.filter {
+      case (vName, range) =>
+        importances(range.start).iz != Important
+    }.flatMap {
+      case (vName, range) =>
+        canBeInlined(None, None, Some(vName), features, code.zip(importances).slice(range.start, range.end)).map { score =>
           (vName, range, if (variablesWithRegisterHint(vName)) score + 16 else score)
         }
     }
@@ -134,27 +189,33 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
     }
 //    println(s"X: $xCandidates")
 //    println(s"Y: $yCandidates")
+//    println(s"Z: $zCandidates")
 //    println(s"A: $aCandidates")
 
     val xCandidateSets = NonOverlappingIntervals.apply[(String, Range, Int)](xCandidates, _._2.start, _._2.end)
     val yCandidateSets = NonOverlappingIntervals.apply[(String, Range, Int)](yCandidates, _._2.start, _._2.end)
+    val zCandidateSets = NonOverlappingIntervals.apply[(String, Range, Int)](zCandidates, _._2.start, _._2.end)
     val aCandidateSets = NonOverlappingIntervals.apply[(String, Range, Int)](aCandidates, _._2.start, _._2.end)
 
     val variants = for {
       vx <- xCandidateSets.par
       vy <- yCandidateSets
+      vz <- zCandidateSets
       va <- aCandidateSets
+      if (vx & vz).isEmpty
+      if (vz & vy).isEmpty
+      if (va & vz).isEmpty
       if (vx & vy).isEmpty
       if (vx & va).isEmpty
       if (va & vy).isEmpty
-      score = vx.toSeq.map(_._3).sum + vy.toSeq.map(_._3).sum + va.toSeq.map(_._3).sum
-    } yield (score, vx, vy, va)
+      score = vx.toSeq.map(_._3).sum + vy.toSeq.map(_._3).sum + va.toSeq.map(_._3).sum + vz.toSeq.map(_._3).sum
+    } yield (score, vx, vy, vz, va)
 
     if (variants.isEmpty) {
       return code
     }
 
-    val (_, bestXs, bestYs, bestAs) = variants.maxBy(_._1)
+    val (_, bestXs, bestYs, bestZs, bestAs) = variants.maxBy(_._1)
 
     def reportOptimizedBlock[T](oldCode: List[(AssemblyLine, T)], newCode: List[AssemblyLine]): Unit = {
       oldCode.foreach(l => ErrorReporting.trace(l._1.toString))
@@ -162,9 +223,10 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
       newCode.foreach(l => ErrorReporting.trace(l.toString))
     }
 
-    if (bestXs.nonEmpty || bestYs.nonEmpty || bestAs.nonEmpty) {
+    if (bestXs.nonEmpty || bestYs.nonEmpty || bestZs.nonEmpty || bestAs.nonEmpty) {
       bestXs.foreach(v => f.environment.removeVariable(v._1))
       bestYs.foreach(v => f.environment.removeVariable(v._1))
+      bestZs.foreach(v => f.environment.removeVariable(v._1))
       bestAs.foreach(v => f.environment.removeVariable(v._1))
       val output = ListBuffer[AssemblyLine]()
       var i = 0
@@ -174,7 +236,7 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
           case (v, range, _) =>
             ErrorReporting.debug(s"Inlining $v to register X")
             val oldCode = code.zip(importances).slice(range.start, range.end)
-            val newCode = inlineVars(Some(v), None, None, identityArray, oldCode)
+            val newCode = inlineVars(Some(v), None, None, None, features, oldCode)
             reportOptimizedBlock(oldCode, newCode)
             output ++= newCode
             i = range.end
@@ -185,7 +247,19 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
             case (v, range, _) =>
               ErrorReporting.debug(s"Inlining $v to register Y")
               val oldCode = code.zip(importances).slice(range.start, range.end)
-              val newCode = inlineVars(None, Some(v), None, identityArray, oldCode)
+              val newCode = inlineVars(None, Some(v), None, None, features, oldCode)
+              reportOptimizedBlock(oldCode, newCode)
+              output ++= newCode
+              i = range.end
+              done = true
+          }
+        }
+        if (!done) {
+          bestZs.find(_._2.start == i).foreach {
+            case (v, range, _) =>
+              ErrorReporting.debug(s"Inlining $v to register Z")
+              val oldCode = code.zip(importances).slice(range.start, range.end)
+              val newCode = inlineVars(None, None, Some(v), None, features, oldCode)
               reportOptimizedBlock(oldCode, newCode)
               output ++= newCode
               i = range.end
@@ -197,7 +271,7 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
             case (v, range, _) =>
               ErrorReporting.debug(s"Inlining $v to register A")
               val oldCode = code.zip(importances).slice(range.start, range.end)
-              val newCode = inlineVars(None, None, Some(v), identityArray, oldCode)
+              val newCode = inlineVars(None, None, None, Some(v), features, oldCode)
               reportOptimizedBlock(oldCode, newCode)
               output ++= newCode
               i = range.end
@@ -215,28 +289,42 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
     }
   }
 
-
-  def canBeInlined(xCandidate: Option[String], yCandidate: Option[String], blastProcessing: Boolean, lines: List[(AssemblyLine, CpuImportance)]): Option[Int] = {
+  // TODO: STA has different flag behaviour than TAX, keep it in mind!
+  def canBeInlined(xCandidate: Option[String], yCandidate: Option[String], zCandidate: Option[String], features: Features, lines: List[(AssemblyLine, CpuImportance)]): Option[Int] = {
     val vx = xCandidate.getOrElse("-")
     val vy = yCandidate.getOrElse("-")
+    val vz = zCandidate.getOrElse("-")
     lines match {
       case (AssemblyLine(_, Immediate, SubbyteConstant(MemoryAddressConstant(th), _), _), _) :: xs
-        if th.name == vx || th.name == vy =>
+        if th.name == vx || th.name == vy || th.name == vz =>
         // if an address of a variable is used, then that variable cannot be assigned to a register
         None
       case (AssemblyLine(_, Immediate, HalfWordConstant(MemoryAddressConstant(th), _), _), _) :: xs
-        if th.name == vx || th.name == vy =>
+        if th.name == vx || th.name == vy || th.name == vz =>
         // if an address of a variable is used, then that variable cannot be assigned to a register
         None
 
-      case (AssemblyLine(_, AbsoluteX | AbsoluteY | ZeroPageX | ZeroPageY | IndexedY | IndexedX | ZeroPageIndirect | Indirect | AbsoluteIndexedX, MemoryAddressConstant(th), _), _) :: xs =>
+      case (AssemblyLine(_, AbsoluteX | AbsoluteY | LongAbsoluteX |
+                            ZeroPageX | ZeroPageY |
+                            IndexedY | IndexedX | IndexedZ |
+                            LongIndexedY | LongIndexedZ |
+                            Indirect | LongIndirect |
+                            AbsoluteIndexedX, MemoryAddressConstant(th), _), _) :: xs =>
         // if a variable is used as an array or a pointer, then it cannot be assigned to a register
-        if (th.name == vx || th.name == vy) {
+        if (th.name == vx || th.name == vy || th.name == vz) {
           None
         } else {
-          canBeInlined(xCandidate, yCandidate, blastProcessing, xs)
+          canBeInlined(xCandidate, yCandidate, zCandidate, features, xs)
         }
 
+      case (AssemblyLine(STY | LDY, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs if th.name == vx =>
+        if (features.indexRegisterTransfers) canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 2)
+        else None
+
+      case (AssemblyLine(STX | LDX, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs if th.name == vy =>
+        if (features.indexRegisterTransfers) canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 2)
+        else None
+
       case (AssemblyLine(opcode, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == vx && (opcode == LDY || opcodesThatCannotBeUsedWithIndexRegistersAsParameters(opcode)) =>
         // if a variable is used by some opcodes, then it cannot be assigned to a register
@@ -247,15 +335,20 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
         // if a variable is used by some opcodes, then it cannot be assigned to a register
         None
 
+      case (AssemblyLine(opcode, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
+        if th.name == vz && (opcode == LDZ || opcodesThatCannotBeUsedWithIndexRegistersAsParameters(opcode)) =>
+        // if a variable is used by some opcodes, then it cannot be assigned to a register
+        None
+
       case (AssemblyLine(LDX, Absolute | ZeroPage, MemoryAddressConstant(th), elidable), imp) :: xs
         if xCandidate.isDefined =>
         // if a register is populated with a different variable, then this variable cannot be assigned to that register
         // removing LDX saves 3 cycles
         if (elidable && th.name == vx) {
           if (imp.z == Unimportant && imp.n == Unimportant) {
-            canBeInlined(xCandidate, yCandidate, blastProcessing, xs).map(_ + 3)
+            canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 3)
           } else {
-            canBeInlined(xCandidate, yCandidate, blastProcessing, xs).map(_ + 1)
+            canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 1)
           }
         } else {
           None
@@ -266,7 +359,7 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
         // LAX = LDX-LDA, and since LDX simplifies to nothing and LDA simplifies to TXA,
         // LAX simplifies to TXA, saving two bytes
         if (elidable && th.name == vx) {
-          canBeInlined(xCandidate, yCandidate, blastProcessing, xs).map(_ + 2)
+          canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 2)
         } else {
           None
         }
@@ -277,9 +370,20 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
         // sometimes that LDX has to be converted into CPX#0
         if (elidable && th.name == vy) {
           if (imp.z == Unimportant && imp.n == Unimportant) {
-            canBeInlined(xCandidate, yCandidate, blastProcessing, xs).map(_ + 3)
+            canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 3)
           } else {
-            canBeInlined(xCandidate, yCandidate, blastProcessing, xs).map(_ + 1)
+            canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 1)
+          }
+        } else {
+          None
+        }
+
+      case (AssemblyLine(LDZ, Absolute | ZeroPage, MemoryAddressConstant(th), elidable), imp) :: xs if zCandidate.isDefined =>
+        if (elidable && th.name == vz) {
+          if (imp.z == Unimportant && imp.n == Unimportant) {
+            canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 3)
+          } else {
+            canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 1)
           }
         } else {
           None
@@ -293,33 +397,46 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
         // if a register is populated with something else than a variable, then no variable cannot be assigned to that register
         None
 
+      case (AssemblyLine(LDZ, _, _, _), _) :: xs if zCandidate.isDefined =>
+        // if register Z is populated with something other than a variable, then no variable can be assigned to that register
+        None
+
       case (AssemblyLine(op, Absolute | ZeroPage, MemoryAddressConstant(th), elidable),_) :: xs
         if opcodesIdentityTable(op) =>
         if (th.name == vx || th.name == vy) {
-          if (elidable) canBeInlined(xCandidate, yCandidate, blastProcessing, xs)
+          if (elidable) canBeInlined(xCandidate, yCandidate, zCandidate, features, xs)
           else None
-        } else canBeInlined(xCandidate, yCandidate, blastProcessing, xs)
+        } else {
+          if (th.name == vz) None
+          else canBeInlined(xCandidate, yCandidate, zCandidate, features, xs)
+        }
 
       case (AssemblyLine(LDA, _, _, elidable),_) :: (AssemblyLine(op, Absolute | ZeroPage, MemoryAddressConstant(th), elidable2),_) :: xs
         if opcodesCommutative(op) =>
         if (th.name == vx || th.name == vy) {
-          if (elidable && elidable2) canBeInlined(xCandidate, yCandidate, blastProcessing, xs).map(_ + 2)
+          if (elidable && elidable2) canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 2)
           else None
-        } else canBeInlined(xCandidate, yCandidate, blastProcessing, xs)
+        } else {
+          if (th.name == vz) None
+          else canBeInlined(xCandidate, yCandidate, zCandidate, features, xs)
+        }
 
       case (AssemblyLine(LDA, _, _, elidable),_) :: (AssemblyLine(CLC, _, _, _),_) :: (AssemblyLine(op, Absolute | ZeroPage, MemoryAddressConstant(th), elidable2),_) :: xs
         if opcodesCommutative(op) =>
         if (th.name == vx || th.name == vy) {
-          if (elidable && elidable2) canBeInlined(xCandidate, yCandidate, blastProcessing, xs).map(_ + 2)
+          if (elidable && elidable2) canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 2)
           else None
-        } else canBeInlined(xCandidate, yCandidate, blastProcessing, xs)
+        } else {
+          if (th.name == vz) None
+          else canBeInlined(xCandidate, yCandidate, zCandidate, features, xs)
+        }
 
       case (AssemblyLine(LDA, Absolute | ZeroPage, MemoryAddressConstant(th), elidable), _) :: (AssemblyLine(TAX, _, _, elidable2), _) :: xs
         if xCandidate.isDefined =>
         // a variable cannot be inlined if there is TAX not after LDA of that variable
         // but LDA-TAX can be simplified to TXA
         if (elidable && elidable2 && th.name == vx) {
-          canBeInlined(xCandidate, yCandidate, blastProcessing, xs).map(_ + 3)
+          canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 3)
         } else {
           None
         }
@@ -329,21 +446,43 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
         // a variable cannot be inlined if there is TAY not after LDA of that variable
         // but LDA-TAY can be simplified to TYA
         if (elidable && elidable2 && th.name == vy) {
-          canBeInlined(xCandidate, yCandidate, blastProcessing, xs).map(_ + 3)
+          canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 3)
         } else {
           None
         }
 
-      case (AssemblyLine(LDA | STA | INC | DEC | STZ, Absolute | ZeroPage, MemoryAddressConstant(th), elidable), _) :: xs =>
-        // changing LDA->TXA, STA->TAX, INC->INX, DEC->DEX, STZ->LDA saves 2 bytes
-        if (th.name == vy || th.name == vx) {
+      case (AssemblyLine(LDA, Absolute | ZeroPage, MemoryAddressConstant(th), elidable), _) :: (AssemblyLine(TAZ, _, _, elidable2), _) :: xs
+        if zCandidate.isDefined =>
+        // a variable cannot be inlined to Z if there is a TAZ that does not follow an LDA of that variable,
+        // but LDA-TAZ of the Z candidate itself can be simplified to TZA (so compare against vz, not vy)
+        if (elidable && elidable2 && th.name == vz) {
+          canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 3)
+        } else {
+          None
+        }
+
+      case (AssemblyLine(LDA | STA | INC | DEC, Absolute | ZeroPage, MemoryAddressConstant(th), elidable), _) :: xs =>
+        // changing LDA->TXA, STA->TAX, INC->INX, DEC->DEX saves 2 bytes
+        if (th.name == vy || th.name == vx || th.name == vz) {
           if (elidable) {
-            canBeInlined(xCandidate, yCandidate, blastProcessing, xs).map(_ + 2)
+            canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 2)
           } else {
             None
           }
         } else {
-          canBeInlined(xCandidate, yCandidate, blastProcessing, xs)
+          canBeInlined(xCandidate, yCandidate, zCandidate, features, xs)
+        }
+
+      case (AssemblyLine(STZ, Absolute | ZeroPage, MemoryAddressConstant(th), elidable), _) :: xs =>
+        // changing STZ->LDX saves 2 bytes
+        if (th.name == vy || th.name == vx) {
+          if (elidable && features.izIsAlwaysZero) {
+            canBeInlined(xCandidate, yCandidate, zCandidate, features, xs).map(_ + 2)
+          } else {
+            None
+          }
+        } else {
+          canBeInlined(xCandidate, yCandidate, zCandidate, features, xs)
         }
 
       case (AssemblyLine(TAX, _, _, _), _) :: xs if xCandidate.isDefined =>
@@ -354,17 +493,23 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
         // a variable cannot be inlined if there is TAY not after LDA of that variable
         None
 
+      case (AssemblyLine(TAZ, _, _, _), _) :: xs if zCandidate.isDefined =>
+        // a variable cannot be inlined if there is TAZ not after LDA of that variable
+        None
+
       case (AssemblyLine(LABEL, _, _, _), _) :: xs =>
         // labels always end the initial section
-        canBeInlined(xCandidate, yCandidate, blastProcessing, xs)
+        canBeInlined(xCandidate, yCandidate, zCandidate, features, xs)
 
       case (x, _) :: xs =>
         if (xCandidate.isDefined && opcodesThatAlwaysPrecludeXAllocation(x.opcode)) {
           None
         } else if (yCandidate.isDefined && opcodesThatAlwaysPrecludeYAllocation(x.opcode)) {
           None
+        } else if (zCandidate.isDefined && opcodesThatAlwaysPrecludeZAllocation(x.opcode)) {
+          None
         } else {
-          canBeInlined(xCandidate, yCandidate, blastProcessing, xs)
+          canBeInlined(xCandidate, yCandidate, zCandidate, features, xs)
         }
 
       case Nil => Some(0)
@@ -397,7 +542,7 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
         // if an address of a variable is used, then that variable cannot be assigned to a register
         None
 
-      case (AssemblyLine(_, AbsoluteX | AbsoluteY | ZeroPageX | ZeroPageY | IndexedY | IndexedX | ZeroPageIndirect | Indirect | AbsoluteIndexedX, MemoryAddressConstant(th), _),_) :: xs
+      case (AssemblyLine(_, AbsoluteX | AbsoluteY | LongAbsoluteX | ZeroPageX | ZeroPageY | IndexedY | IndexedX | IndexedZ | LongIndexedY | LongIndexedZ | Indirect | LongIndirect | AbsoluteIndexedX, MemoryAddressConstant(th), _),_) :: xs
         if th.name == candidate =>
         // if a variable is used as an array or a pointer, then it cannot be assigned to a register
         None
@@ -508,178 +653,237 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
     }
   }
 
-  def inlineVars(xCandidate: Option[String], yCandidate: Option[String], aCandidate: Option[String], identityArray: Constant, lines: List[(AssemblyLine, CpuImportance)]): List[AssemblyLine] = {
+  def inlineVars(xCandidate: Option[String], yCandidate: Option[String], zCandidate: Option[String], aCandidate: Option[String], features: Features, lines: List[(AssemblyLine, CpuImportance)]): List[AssemblyLine] = {
     val vx = xCandidate.getOrElse("-")
     val vy = yCandidate.getOrElse("-")
+    val vz = zCandidate.getOrElse("-")
     val va = aCandidate.getOrElse("-")
     lines match {
       case (AssemblyLine(INC, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == vx =>
-        AssemblyLine.implied(INX) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(INX) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(INC, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == vy =>
-        AssemblyLine.implied(INY) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(INY) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+
+      case (AssemblyLine(INC, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
+        if th.name == vz =>
+        AssemblyLine.implied(INZ) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(DEC, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == vx =>
-        AssemblyLine.implied(DEX) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(DEX) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(DEC, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == vy =>
-        AssemblyLine.implied(DEY) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(DEY) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+
+      case (AssemblyLine(DEC, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
+        if th.name == vz =>
+        AssemblyLine.implied(DEZ) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(opcode@(DEC | INC | ROL | ROR | ASL | LSR), Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == va =>
-        AssemblyLine.implied(opcode) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(opcode) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDX, Absolute | ZeroPage, MemoryAddressConstant(th), _), imp) :: xs
         if th.name == vx =>
         if (imp.z == Unimportant && imp.n == Unimportant) {
-          inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+          inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
         } else {
-          AssemblyLine.immediate(CPX, 0) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+          AssemblyLine.immediate(CPX, 0) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
         }
 
       case (AssemblyLine(LAX, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == vx =>
-        AssemblyLine.implied(TXA) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TXA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (l@AssemblyLine(op, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if opcodesIdentityTable(op) && th.name == vx =>
-        l.copy(addrMode = AbsoluteX, parameter = identityArray) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        l.copy(addrMode = AbsoluteX, parameter = features.identityArray) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (l@AssemblyLine(op, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if opcodesIdentityTable(op) && th.name == vy =>
-        l.copy(addrMode = AbsoluteY, parameter = identityArray) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        l.copy(addrMode = AbsoluteY, parameter = features.identityArray) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (l@AssemblyLine(LDA, _, _, _), _) ::  (AssemblyLine(op, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if opcodesCommutative(op) && th.name == va =>
-        l.copy(opcode = op) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        l.copy(opcode = op) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (l@AssemblyLine(LDA, _, _, _), _) :: (clc@AssemblyLine(CLC, _, _, _), _) :: (AssemblyLine(op, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if opcodesCommutative(op) && th.name == va =>
-        l.copy(opcode = op) :: clc :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        l.copy(opcode = op) :: clc :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (l@AssemblyLine(LDA, _, _, _), _) ::  (AssemblyLine(op, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if opcodesCommutative(op) && th.name == vx =>
-        AssemblyLine.implied(TXA) :: l.copy(opcode = op) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TXA) :: l.copy(opcode = op) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (l@AssemblyLine(LDA, _, _, _), _) :: (clc@AssemblyLine(CLC, _, _, _), _) :: (AssemblyLine(op, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if opcodesCommutative(op) && th.name == vx =>
-        AssemblyLine.implied(TXA) :: l.copy(opcode = op) :: clc :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TXA) :: l.copy(opcode = op) :: clc :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (l@AssemblyLine(LDA, _, _, _), _) ::  (AssemblyLine(op, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if opcodesCommutative(op) && th.name == vy =>
-        AssemblyLine.implied(TYA) :: l.copy(opcode = op) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TYA) :: l.copy(opcode = op) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (l@AssemblyLine(LDA, _, _, _), _) :: (clc@AssemblyLine(CLC, _, _, _), _) :: (AssemblyLine(op, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if opcodesCommutative(op) && th.name == vy =>
-        AssemblyLine.implied(TYA) :: l.copy(opcode = op) :: clc :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TYA) :: l.copy(opcode = op) :: clc :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDA | STA, Absolute | ZeroPage, MemoryAddressConstant(th), _), imp) :: xs
         if th.name == va =>
         if (imp.z == Unimportant && imp.n == Unimportant) {
-          inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+          inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
         } else {
-          AssemblyLine.immediate(CMP, 0) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+          AssemblyLine.immediate(CMP, 0) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
         }
 
       case (AssemblyLine(LAX, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == va =>
-        AssemblyLine.implied(TAX) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TAX) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDY, Absolute | ZeroPage, MemoryAddressConstant(th), _), imp) :: xs
         if th.name == vy =>
         if (imp.z == Unimportant && imp.n == Unimportant) {
-          inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+          inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
         } else {
-          AssemblyLine.immediate(CPY, 0) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+          AssemblyLine.immediate(CPY, 0) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+        }
+
+      case (AssemblyLine(LDZ, Absolute | ZeroPage, MemoryAddressConstant(th), _), imp) :: xs
+        if th.name == vz =>
+        if (imp.z == Unimportant && imp.n == Unimportant) {
+          inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+        } else {
+          AssemblyLine.immediate(CPZ, 0) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
         }
 
       case (AssemblyLine(LDA, Absolute | ZeroPage, MemoryAddressConstant(th), true), _) :: (AssemblyLine(TAX, _, _, true), _) :: xs
         if th.name == vx =>
         // these TXA's may get optimized away by a different optimization
-        AssemblyLine.implied(TXA) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TXA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDA, Absolute | ZeroPage, MemoryAddressConstant(th), true), _) :: (AssemblyLine(TAY, _, _, true), _) :: xs
         if th.name == vy =>
         // these TYA's may get optimized away by a different optimization
-        AssemblyLine.implied(TYA) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TYA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+
+      case (AssemblyLine(LDA, Absolute | ZeroPage, MemoryAddressConstant(th), true), _) :: (AssemblyLine(TAZ, _, _, true), _) :: xs
+        if th.name == vz =>
+        // these TZA's may get optimized away by a different optimization
+        AssemblyLine.implied(TZA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDX, Absolute | ZeroPage, MemoryAddressConstant(th), true), _) :: (AssemblyLine(TXA, _, _, true), _) :: xs
         if th.name == va =>
         // these TAX's may get optimized away by a different optimization
-        AssemblyLine.implied(TAX) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TAX) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDY, Absolute | ZeroPage, MemoryAddressConstant(th), true), _) :: (AssemblyLine(TYA, _, _, true), _) :: xs
         if th.name == va =>
         // these TAY's may get optimized away by a different optimization
-        AssemblyLine.implied(TAY) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TAY) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDA, am, param, true), _) :: (AssemblyLine(STA, Absolute | ZeroPage, MemoryAddressConstant(th), true), _) :: xs
         if th.name == vx && LdxAddrModes(am) =>
         // these TXA's may get optimized away by a different optimization
-        AssemblyLine(LDX, am, param) :: AssemblyLine.implied(TXA) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine(LDX, am, param) :: AssemblyLine.implied(TXA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDA, am, param, true), _) :: (AssemblyLine(STA, Absolute | ZeroPage, MemoryAddressConstant(th), true), _) :: xs
         if th.name == vy && LdyAddrModes(am) =>
         // these TYA's may get optimized away by a different optimization
-        AssemblyLine(LDY, am, param) :: AssemblyLine.implied(TYA) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine(LDY, am, param) :: AssemblyLine.implied(TYA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+
+      case (AssemblyLine(LDA, am, param, true), _) :: (AssemblyLine(STA, Absolute | ZeroPage, MemoryAddressConstant(th), true), _) :: xs
+        if th.name == vz && LdzAddrModes(am) =>
+        // these TZA's may get optimized away by a different optimization
+        AssemblyLine(LDZ, am, param) :: AssemblyLine.implied(TZA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDA, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: (AssemblyLine(CMP, am, param, true), _) :: xs
-        if th.name == vx && doesntUseXOrY(am) =>
+        if th.name == vx && CpxyzAddrModes(am) =>
         // ditto
-        AssemblyLine.implied(TXA) :: AssemblyLine(CPX, am, param) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TXA) :: AssemblyLine(CPX, am, param) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDA, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: (AssemblyLine(CMP, am, param, true), _) :: xs
-        if th.name == vy && doesntUseXOrY(am) =>
+        if th.name == vy && CpxyzAddrModes(am) =>
         // ditto
-        AssemblyLine.implied(TYA) :: AssemblyLine(CPY, am, param) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TYA) :: AssemblyLine(CPY, am, param) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+
+      case (AssemblyLine(LDA, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: (AssemblyLine(CMP, am, param, true), _) :: xs
+        if th.name == vz && CpxyzAddrModes(am) =>
+        // Z-register counterpart of the X and Y cases above; the guard has to test vz, otherwise the Y case shadows this one
+        AssemblyLine.implied(TZA) :: AssemblyLine(CPZ, am, param) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDA, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == vx =>
-        AssemblyLine.implied(TXA) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TXA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDA, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == vy =>
-        AssemblyLine.implied(TYA) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TYA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+
+      case (AssemblyLine(LDY, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
+        if th.name == vx =>
+        AssemblyLine.implied(TXY) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+
+      case (AssemblyLine(LDX, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
+        if th.name == vy =>
+        AssemblyLine.implied(TYX) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+
+      case (AssemblyLine(LDA, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
+        if th.name == vz =>
+        AssemblyLine.implied(TZA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDX, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == va =>
-        AssemblyLine.implied(TAX) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TAX) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(LDY, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == va =>
-        AssemblyLine.implied(TAY) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TAY) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(STA, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == vx =>
-        AssemblyLine.implied(TAX) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TAX) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(STA, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == vy =>
-        AssemblyLine.implied(TAY) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TAY) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+
+      case (AssemblyLine(STA, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
+        if th.name == vz =>
+        AssemblyLine.implied(TAZ) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(STX, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == va =>
-        AssemblyLine.implied(TXA) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TXA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(STY, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == va =>
-        AssemblyLine.implied(TYA) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.implied(TYA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+
+      case (AssemblyLine(STX, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
+        if th.name == vy =>
+        AssemblyLine.implied(TXY) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+
+      case (AssemblyLine(STY, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
+        if th.name == vx =>
+        AssemblyLine.implied(TYX) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(STZ, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == vx =>
-        AssemblyLine.immediate(LDX, 0) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        if (features.izIsAlwaysZero) AssemblyLine.immediate(LDX, 0) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+        else  ErrorReporting.fatal("Unexpected STZ")
 
       case (AssemblyLine(STZ, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == vy =>
-        AssemblyLine.immediate(LDY, 0) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        if (features.izIsAlwaysZero) AssemblyLine.immediate(LDY, 0) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+        else  ErrorReporting.fatal("Unexpected STZ")
 
       case (AssemblyLine(STZ, Absolute | ZeroPage, MemoryAddressConstant(th), _), _) :: xs
         if th.name == va =>
-        AssemblyLine.immediate(LDA, 0) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        if (features.izIsAlwaysZero) AssemblyLine.immediate(LDA, 0) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+        else AssemblyLine.implied(TZA) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(TAX, _, _, _), _) :: xs if xCandidate.isDefined =>
         ErrorReporting.fatal("Unexpected TAX")
@@ -687,20 +891,26 @@ object VariableToRegisterOptimization extends AssemblyOptimization {
       case (AssemblyLine(TAY, _, _, _), _) :: xs if yCandidate.isDefined =>
         ErrorReporting.fatal("Unexpected TAY")
 
+      case (AssemblyLine(TAZ, _, _, _), _) :: xs if zCandidate.isDefined =>
+        ErrorReporting.fatal("Unexpected TAZ")
+
       case (AssemblyLine(TXA, _, _, _), _) :: xs if aCandidate.isDefined =>
-        AssemblyLine.immediate(CPX, 0) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.immediate(CPX, 0) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case (AssemblyLine(TYA, _, _, _), _) :: xs if aCandidate.isDefined =>
-        AssemblyLine.immediate(CPY, 0) :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+        AssemblyLine.immediate(CPY, 0) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
-      case (x, _) :: xs => x :: inlineVars(xCandidate, yCandidate, aCandidate, identityArray, xs)
+      case (AssemblyLine(TZA, _, _, _), _) :: xs if aCandidate.isDefined =>
+        AssemblyLine.immediate(CPZ, 0) :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
+
+      case (x, _) :: xs => x :: inlineVars(xCandidate, yCandidate, zCandidate, aCandidate, features, xs)
 
       case Nil => Nil
     }
   }
 
   def doesntUseXOrY(am: AddrMode.Value): Boolean = am match {
-    case Immediate | ZeroPage | Absolute | Relative | Indirect | ZeroPageIndirect => true
+    case Immediate | WordImmediate | ZeroPage | Absolute | LongAbsolute | Relative | LongRelative | Indirect | LongIndirect | Stack | IndexedZ => true
     case _ => false
   }
 }
diff --git a/src/main/scala/millfork/compiler/BuiltIns.scala b/src/main/scala/millfork/compiler/BuiltIns.scala
index 8f6a7c58..25667dbe 100644
--- a/src/main/scala/millfork/compiler/BuiltIns.scala
+++ b/src/main/scala/millfork/compiler/BuiltIns.scala
@@ -258,6 +258,14 @@ object BuiltIns {
       case Some(NumericConstant(0, _)) =>
         Nil
       case Some(NumericConstant(shift, _)) if shift > 0 =>
+        if (ctx.options.flags(CompilationFlag.EmitNative65816Opcodes)) {
+          targetBytes match {
+            case List(List(AssemblyLine(STA, a1, l, _)), List(AssemblyLine(STA, a2, h, _))) if a1 == a2 && l.+(1).quickSimplify == h =>
+              // exactly two adjacent bytes written with the same addressing mode: shift them as a single 16-bit word
+              return List(AssemblyLine.accu16) ++ List.fill(shift.toInt)(AssemblyLine(if (aslRatherThanLsr) ASL_W else LSR_W, a1, l)) ++ List(AssemblyLine.accu8)
+            case _ => // any other target shape (more bytes, multi-instruction stores) uses the byte-wise code below instead of throwing MatchError
+          }
+        }
         List.fill(shift.toInt)(if (aslRatherThanLsr) {
           staTo(ASL, lo) ++ targetBytes.tail.flatMap { b => staTo(ROL, b) }
         } else {
@@ -643,15 +651,79 @@ object BuiltIns {
       case Some(NumericConstant(0, _)) =>
         return Nil
       case Some(NumericConstant(1, _)) if canUseIncDec && !subtract =>
+        if (ctx.options.flags(CompilationFlag.Emit65CE02Opcodes)) {
+          targetBytes match {
+            case List(List(AssemblyLine(STA, ZeroPage, l, _)), List(AssemblyLine(STA, ZeroPage, h, _))) if l.+(1).quickSimplify == h =>
+              // two adjacent zero-page bytes: a single word increment replaces the INC/BNE/INC sequence
+              return List(AssemblyLine.zeropage(INC_W, l))
+            case _ => // other target shapes must not throw MatchError; fall through to the next strategy
+          }
+        }
+        if (ctx.options.flags(CompilationFlag.EmitNative65816Opcodes)) {
+          targetBytes match {
+            case List(List(AssemblyLine(STA, a1@(ZeroPage | Absolute | ZeroPageX | AbsoluteX), l, _)), List(AssemblyLine(STA, a2, h, _))) if a1 == a2 && l.+(1).quickSimplify == h =>
+              // two adjacent bytes with the same addressing mode: increment as one word with a 16-bit accumulator
+              return List(AssemblyLine.accu16, AssemblyLine(INC_W, a1, l), AssemblyLine.accu8)
+            case _ => // other target shapes must not throw MatchError; fall through to the generic byte-wise code
+          }
+        }
         val label = MfCompiler.nextLabel("in")
         return staTo(INC, targetBytes.head) ++ targetBytes.tail.flatMap(l => AssemblyLine.relative(BNE, label)::staTo(INC, l)) :+ AssemblyLine.label(label)
       case Some(NumericConstant(-1, _)) if canUseIncDec && subtract =>
+        if (ctx.options.flags(CompilationFlag.Emit65CE02Opcodes)) {
+          targetBytes match {
+            case List(List(AssemblyLine(STA, ZeroPage, l, _)), List(AssemblyLine(STA, ZeroPage, h, _))) if l.+(1).quickSimplify == h =>
+              // subtracting -1 is an increment; two adjacent zero-page bytes take a single word increment
+              return List(AssemblyLine.zeropage(INC_W, l))
+            case _ => // other target shapes must not throw MatchError; fall through to the next strategy
+          }
+        }
+        if (ctx.options.flags(CompilationFlag.EmitNative65816Opcodes)) {
+          targetBytes match {
+            case List(List(AssemblyLine(STA, a1@(ZeroPage | Absolute | ZeroPageX | AbsoluteX), l, _)), List(AssemblyLine(STA, a2, h, _))) if a1 == a2 && l.+(1).quickSimplify == h =>
+              // two adjacent bytes with the same addressing mode: increment as one word with a 16-bit accumulator
+              return List(AssemblyLine.accu16, AssemblyLine(INC_W, a1, l), AssemblyLine.accu8)
+            case _ => // other target shapes must not throw MatchError; fall through to the generic byte-wise code
+          }
+        }
         val label = MfCompiler.nextLabel("in")
         return staTo(INC, targetBytes.head) ++ targetBytes.tail.flatMap(l => AssemblyLine.relative(BNE, label)::staTo(INC, l)) :+ AssemblyLine.label(label)
       case Some(NumericConstant(1, _)) if canUseIncDec && subtract =>
+        if (ctx.options.flags(CompilationFlag.Emit65CE02Opcodes)) {
+          targetBytes match {
+            case List(List(AssemblyLine(STA, ZeroPage, l, _)), List(AssemblyLine(STA, ZeroPage, h, _))) if l.+(1).quickSimplify == h =>
+              // two adjacent zero-page bytes: a single word decrement replaces the byte-wise decrement
+              return List(AssemblyLine.zeropage(DEC_W, l))
+            case _ => // other target shapes must not throw MatchError; fall through to the next strategy
+          }
+        }
+        if (ctx.options.flags(CompilationFlag.EmitNative65816Opcodes)) {
+          targetBytes match {
+            case List(List(AssemblyLine(STA, a1@(ZeroPage | Absolute | ZeroPageX | AbsoluteX), l, _)), List(AssemblyLine(STA, a2, h, _))) if a1 == a2 && l.+(1).quickSimplify == h =>
+              // two adjacent bytes with the same addressing mode: decrement as one word with a 16-bit accumulator
+              return List(AssemblyLine.accu16, AssemblyLine(DEC_W, a1, l), AssemblyLine.accu8)
+            case _ => // other target shapes must not throw MatchError; fall through to doDec below
+          }
+        }
         val label = MfCompiler.nextLabel("de")
         return doDec(targetBytes)
       case Some(NumericConstant(-1, _)) if canUseIncDec && !subtract =>
+        if (ctx.options.flags(CompilationFlag.Emit65CE02Opcodes)) {
+          targetBytes match {
+            case List(List(AssemblyLine(STA, ZeroPage, l, _)), List(AssemblyLine(STA, ZeroPage, h, _))) if l.+(1).quickSimplify == h =>
+              // adding -1 is a decrement; two adjacent zero-page bytes take a single word decrement
+              return List(AssemblyLine.zeropage(DEC_W, l))
+            case _ => // other target shapes must not throw MatchError; fall through to the next strategy
+          }
+        }
+        if (ctx.options.flags(CompilationFlag.EmitNative65816Opcodes)) {
+          targetBytes match {
+            case List(List(AssemblyLine(STA, a1@(ZeroPage | Absolute | ZeroPageX | AbsoluteX), l, _)), List(AssemblyLine(STA, a2, h, _))) if a1 == a2 && l.+(1).quickSimplify == h =>
+              // two adjacent bytes with the same addressing mode: decrement as one word with a 16-bit accumulator
+              return List(AssemblyLine.accu16, AssemblyLine(DEC_W, a1, l), AssemblyLine.accu8)
+            case _ => // other target shapes must not throw MatchError; fall through to doDec below
+          }
+        }
         val label = MfCompiler.nextLabel("de")
         return doDec(targetBytes)
       case Some(constant) =>
@@ -719,6 +791,32 @@ object BuiltIns {
         }
     }
     val addendByteRead = addendByteRead0 ++ List.fill((targetSize - addendByteRead0.size) max 0)(List(AssemblyLine.immediate(LDA, 0)))
+
+    if (ctx.options.flags(CompilationFlag.EmitNative65816Opcodes)) { // 65816 fast path: try to perform the whole operation as one 16-bit add/subtract
+      (removeTsx(targetBytes), removeTsx(addendByteRead)) match {
+        case (List(List(AssemblyLine(STA, ta1, tl, _)), List(AssemblyLine(STA, ta2, th, _))), List(List(AssemblyLine(LDA, Immediate, al, _)), List(AssemblyLine(LDA, Immediate, ah, _)))) => // two-byte target, both addend bytes are immediates
+          if (ta1 == ta2 && tl.+(1).quickSimplify == th) { // target bytes must be adjacent and share one addressing mode
+            return wrapInSedCldIfNeeded(decimal, List(
+              AssemblyLine.implied(if(subtract) SEC else CLC),
+              AssemblyLine.accu16,
+              AssemblyLine(LDA_W, ta1, tl),
+              AssemblyLine(if(subtract) SBC_W else ADC_W, WordImmediate, ah.asl(8).+(al).quickSimplify), // the two addend bytes merged into one word constant: ah shifted left by 8, plus al
+              AssemblyLine(STA_W, ta1, tl),
+              AssemblyLine.accu8))
+          }
+        case (List(List(AssemblyLine(STA, ta1, tl, _)), List(AssemblyLine(STA, ta2, th, _))), List(List(AssemblyLine(LDA, aa1, al, _)), List(AssemblyLine(LDA, aa2, ah, _)))) => // two-byte target, addend read with two single loads
+          if (ta1 == ta2 && aa1 == aa2 && tl.+(1).quickSimplify == th && al.+(1).quickSimplify == ah) { // both target and addend must be adjacent byte pairs with matching addressing modes
+            return wrapInSedCldIfNeeded(decimal, List(
+              AssemblyLine.accu16,
+              AssemblyLine.implied(if(subtract) SEC else CLC),
+              AssemblyLine(LDA_W, ta1, tl),
+              AssemblyLine(if(subtract) SBC_W else ADC_W, aa1, al),
+              AssemblyLine(STA_W, ta1, tl),
+              AssemblyLine.accu8))
+          }
+        case _ => // no 16-bit shortcut applies; continue with the generic code below
+      }
+    }
     val buffer = mutable.ListBuffer[AssemblyLine]()
     buffer ++= calculateRhs
     buffer += AssemblyLine.implied(if (subtract) SEC else CLC)
@@ -819,6 +917,29 @@ object BuiltIns {
           })
         }
     }
+    if (ctx.options.flags(CompilationFlag.EmitNative65816Opcodes)) { // 65816 fast path: try to apply `operation` to the target as one 16-bit word
+      (removeTsx(targetBytes), removeTsx(addendByteRead)) match {
+        case (List(List(AssemblyLine(STA, ta1, tl, _)), List(AssemblyLine(STA, ta2, th, _))), List(List(AssemblyLine(LDA, Immediate, al, _)), List(AssemblyLine(LDA, Immediate, ah, _)))) => // two-byte target, both operand bytes are immediates
+          if (ta1 == ta2 && tl.+(1).quickSimplify == th) { // target bytes must be adjacent and share one addressing mode
+            return List(
+              AssemblyLine.accu16,
+              AssemblyLine(LDA_W, ta1, tl),
+              AssemblyLine(Opcode.widen(operation).get, WordImmediate, ah.asl(8).+(al).quickSimplify), // NOTE(review): .get assumes Opcode.widen returns a wide variant for every `operation` reaching here — confirm
+              AssemblyLine(STA_W, ta1, tl),
+              AssemblyLine.accu8)
+          }
+        case (List(List(AssemblyLine(STA, ta1, tl, _)), List(AssemblyLine(STA, ta2, th, _))), List(List(AssemblyLine(LDA, aa1, al, _)), List(AssemblyLine(LDA, aa2, ah, _)))) => // two-byte target, operand read with two single loads
+          if (ta1 == ta2 && aa1 == aa2 && tl.+(1).quickSimplify == th && al.+(1).quickSimplify == ah) { // both target and operand must be adjacent byte pairs with matching addressing modes
+            return List(
+              AssemblyLine.accu16,
+              AssemblyLine(LDA_W, ta1, tl),
+              AssemblyLine(Opcode.widen(operation).get, aa1, al), // NOTE(review): same .get assumption as in the immediate case — confirm
+              AssemblyLine(STA_W, ta1, tl),
+              AssemblyLine.accu8)
+          }
+        case _ => // no 16-bit shortcut applies; continue with the generic code below
+      }
+    }
     val AllOnes = (1L << (8 * targetSize)) - 1
     (operation, env.eval(param)) match {
       case (EOR, Some(NumericConstant(0, _)))
@@ -880,4 +1001,10 @@ object BuiltIns {
         ???
     }
   }
+
+  private def removeTsx(codes: List[List[AssemblyLine]]): List[List[AssemblyLine]] = codes.map { // rewrites each per-byte snippet independently
+    case List(AssemblyLine(TSX, _, _, _), AssemblyLine(op, AbsoluteX, NumericConstant(nn, _), _)) if nn >= 0x100 && nn <= 0x1ff => // exactly TSX followed by an X-indexed access to a constant address in 0x100..0x1ff
+      List(AssemblyLine(op, Stack, NumericConstant(nn & 0xff, 1))) // replaced by one instruction in the Stack addressing mode, keeping only the low byte of the address as the offset
+    case x => x // every other snippet is returned unchanged
+  }
 }
diff --git a/src/main/scala/millfork/compiler/ExpressionCompiler.scala b/src/main/scala/millfork/compiler/ExpressionCompiler.scala
index 9cd6444c..b3a27a82 100644
--- a/src/main/scala/millfork/compiler/ExpressionCompiler.scala
+++ b/src/main/scala/millfork/compiler/ExpressionCompiler.scala
@@ -87,6 +87,11 @@ object ExpressionCompiler {
   def compileConstant(ctx: CompilationContext, expr: Constant, target: Variable): List[AssemblyLine] = {
     target match {
       case RegisterVariable(Register.A, _) => List(AssemblyLine(LDA, Immediate, expr))
+      case RegisterVariable(Register.AW, _) =>
+        List(
+          AssemblyLine.accu16,
+          AssemblyLine(LDA_W, WordImmediate, expr),
+          AssemblyLine.accu8)
       case RegisterVariable(Register.X, _) => List(AssemblyLine(LDX, Immediate, expr))
       case RegisterVariable(Register.Y, _) => List(AssemblyLine(LDY, Immediate, expr))
       case RegisterVariable(Register.AX, _) => List(
@@ -221,7 +226,12 @@ object ExpressionCompiler {
           case 1 =>
             v match {
               case mv: VariableInMemory => AssemblyLine.variable(ctx, store, mv)
-              case sv@StackVariable(_, _, offset) => AssemblyLine.implied(transferToA) :: AssemblyLine.implied(TSX) :: AssemblyLine.absoluteX(STA, offset + ctx.extraStackOffset) :: Nil
+              case sv@StackVariable(_, _, offset) =>
+                if (ctx.options.flags(CompilationFlag.EmitEmulation65816Opcodes)) {
+                  AssemblyLine.implied(transferToA) :: AssemblyLine.stackRelative(STA, offset + ctx.extraStackOffset) :: Nil
+                } else {
+                  AssemblyLine.implied(transferToA) :: AssemblyLine.implied(TSX) :: AssemblyLine.absoluteX(STA, offset + ctx.extraStackOffset) :: Nil
+                }
             }
           case s if s > 1 =>
             v match {
@@ -373,6 +383,18 @@ object ExpressionCompiler {
               case source: VariableInMemory =>
                 target match {
                   case RegisterVariable(Register.A, _) => AssemblyLine.variable(ctx, LDA, source)
+                  case RegisterVariable(Register.AW, _) =>
+                    exprType.size match {
+                      case 1 => if (exprType.isSigned) {
+                        AssemblyLine.variable(ctx, LDA, source) ++ List(
+                          AssemblyLine.implied(PHA)) ++ signExtendA() ++ List(
+                          AssemblyLine.implied(XBA),
+                          AssemblyLine.implied(PLA))
+                      } else List(AssemblyLine.immediate(LDA, 0), AssemblyLine.implied(XBA)) ++ AssemblyLine.variable(ctx, LDA, source) :+ AssemblyLine.immediate(LDX, 0) // LDA #0 : XBA clears the high byte (B) before the low byte is loaded
+                      case 2 =>
+                        // TODO: use LDA_W
+                        AssemblyLine.variable(ctx, LDA, source, 1) ++ List(AssemblyLine.implied(XBA)) ++ AssemblyLine.variable(ctx, LDA, source)
+                    }
                   case RegisterVariable(Register.X, _) => AssemblyLine.variable(ctx, LDX, source)
                   case RegisterVariable(Register.Y, _) => AssemblyLine.variable(ctx, LDY, source)
                   case RegisterVariable(Register.AX, _) =>
@@ -1021,13 +1043,13 @@ object ExpressionCompiler {
                       // TODO: fix
                       case _ => Nil
                     }
-                    secondViaMemory ++ thirdViaRegisters :+ AssemblyLine.absolute(JSR, function)
+                    secondViaMemory ++ thirdViaRegisters :+ AssemblyLine.absoluteOrLongAbsolute(JSR, function, ctx.options)
                   case NormalParamSignature(paramVars) =>
                     params.zip(paramVars).flatMap {
                       case (paramExpr, paramVar) =>
                         val callCtx = callingContext(ctx, paramVar)
                         compileAssignment(callCtx, paramExpr, VariableExpression(paramVar.name))
-                    } ++ List(AssemblyLine.absolute(JSR, function))
+                    } ++ List(AssemblyLine.absoluteOrLongAbsolute(JSR, function, ctx.options))
                 }
                 result
             }
@@ -1076,6 +1098,7 @@ object ExpressionCompiler {
     exprTypeAndVariable.fold(noop) {
       case (VoidType, _) => ???
       case (_, RegisterVariable(Register.A, _)) => noop
+      case (_, RegisterVariable(Register.AW, _)) => List(AssemblyLine.implied(XBA), AssemblyLine.implied(TAX), AssemblyLine.implied(XBA))
       case (_, RegisterVariable(Register.X, _)) => List(AssemblyLine.implied(TAX))
       case (_, RegisterVariable(Register.Y, _)) => List(AssemblyLine.implied(TAY))
       case (_, RegisterVariable(Register.AX, _)) =>
@@ -1083,7 +1106,9 @@ object ExpressionCompiler {
         noop
       case (_, RegisterVariable(Register.XA, _)) =>
         // TODO: sign extension
-        if (ctx.options.flag(CompilationFlag.EmitCmosOpcodes)) {
+        if (ctx.options.flag(CompilationFlag.EmitHudsonOpcodes)) {
+          List(AssemblyLine.implied(HuSAX))
+        } else if (ctx.options.flag(CompilationFlag.EmitCmosOpcodes)) {
           List(
             AssemblyLine.implied(PHA),
             AssemblyLine.implied(PHX),
@@ -1105,11 +1130,17 @@ object ExpressionCompiler {
           AssemblyLine.implied(TXA))
       case (_, RegisterVariable(Register.AY, _)) =>
         // TODO: sign extension
-        List(
-          AssemblyLine.implied(PHA),
-          AssemblyLine.implied(TXA),
-          AssemblyLine.implied(TAY),
-          AssemblyLine.implied(PLA))
+        if (ctx.options.flag(CompilationFlag.EmitHudsonOpcodes)) {
+          List(AssemblyLine.implied(SXY))
+        } else if (ctx.options.flag(CompilationFlag.EmitEmulation65816Opcodes)) {
+          List(AssemblyLine.implied(TXY))
+        } else {
+          List(
+            AssemblyLine.implied(PHA),
+            AssemblyLine.implied(TXA),
+            AssemblyLine.implied(TAY),
+            AssemblyLine.implied(PLA))
+        }
       case (t, v: VariableInMemory) => t.size match {
         case 1 => v.typ.size match {
           case 1 =>
@@ -1230,7 +1261,7 @@ object ExpressionCompiler {
           if (i < arrayLength) return Nil
           if (i >= arrayLength) return List(
             AssemblyLine.implied(PHP),
-            AssemblyLine.absolute(JSR, ctx.env.get[ThingInMemory]("_panic")))
+            AssemblyLine.absoluteOrLongAbsolute(JSR, ctx.env.get[ThingInMemory]("_panic"), ctx.options))
         }
       case _ =>
     }
@@ -1245,7 +1276,7 @@ object ExpressionCompiler {
         AssemblyLine.implied(PHP),
         AssemblyLine.immediate(compare, arrayLength),
         AssemblyLine.relative(BCC, label),
-        AssemblyLine.absolute(JSR, ctx.env.get[ThingInMemory]("_panic")),
+        AssemblyLine.absoluteOrLongAbsolute(JSR, ctx.env.get[ThingInMemory]("_panic"), ctx.options),
         AssemblyLine.label(label),
         AssemblyLine.implied(PLP))
     } else {
diff --git a/src/main/scala/millfork/compiler/StatementCompiler.scala b/src/main/scala/millfork/compiler/StatementCompiler.scala
index f1ed947e..22c637ca 100644
--- a/src/main/scala/millfork/compiler/StatementCompiler.scala
+++ b/src/main/scala/millfork/compiler/StatementCompiler.scala
@@ -69,7 +69,7 @@ object StatementCompiler {
         val actualAddrMode = a match {
           case Absolute if OpcodeClasses.ShortBranching(o) => Relative
           case IndexedX if o == JMP => AbsoluteIndexedX
-          case Indirect if o != JMP => ZeroPageIndirect
+          case Indirect if o != JMP => IndexedZ
           case _ => a
         }
         List(AssemblyLine(o, actualAddrMode, c, e))
diff --git a/src/main/scala/millfork/env/Thing.scala b/src/main/scala/millfork/env/Thing.scala
index f08e308f..70297fdc 100644
--- a/src/main/scala/millfork/env/Thing.scala
+++ b/src/main/scala/millfork/env/Thing.scala
@@ -1,7 +1,7 @@
 package millfork.env
 
+import millfork.{CompilationFlag, CompilationOptions}
 import millfork.assembly.Opcode
-import millfork.error.ErrorReporting
 import millfork.node._
 
 sealed trait Thing {
@@ -79,6 +79,12 @@ sealed trait TypedThing extends Thing {
 
 sealed trait ThingInMemory extends Thing {
   def toAddress: Constant
+
+  var farFlag: Option[Boolean] = None
+  var declaredBank: Option[Int] = None
+
+  def isFar(compilationOptions: CompilationOptions): Boolean
+  def bank(compilationOptions: CompilationOptions): Int
 }
 
 sealed trait PreallocableThing extends ThingInMemory {
@@ -91,6 +97,12 @@ sealed trait PreallocableThing extends ThingInMemory {
 
 case class Label(name: String) extends ThingInMemory {
   override def toAddress: MemoryAddressConstant = MemoryAddressConstant(this)
+
+  override def isFar(compilationOptions: CompilationOptions): Boolean =
+    compilationOptions.flag(CompilationFlag.LargeCode) && farFlag.getOrElse(true)
+
+  override def bank(compilationOptions: CompilationOptions): Int =
+    declaredBank.getOrElse(compilationOptions.platform.defaultCodeBank)
 }
 
 sealed trait Variable extends TypedThing with VariableLikeThing
@@ -100,8 +112,13 @@ case class BlackHole(typ: Type) extends Variable {
 }
 
 sealed trait VariableInMemory extends Variable with ThingInMemory with IndexableThing {
-
   def zeropage: Boolean
+
+  override def isFar(compilationOptions: CompilationOptions): Boolean =
+    !zeropage && farFlag.getOrElse(false)
+
+  override def bank(compilationOptions: CompilationOptions): Int =
+    declaredBank.getOrElse(0)
 }
 
 case class RegisterVariable(register: Register.Value, typ: Type) extends Variable {
@@ -156,14 +173,26 @@ case class UninitializedArray(name: String, sizeInBytes: Int) extends MfArray wi
   override def toAddress: MemoryAddressConstant = MemoryAddressConstant(this)
 
   override def alloc = VariableAllocationMethod.Static
+
+  override def isFar(compilationOptions: CompilationOptions): Boolean = farFlag.getOrElse(false)
+
+  override def bank(compilationOptions: CompilationOptions): Int = declaredBank.getOrElse(0)
 }
 
 case class RelativeArray(name: String, address: Constant, sizeInBytes: Int) extends MfArray {
   override def toAddress: Constant = address
+
+  override def isFar(compilationOptions: CompilationOptions): Boolean = farFlag.getOrElse(false)
+
+  override def bank(compilationOptions: CompilationOptions): Int = declaredBank.getOrElse(0)
 }
 
 case class InitializedArray(name: String, address: Option[Constant], contents: List[Constant]) extends MfArray with PreallocableThing {
   override def shouldGenerate = true
+
+  override def isFar(compilationOptions: CompilationOptions): Boolean = farFlag.getOrElse(false)
+
+  override def bank(compilationOptions: CompilationOptions): Int = declaredBank.getOrElse(0)
 }
 
 case class RelativeVariable(name: String, address: Constant, typ: Type, zeropage: Boolean) extends VariableInMemory {
@@ -198,6 +227,12 @@ case class MacroFunction(name: String,
 
 sealed trait FunctionInMemory extends MangledFunction with ThingInMemory {
   def environment: Environment
+
+  override def isFar(compilationOptions: CompilationOptions): Boolean =
+    compilationOptions.flag(CompilationFlag.LargeCode) && farFlag.getOrElse(true)
+
+  override def bank(compilationOptions: CompilationOptions): Int =
+    declaredBank.getOrElse(compilationOptions.platform.defaultCodeBank)
 }
 
 case class ExternFunction(name: String,
diff --git a/src/main/scala/millfork/node/Node.scala b/src/main/scala/millfork/node/Node.scala
index 7d26f4c7..ebaae039 100644
--- a/src/main/scala/millfork/node/Node.scala
+++ b/src/main/scala/millfork/node/Node.scala
@@ -61,7 +61,7 @@ case class HalfWordExpression(expression: Expression, hiByte: Boolean) extends E
 }
 
 object Register extends Enumeration {
-  val A, X, Y, AX, AY, YA, XA, XY, YX = Value
+  val A, X, Y, AX, AY, YA, XA, XY, YX, AW = Value
 }
 
 //case class Indexing(child: Expression, register: Register.Value) extends Expression
diff --git a/src/main/scala/millfork/output/Assembler.scala b/src/main/scala/millfork/output/Assembler.scala
index a52eca84..f222264c 100644
--- a/src/main/scala/millfork/output/Assembler.scala
+++ b/src/main/scala/millfork/output/Assembler.scala
@@ -1,6 +1,6 @@
 package millfork.output
 
-import millfork.assembly.opt.{AssemblyOptimization, JumpShortening}
+import millfork.assembly.opt.{AssemblyOptimization, HudsonOptimizations, JumpShortening}
 import millfork.assembly.{AddrMode, AssemblyLine, Opcode}
 import millfork.compiler.{CompilationContext, MfCompiler}
 import millfork.env._
@@ -116,6 +116,16 @@ class Assembler(private val program: Program, private val rootEnv: Environment)
     }
   }
 
+  def extractBank(c: Constant, options: CompilationOptions): Byte = {
+    c.quickSimplify match {
+      case NumericConstant(nn, _) => nn.>>(16).toInt.&(0xff).toByte
+      case MemoryAddressConstant(th) => th.bank(options).toByte
+      case CompoundConstant(MathOperator.Plus, a, b) => (extractBank(a, options) + extractBank(b, options)).toByte
+      case CompoundConstant(MathOperator.Minus, a, b) => (extractBank(a, options) - extractBank(b, options)).toByte
+      case _ => ErrorReporting.fatal("Failed to extract bank number from constant " + c)
+    }
+  }
+
   private def parseNormalToDecimalValue(a: Long): Long = {
     if (a < 0) -parseNormalToDecimalValue(-a)
     var x = a
@@ -314,7 +324,7 @@ class Assembler(private val program: Program, private val rootEnv: Environment)
     ErrorReporting.debug("Compiling: " + f.name, f.position)
     val unoptimized =
       MfCompiler.compile(CompilationContext(env = f.environment, function = f, extraStackOffset = 0, options = options)).flatMap {
-        case AssemblyLine(Opcode.JSR, _, p, true) if inlinedFunctions.contains(p.toString) =>
+        case AssemblyLine(Opcode.JSR, AddrMode.Absolute | AddrMode.LongAbsolute, p, true) if inlinedFunctions.contains(p.toString) =>
           val labelPrefix = MfCompiler.nextLabel("ai")
           inlinedFunctions(p.toString).map{
             case line@AssemblyLine(_, _, MemoryAddressConstant(Label(label)), _) =>
@@ -336,7 +346,10 @@ class Assembler(private val program: Program, private val rootEnv: Environment)
     val code = optimizations.foldLeft(unoptimized) { (c, opt) =>
       opt.optimize(f, c, options)
     }
-    if (optimizations.nonEmpty) JumpShortening(f, JumpShortening(f, code, options), options)
+    if (optimizations.nonEmpty) {
+      val finalCode = if (options.flag(CompilationFlag.EmitHudsonOpcodes)) HudsonOptimizations.removeLoadZero(code) else code
+      JumpShortening(f, JumpShortening(f, finalCode, options), options)
+    }
     else code
   }
 
@@ -361,14 +374,19 @@ class Assembler(private val program: Program, private val rootEnv: Environment)
           writeByte(0, index, Assembler.opcodeFor(op, Relative, options))
           writeByte(0, index + 1, param - (index + 2))
           index += 2
-        case AssemblyLine(op, am@(Immediate | ZeroPage | ZeroPageX | ZeroPageY | IndexedY | IndexedX | ZeroPageIndirect), param, _) =>
+        case AssemblyLine(op, am@(Immediate | ZeroPage | ZeroPageX | ZeroPageY | IndexedY | IndexedX | IndexedZ | LongIndexedY | LongIndexedZ | Stack), param, _) =>
           writeByte(0, index, Assembler.opcodeFor(op, am, options))
           writeByte(0, index + 1, param)
           index += 2
-        case AssemblyLine(op, am@(Absolute | AbsoluteY | AbsoluteX | Indirect | AbsoluteIndexedX), param, _) =>
+        case AssemblyLine(op, am@(WordImmediate | Absolute | AbsoluteY | AbsoluteX | Indirect | AbsoluteIndexedX), param, _) =>
           writeByte(0, index, Assembler.opcodeFor(op, am, options))
           writeWord(0, index + 1, param)
           index += 3
+        case AssemblyLine(op, am@(LongAbsolute | LongAbsoluteX | LongIndirect), param, _) =>
+          writeByte(0, index, Assembler.opcodeFor(op, am, options))
+          writeWord(0, index + 1, param)
+          writeByte(0, index + 3, extractBank(param, options))
+          index += 4
       }
     }
     index
@@ -379,44 +397,66 @@ object Assembler {
   val opcodes = mutable.Map[(Opcode.Value, AddrMode.Value), Byte]()
   val illegalOpcodes = mutable.Map[(Opcode.Value, AddrMode.Value), Byte]()
   val cmosOpcodes = mutable.Map[(Opcode.Value, AddrMode.Value), Byte]()
+  val cmosNopOpcodes = mutable.Map[(Opcode.Value, AddrMode.Value), Byte]()
+  val ce02Opcodes = mutable.Map[(Opcode.Value, AddrMode.Value), Byte]()
+  val hudsonOpcodes = mutable.Map[(Opcode.Value, AddrMode.Value), Byte]()
+  val emulation65816Opcodes = mutable.Map[(Opcode.Value, AddrMode.Value), Byte]()
+  val native65816Opcodes = mutable.Map[(Opcode.Value, AddrMode.Value), Byte]()
 
   def opcodeFor(opcode: Opcode.Value, addrMode: AddrMode.Value, options: CompilationOptions): Byte = {
     val key = opcode -> addrMode
-    opcodes.get(key) match {
-      case Some(v) => v
-      case None =>
-        illegalOpcodes.get(key) match {
-          case Some(v) =>
-            if (options.flag(CompilationFlag.EmitIllegals)) v
-            else ErrorReporting.fatal("Cannot assemble an illegal opcode " + key)
-          case None =>
-            cmosOpcodes.get(key) match {
-              case Some(v) =>
-                if (options.flag(CompilationFlag.EmitCmosOpcodes)) v
-                else ErrorReporting.fatal("Cannot assemble a CMOS opcode " + key)
-              case None =>
-                ErrorReporting.fatal("Cannot assemble an unknown opcode " + key)
-            }
-        }
-    }
+    opcodes.get(key).foreach(return _)
+    if (options.flag(CompilationFlag.EmitIllegals)) illegalOpcodes.get(key).foreach(return _)
+    if (options.flag(CompilationFlag.EmitCmosOpcodes)) cmosOpcodes.get(key).foreach(return _)
+    if (options.flag(CompilationFlag.EmitCmosNopOpcodes)) cmosNopOpcodes.get(key).foreach(return _)
+    if (options.flag(CompilationFlag.Emit65CE02Opcodes)) ce02Opcodes.get(key).foreach(return _)
+    if (options.flag(CompilationFlag.EmitHudsonOpcodes))  hudsonOpcodes.get(key).foreach(return _)
+    if (options.flag(CompilationFlag.EmitEmulation65816Opcodes)) emulation65816Opcodes.get(key).foreach(return _)
+    if (options.flag(CompilationFlag.EmitNative65816Opcodes)) native65816Opcodes.get(key).foreach(return _)
+    ErrorReporting.fatal("Cannot assemble an unknown opcode " + key)
   }
 
   private def op(op: Opcode.Value, am: AddrMode.Value, x: Int): Unit = {
-    if (x < 0 || x > 0xff) ???
+    if (x < 0 || x > 0xff) ErrorReporting.fatal("Invalid code for " + (op -> am))
     opcodes(op -> am) = x.toByte
     if (am == AddrMode.Relative) opcodes(op -> AddrMode.Immediate) = x.toByte
   }
 
   private def cm(op: Opcode.Value, am: AddrMode.Value, x: Int): Unit = {
-    if (x < 0 || x > 0xff) ???
+    if (x < 0 || x > 0xff) ErrorReporting.fatal("Invalid code for " + (op -> am))
     cmosOpcodes(op -> am) = x.toByte
   }
 
+  private def cn(op: Opcode.Value, am: AddrMode.Value, x: Int): Unit = {
+    if (x < 0 || x > 0xff) ErrorReporting.fatal("Invalid code for " + (op -> am))
+    cmosNopOpcodes(op -> am) = x.toByte
+  }
+
   private def il(op: Opcode.Value, am: AddrMode.Value, x: Int): Unit = {
-    if (x < 0 || x > 0xff) ???
+    if (x < 0 || x > 0xff) ErrorReporting.fatal("Invalid code for " + (op -> am))
     illegalOpcodes(op -> am) = x.toByte
   }
 
+  private def hu(op: Opcode.Value, am: AddrMode.Value, x: Int): Unit = {
+    if (x < 0 || x > 0xff) ErrorReporting.fatal("Invalid code for " + (op -> am))
+    hudsonOpcodes(op -> am) = x.toByte
+  }
+
+  private def ce(op: Opcode.Value, am: AddrMode.Value, x: Int): Unit = {
+    if (x < 0 || x > 0xff) ErrorReporting.fatal("Invalid code for " + (op -> am))
+    ce02Opcodes(op -> am) = x.toByte
+  }
+
+  private def em(op: Opcode.Value, am: AddrMode.Value, x: Int): Unit = {
+    if (x < 0 || x > 0xff) ErrorReporting.fatal("Invalid code for " + (op -> am))
+    emulation65816Opcodes(op -> am) = x.toByte
+  }
+
+  private def na(op: Opcode.Value, am: AddrMode.Value, x: Int): Unit = {
+    if (x < 0 || x > 0xff) ErrorReporting.fatal("Invalid code for " + (op -> am))
+    native65816Opcodes(op -> am) = x.toByte
+  }
+
   def getStandardLegalOpcodes: Set[Int] = opcodes.values.map(_ & 0xff).toSet
 
   import AddrMode._
@@ -617,6 +657,7 @@ object Assembler {
   il(AHX, AbsoluteY, 0x9F)
   il(SAX, IndexedX, 0x83)
   il(AHX, IndexedY, 0x93)
+  il(SHY, AbsoluteX, 0x9C)
 
   il(ANC, Immediate, 0x0B)
   il(ALR, Immediate, 0x4B)
@@ -679,10 +720,10 @@ object Assembler {
   il(NOP, Absolute, 0x5C)
   il(NOP, AbsoluteX, 0x1C)
 
-  cm(NOP, Immediate, 0x02)
-  cm(NOP, ZeroPage, 0x44)
-  cm(NOP, ZeroPageX, 0x54)
-  cm(NOP, Absolute, 0x5C)
+  cn(NOP, Immediate, 0x02)
+  cn(NOP, ZeroPage, 0x44)
+  cn(NOP, ZeroPageX, 0x54)
+  cn(NOP, Absolute, 0x5C)
 
   cm(STZ, ZeroPage, 0x64)
   cm(STZ, ZeroPageX, 0x74)
@@ -694,14 +735,14 @@ object Assembler {
   cm(PLX, Implied, 0xFA)
   cm(PLY, Implied, 0x7A)
 
-  cm(ORA, ZeroPageIndirect, 0x12)
-  cm(AND, ZeroPageIndirect, 0x32)
-  cm(EOR, ZeroPageIndirect, 0x52)
-  cm(ADC, ZeroPageIndirect, 0x72)
-  cm(STA, ZeroPageIndirect, 0x92)
-  cm(LDA, ZeroPageIndirect, 0xB2)
-  cm(CMP, ZeroPageIndirect, 0xD2)
-  cm(SBC, ZeroPageIndirect, 0xF2)
+  cm(ORA, IndexedZ, 0x12)
+  cm(AND, IndexedZ, 0x32)
+  cm(EOR, IndexedZ, 0x52)
+  cm(ADC, IndexedZ, 0x72)
+  cm(STA, IndexedZ, 0x92)
+  cm(LDA, IndexedZ, 0xB2)
+  cm(CMP, IndexedZ, 0xD2)
+  cm(SBC, IndexedZ, 0xF2)
 
   cm(TSB, ZeroPage, 0x04)
   cm(TSB, Absolute, 0x0C)
@@ -717,4 +758,119 @@ object Assembler {
   cm(WAI, Implied, 0xCB)
   cm(STP, Implied, 0xDB)
 
+  ce(CPZ, Immediate, 0xC2)
+  ce(CPZ, ZeroPage, 0xD4)
+  ce(CPZ, Absolute, 0xDC)
+  ce(DEZ, Implied, 0x3B)
+  ce(INZ, Implied,0x1B )
+  ce(DEC_W, ZeroPage, 0xC3)
+  ce(INC_W, ZeroPage, 0xE3)
+  ce(ASL_W, Absolute, 0xCB)
+  // TODO: or is it ROL_W?
+  ce(ROR_W, Absolute, 0xEB)
+  ce(ASR, Implied, 0x43)
+  ce(ASR, ZeroPage, 0x44)
+  ce(ASR, ZeroPageX, 0x54)
+  ce(LDZ, Immediate, 0xA3)
+  ce(LDZ, Absolute, 0xAB)
+  ce(LDZ, AbsoluteX, 0xBB)
+  ce(TAB, Implied, 0x5B)
+  ce(TBA, Implied, 0x7B)
+  ce(TAZ, Implied, 0x4B)
+  ce(TZA, Implied, 0x6B)
+  ce(TSY, Implied, 0x0B)
+  ce(TYS, Implied, 0x2B)
+  ce(PHW, WordImmediate, 0xF4)
+  ce(PHW, Absolute, 0xFC)
+  ce(PHZ, Implied, 0xDB)
+  ce(PLZ, Implied, 0xFB)
+//  ce(CLE, Implied, )
+//  ce(SEE, Implied, )
+//  ce(BSR, , )
+
+  hu(CLY, Implied, 0xC2)
+  hu(CLX, Implied, 0x82)
+  hu(CLA, Implied, 0x62)
+  hu(CSH, Implied, 0xD4)
+  hu(CSL, Implied, 0x54)
+  hu(HuSAX, Implied, 0x22)
+  hu(SAY, Implied, 0x42)
+  hu(SXY, Implied, 0x02)
+  hu(TAM, Immediate, 0x53)
+  hu(TMA, Immediate, 0x43)
+
+  em(ORA, Stack, 0x03)
+  em(ORA, IndexedSY, 0x13)
+  na(ORA, LongIndexedZ, 0x07)
+  na(ORA, LongIndexedY, 0x17)
+  na(ORA, LongAbsolute, 0x0F)
+  na(ORA, LongAbsoluteX, 0x1F)
+  em(AND, Stack, 0x23)
+  em(AND, IndexedSY, 0x33)
+  na(AND, LongIndexedZ, 0x27)
+  na(AND, LongIndexedY, 0x37)
+  na(AND, LongAbsolute, 0x2F)
+  na(AND, LongAbsoluteX, 0x3F)
+  em(EOR, Stack, 0x43)
+  em(EOR, IndexedSY, 0x53)
+  na(EOR, LongIndexedZ, 0x47)
+  na(EOR, LongIndexedY, 0x57)
+  na(EOR, LongAbsolute, 0x4F)
+  na(EOR, LongAbsoluteX, 0x5F)
+  em(ADC, Stack, 0x63)
+  em(ADC, IndexedSY, 0x73)
+  na(ADC, LongIndexedZ, 0x67)
+  na(ADC, LongIndexedY, 0x77)
+  na(ADC, LongAbsolute, 0x6F)
+  na(ADC, LongAbsoluteX, 0x7F)
+  em(STA, Stack, 0x83)
+  em(STA, IndexedSY, 0x93)
+  na(STA, LongIndexedZ, 0x87)
+  na(STA, LongIndexedY, 0x97)
+  na(STA, LongAbsolute, 0x8F)
+  na(STA, LongAbsoluteX, 0x9F)
+  em(LDA, Stack, 0xA3)
+  em(LDA, IndexedSY, 0xB3)
+  na(LDA, LongIndexedZ, 0xA7)
+  na(LDA, LongIndexedY, 0xB7)
+  na(LDA, LongAbsolute, 0xAF)
+  na(LDA, LongAbsoluteX, 0xBF)
+  em(CMP, Stack, 0xC3) // CMP sr,S (0xA3 is LDA sr,S)
+  em(CMP, IndexedSY, 0xD3) // CMP (sr,S),Y
+  na(CMP, LongIndexedZ, 0xC7) // CMP [dp]
+  na(CMP, LongIndexedY, 0xD7) // CMP [dp],Y
+  na(CMP, LongAbsolute, 0xCF) // CMP long
+  na(CMP, LongAbsoluteX, 0xDF) // CMP long,X -- NOTE(review): this table has no SBC rows (0xE3/0xF3/0xE7/0xF7/0xEF/0xFF); confirm whether that is intended
+
+  em(COP, Immediate, 0x02)
+  em(XBA, Implied, 0xEB)
+  em(TXY, Implied, 0x9B)
+  em(TYX, Implied, 0xBB)
+
+
+  na(RTL, Implied, 0x6B)
+  na(JMP, LongAbsolute, 0x5C)
+  na(JMP, LongIndirect, 0xDC) // JML [addr]; 0x7C is JMP (addr,X). NOTE(review): the operand is 16-bit, yet the assembler's LongIndirect case also writes a bank byte -- confirm
+  na(BRL, LongRelative, 0x82)
+
+  em(PHD, Implied, 0x0B)
+  em(PLD, Implied, 0x2B)
+  em(PHB, Implied, 0x8B)
+  em(PLB, Implied, 0xAB)
+  em(PHK, Implied, 0x4B)
+
+  na(REP, Immediate, 0xC2)
+  na(SEP, Immediate, 0xE2)
+
+  na(XCE, Implied, 0xFB)
+  na(TCD, Implied, 0x5B)
+  na(TDC, Implied, 0x7B)
+  na(TSC, Implied, 0x3B)
+  na(TCS, Implied, 0x1B)
+
+  for {
+    ((narrow, am), code) <- emulation65816Opcodes ++ opcodes ++ cmosOpcodes ++ native65816Opcodes
+    wide <- Opcode.widen(narrow)
+  } na(wide, if (am == Immediate) WordImmediate else am, code & 0xff)
+
 }
diff --git a/src/main/scala/millfork/output/InliningCalculator.scala b/src/main/scala/millfork/output/InliningCalculator.scala
index b9f9537d..4b37f75c 100644
--- a/src/main/scala/millfork/output/InliningCalculator.scala
+++ b/src/main/scala/millfork/output/InliningCalculator.scala
@@ -76,12 +76,13 @@ object InliningCalculator {
     case _ => Nil
   }
 
-  private val badOpcodes = Set(RTI, RTS, JSR, BRK) ++ OpcodeClasses.ChangesStack
+  private val badOpcodes = Set(RTI, RTS, JSR, BRK, RTL, BSR) ++ OpcodeClasses.ChangesStack
   private val jumpingRelatedOpcodes = Set(LABEL, JMP) ++ OpcodeClasses.ShortBranching
 
   def codeForInlining(fname: String, code: List[AssemblyLine]): Option[List[AssemblyLine]] = {
     if (code.isEmpty) return None
-    if (code.last.opcode != RTS) return None
+    val lastOpcode = code.last.opcode
+    if (lastOpcode != RTS && lastOpcode != RTL) return None
     var result = code.init
     while (result.nonEmpty && OpcodeClasses.NoopDiscardsFlags(result.last.opcode)) {
       result = result.init
diff --git a/src/main/scala/millfork/parser/MfParser.scala b/src/main/scala/millfork/parser/MfParser.scala
index cf491c69..719fbefc 100644
--- a/src/main/scala/millfork/parser/MfParser.scala
+++ b/src/main/scala/millfork/parser/MfParser.scala
@@ -88,7 +88,7 @@ case class MfParser(filename: String, input: String, currentDirectory: String, o
     for {
       p <- position()
       minus <- "-".!.?
-      _ <- P("0x" | "$") ~/ Pass
+      _ <- P("0x" | "0X" | "$") ~/ Pass
       s <- CharsWhileIn("1234567890abcdefABCDEF", min = 1).!.opaque("<hex digits>")
     } yield {
       val abs = Integer.parseInt(s, 16)
@@ -96,7 +96,31 @@ case class MfParser(filename: String, input: String, currentDirectory: String, o
       LiteralExpression(value, size(value, s.length > 2, s.length > 4)).pos(p)
     }
 
-  val literalAtom: P[LiteralExpression] = binaryAtom | hexAtom | decimalAtom
+  val octalAtom: P[LiteralExpression] =
+    for {
+      p <- position()
+      minus <- "-".!.?
+      _ <- P("0o" | "0O") ~/ Pass
+      s <- CharsWhileIn("01234567", min = 1).!.opaque("<octal digits>")
+    } yield {
+      val abs = Integer.parseInt(s, 8)
+      val value = sign(abs, minus.isDefined)
+      LiteralExpression(value, size(value, s.length > 3, s.length > 6)).pos(p)
+    }
+
+  val quaternaryAtom: P[LiteralExpression] =
+    for {
+      p <- position()
+      minus <- "-".!.?
+      _ <- P("0q" | "0Q") ~/ Pass
+      s <- CharsWhileIn("0123", min = 1).!.opaque("<quaternary digits>")
+    } yield {
+      val abs = Integer.parseInt(s, 4)
+      val value = sign(abs, minus.isDefined)
+      LiteralExpression(value, size(value, s.length > 4, s.length > 8)).pos(p)
+    }
+
+  val literalAtom: P[LiteralExpression] = binaryAtom | hexAtom | octalAtom | quaternaryAtom | decimalAtom
 
   val atom: P[Expression] = P(literalAtom | (position() ~ identifier).map { case (p, i) => VariableExpression(i).pos(p) })
 
@@ -307,13 +331,19 @@ case class MfParser(filename: String, input: String, currentDirectory: String, o
 
   val commaX = HWS ~ "," ~ HWS ~ ("X" | "x") ~ HWS
   val commaY = HWS ~ "," ~ HWS ~ ("Y" | "y") ~ HWS
+  val commaZ = HWS ~ "," ~ HWS ~ ("Z" | "z") ~ HWS
+  val commaS = HWS ~ "," ~ HWS ~ ("S" | "s") ~ HWS
 
   def asmParameter: P[(AddrMode.Value, Expression)] = {
     (SWS ~ (
+      ("##" ~ asmExpression).map(AddrMode.WordImmediate -> _) |
       ("#" ~ asmExpression).map(AddrMode.Immediate -> _) |
         ("(" ~ HWS ~ asmExpression ~ HWS ~ ")" ~ commaY).map(AddrMode.IndexedY -> _) |
+        ("(" ~ HWS ~ asmExpression ~ commaS ~ ")" ~ commaY).map(AddrMode.IndexedSY -> _) |
+        ("(" ~ HWS ~ asmExpression ~ HWS ~ ")" ~ commaZ).map(AddrMode.IndexedZ -> _) |
         ("(" ~ HWS ~ asmExpression ~ commaX ~ ")").map(AddrMode.IndexedX -> _) |
         ("(" ~ HWS ~ asmExpression ~ HWS ~ ")").map(AddrMode.Indirect -> _) |
+        (asmExpression ~ commaS).map(AddrMode.Stack -> _) |
         (asmExpression ~ commaX).map(AddrMode.AbsoluteX -> _) |
         (asmExpression ~ commaY).map(AddrMode.AbsoluteY -> _) |
         asmExpression.map(AddrMode.Absolute -> _)
@@ -325,7 +355,13 @@ case class MfParser(filename: String, input: String, currentDirectory: String, o
   def asmInstruction: P[ExecutableStatement] = {
     val lineParser: P[(Boolean, Opcode.Value, (AddrMode.Value, Expression))] = !"}" ~ elidable ~/ asmOpcode ~/ asmParameter
     lineParser.map { case (elid, op, param) =>
-      AssemblyStatement(op, param._1, param._2, elid)
+      (op, param._1) match {
+        case (Opcode.SAX, AddrMode.Implied) => AssemblyStatement(Opcode.HuSAX, param._1, param._2, elid)
+        case (Opcode.SBX, AddrMode.Immediate) => AssemblyStatement(Opcode.SBX, param._1, param._2, elid)
+        case (Opcode.SAY, AddrMode.AbsoluteX) => AssemblyStatement(Opcode.SHY, param._1, param._2, elid)
+        case (Opcode.SBX, _) => AssemblyStatement(Opcode.SAX, param._1, param._2, elid)
+        case _ => AssemblyStatement(op, param._1, param._2, elid)
+      }
     }
   }
 
diff --git a/src/test/scala/millfork/test/StackVarSuite.scala b/src/test/scala/millfork/test/StackVarSuite.scala
index f9e65503..0ad2569c 100644
--- a/src/test/scala/millfork/test/StackVarSuite.scala
+++ b/src/test/scala/millfork/test/StackVarSuite.scala
@@ -1,6 +1,6 @@
 package millfork.test
 
-import millfork.test.emu.{EmuBenchmarkRun, EmuUnoptimizedRun}
+import millfork.test.emu.{EmuBenchmarkRun, EmuCmosBenchmarkRun}
 import org.scalatest.{FunSuite, Matchers}
 
 /**
@@ -9,7 +9,7 @@ import org.scalatest.{FunSuite, Matchers}
 class StackVarSuite extends FunSuite with Matchers {
 
   test("Basic stack assignment") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | byte output @$c000
         | void main () {
         |   stack byte a
@@ -23,7 +23,7 @@ class StackVarSuite extends FunSuite with Matchers {
   }
 
   test("Stack byte addition") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | byte output @$c000
         | void main () {
         |   stack byte a
@@ -41,7 +41,7 @@ class StackVarSuite extends FunSuite with Matchers {
   }
 
   test("Complex expressions involving stack variables") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | byte output @$c000
         | void main () {
         |   stack byte a
@@ -74,7 +74,7 @@ class StackVarSuite extends FunSuite with Matchers {
 //  }
 
   test("Stack word addition") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | word output @$c000
         | void main () {
         |   stack word a
@@ -92,7 +92,7 @@ class StackVarSuite extends FunSuite with Matchers {
   }
 
   test("Recursion") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | array output [6] @$c000
         | byte fails @$c010
         | void main () {
@@ -129,7 +129,7 @@ class StackVarSuite extends FunSuite with Matchers {
 
 
   test("Indexing") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | array output [200] @$c000
         | void main () {
         |   stack byte a
diff --git a/src/test/scala/millfork/test/WordMathSuite.scala b/src/test/scala/millfork/test/WordMathSuite.scala
index b6e0ca6a..f8e774d2 100644
--- a/src/test/scala/millfork/test/WordMathSuite.scala
+++ b/src/test/scala/millfork/test/WordMathSuite.scala
@@ -1,5 +1,5 @@
 package millfork.test
-import millfork.test.emu.EmuBenchmarkRun
+import millfork.test.emu.{EmuBenchmarkRun, EmuCmosBenchmarkRun}
 import org.scalatest.{FunSuite, Matchers}
 
 /**
@@ -8,7 +8,7 @@ import org.scalatest.{FunSuite, Matchers}
 class WordMathSuite extends FunSuite with Matchers {
 
   test("Word addition") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | word output @$c000
         | word a
         | void main () {
@@ -20,7 +20,7 @@ class WordMathSuite extends FunSuite with Matchers {
   }
 
   test("Word subtraction") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | word output @$c000
         | word a
         | void main () {
@@ -32,7 +32,7 @@ class WordMathSuite extends FunSuite with Matchers {
   }
 
   test("Word subtraction 2") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | word output @$c000
         | word a
         | void main () {
@@ -44,7 +44,7 @@ class WordMathSuite extends FunSuite with Matchers {
   }
 
   test("Byte-to-word addition") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | word output @$c000
         | word pair
         | void main () {
@@ -57,7 +57,7 @@ class WordMathSuite extends FunSuite with Matchers {
   }
 
   test("Literal addition") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | word output @$c000
         | void main () {
         |  output = 640
@@ -67,7 +67,7 @@ class WordMathSuite extends FunSuite with Matchers {
   }
 
   test("Array element addition") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | word output @$c000
         | word pair
         | array b[2]
@@ -84,7 +84,7 @@ class WordMathSuite extends FunSuite with Matchers {
   }
 
   test("nesdev.com example") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | byte output @$c000
         | array map [256] @$c300
         | array b[2]
@@ -102,7 +102,7 @@ class WordMathSuite extends FunSuite with Matchers {
   }
 
   test("hi()/lo()") {
-    EmuBenchmarkRun("""
+    EmuCmosBenchmarkRun("""
         | array output [7] @$c000
         | void main () {
         |   output[0] = lo(33)
diff --git a/src/test/scala/millfork/test/emu/EmuCmosBenchmarkRun.scala b/src/test/scala/millfork/test/emu/EmuCmosBenchmarkRun.scala
index 77f14579..e59a9631 100644
--- a/src/test/scala/millfork/test/emu/EmuCmosBenchmarkRun.scala
+++ b/src/test/scala/millfork/test/emu/EmuCmosBenchmarkRun.scala
@@ -7,9 +7,18 @@ import millfork.output.MemoryBank
   */
 object EmuCmosBenchmarkRun {
   def apply(source:String)(verifier: MemoryBank=>Unit) = {
+    println(f"Compiling for NMOS (unoptimized)")
     val (Timings(_, t0), m0) = EmuUnoptimizedRun.apply2(source)
+    println(f"Compiling for NMOS")
     val (Timings(_, t1), m1) = EmuOptimizedRun.apply2(source)
+    println(f"Compiling for CMOS")
     val (Timings(_, t2), m2) = EmuOptimizedCmosRun.apply2(source)
+    println(f"Compiling for HuC6280")
+    EmuOptimizedHudsonRun.apply2(source)
+    println(f"Compiling for 65CE02")
+    EmuOptimized65CE02Run.apply2(source)
+    println(f"Compiling for 65816")
+    EmuOptimized65816Run.apply2(source)
     println(f"Before optimization:      $t0%7d")
     println(f"After NMOS optimization:  $t1%7d")
     println(f"After CMOS optimization:  $t2%7d")
diff --git a/src/test/scala/millfork/test/emu/EmuOptimized65816Run.scala b/src/test/scala/millfork/test/emu/EmuOptimized65816Run.scala
new file mode 100644
index 00000000..134eb254
--- /dev/null
+++ b/src/test/scala/millfork/test/emu/EmuOptimized65816Run.scala
@@ -0,0 +1,21 @@
+package millfork.test.emu
+
+import millfork.assembly.opt.{CmosOptimizations, SixteenOptimizations}
+import millfork.{Cpu, OptimizationPresets}
+
+/** [[EmuRun]] preset for `Cpu.Sixteen`: node optimizations `NodeOpt`; assembly optimizations `AssOpt` followed by five rounds of CMOS + 65816 + `Good` passes.
+  * NOTE(review): EmuRun's fallback branch traces "No emulation support" for CPUs it cannot emulate; presumably this preset only checks compilation - confirm.
+  * @author Karol Stasiak */
+object EmuOptimized65816Run extends EmuRun(
+  Cpu.Sixteen,
+  OptimizationPresets.NodeOpt,
+  OptimizationPresets.AssOpt ++
+    CmosOptimizations.All ++ SixteenOptimizations.All ++ OptimizationPresets.Good ++
+    CmosOptimizations.All ++ SixteenOptimizations.All ++ OptimizationPresets.Good ++
+    CmosOptimizations.All ++ SixteenOptimizations.All ++ OptimizationPresets.Good ++
+    CmosOptimizations.All ++ SixteenOptimizations.All ++ OptimizationPresets.Good ++
+    CmosOptimizations.All ++ SixteenOptimizations.All ++ OptimizationPresets.Good,
+  false)
+
+
+
diff --git a/src/test/scala/millfork/test/emu/EmuOptimized65CE02Run.scala b/src/test/scala/millfork/test/emu/EmuOptimized65CE02Run.scala
new file mode 100644
index 00000000..442fcb58
--- /dev/null
+++ b/src/test/scala/millfork/test/emu/EmuOptimized65CE02Run.scala
@@ -0,0 +1,19 @@
+package millfork.test.emu
+
+import millfork.assembly.opt.{CE02Optimizations, CmosOptimizations}
+import millfork.{Cpu, OptimizationPresets}
+
+/** [[EmuRun]] preset for `Cpu.CE02`: node optimizations `NodeOpt`; assembly optimizations `AssOpt` followed by three rounds of CMOS + 65CE02 + `Good` passes.
+  * NOTE(review): EmuRun's fallback branch traces "No emulation support" for CPUs it cannot emulate; presumably this preset only checks compilation - confirm.
+  * @author Karol Stasiak */
+object EmuOptimized65CE02Run extends EmuRun(
+  Cpu.CE02,
+  OptimizationPresets.NodeOpt,
+  OptimizationPresets.AssOpt ++
+    CmosOptimizations.All ++ CE02Optimizations.All ++ OptimizationPresets.Good ++
+    CmosOptimizations.All ++ CE02Optimizations.All ++ OptimizationPresets.Good ++
+    CmosOptimizations.All ++ CE02Optimizations.All ++ OptimizationPresets.Good,
+  false)
+
+
+
diff --git a/src/test/scala/millfork/test/emu/EmuOptimizedHudsonRun.scala b/src/test/scala/millfork/test/emu/EmuOptimizedHudsonRun.scala
new file mode 100644
index 00000000..76f403a5
--- /dev/null
+++ b/src/test/scala/millfork/test/emu/EmuOptimizedHudsonRun.scala
@@ -0,0 +1,19 @@
+package millfork.test.emu
+
+import millfork.assembly.opt.{CmosOptimizations, HudsonOptimizations}
+import millfork.{Cpu, OptimizationPresets}
+
+/** [[EmuRun]] preset for `Cpu.HuC6280`: node optimizations `NodeOpt`; assembly optimizations `AssOpt` followed by three rounds of CMOS + Hudson + `Good` passes.
+  * NOTE(review): EmuRun's fallback branch traces "No emulation support" for CPUs it cannot emulate; presumably this preset only checks compilation - confirm.
+  * @author Karol Stasiak */
+object EmuOptimizedHudsonRun extends EmuRun(
+  Cpu.HuC6280,
+  OptimizationPresets.NodeOpt,
+  OptimizationPresets.AssOpt ++
+    CmosOptimizations.All ++ HudsonOptimizations.All ++ OptimizationPresets.Good ++
+    CmosOptimizations.All ++ HudsonOptimizations.All ++ OptimizationPresets.Good ++
+    CmosOptimizations.All ++ HudsonOptimizations.All ++ OptimizationPresets.Good,
+  false)
+
+
+
diff --git a/src/test/scala/millfork/test/emu/EmuRun.scala b/src/test/scala/millfork/test/emu/EmuRun.scala
index e72bda48..3daaa33b 100644
--- a/src/test/scala/millfork/test/emu/EmuRun.scala
+++ b/src/test/scala/millfork/test/emu/EmuRun.scala
@@ -97,7 +97,10 @@ class EmuRun(cpu: millfork.Cpu.Value, nodeOptimizations: List[NodeOptimization],
       CompilationFlag.DetailedFlowAnalysis -> quantum,
       CompilationFlag.InlineFunctions -> this.inline,
       CompilationFlag.CompactReturnDispatchParams -> true,
-      CompilationFlag.EmitCmosOpcodes -> (platform.cpu == millfork.Cpu.Cmos),
+      CompilationFlag.EmitCmosOpcodes -> millfork.Cpu.CmosCompatible.contains(platform.cpu),
+      CompilationFlag.EmitEmulation65816Opcodes -> (platform.cpu == millfork.Cpu.Sixteen),
+      CompilationFlag.Emit65CE02Opcodes -> (platform.cpu == millfork.Cpu.CE02),
+      CompilationFlag.EmitHudsonOpcodes -> (platform.cpu == millfork.Cpu.HuC6280),
       CompilationFlag.OptimizeForSonicSpeed -> blastProcessing
       //      CompilationFlag.CheckIndexOutOfBounds -> true,
     ))
@@ -167,8 +170,11 @@ class EmuRun(cpu: millfork.Cpu.Value, nodeOptimizations: List[NodeOptimization],
           case millfork.Cpu.Mos =>
             ErrorReporting.fatal("There's no NMOS emulator with decimal mode support")
             Timings(-1, -1) -> memoryBank
-          case _ =>
+          case millfork.Cpu.StrictMos | millfork.Cpu.StrictRicoh =>
             runViaSymon(memoryBank, platform.org, CpuBehavior.NMOS_6502)
+          case _ =>
+            ErrorReporting.trace("No emulation support for " + platform.cpu)
+            Timings(-1, -1) -> memoryBank
         }
       case f: Failure[_, _] =>
         println(f)
diff --git a/src/test/scala/millfork/test/emu/SymonTestRam.scala b/src/test/scala/millfork/test/emu/SymonTestRam.scala
index 0ae21e28..2e9e6c5a 100644
--- a/src/test/scala/millfork/test/emu/SymonTestRam.scala
+++ b/src/test/scala/millfork/test/emu/SymonTestRam.scala
@@ -10,8 +10,9 @@ class SymonTestRam(mem: MemoryBank) extends Device(0x0000, 0xffff, "RAM") {
 
   mem.readable(1) = true
   mem.readable(2) = true
+  mem.readable(0x23) = true
 
-  (0x100 to 0x1ff).foreach { stack =>
+  (0 to 0x1ff).foreach { stack =>
     mem.writeable(stack) = true
     mem.readable(stack) = true
   }