commit 5d5ddba7f10596fc574fb0639012d720b9d66c62
Author: Steven Flintham <sgf@lemma.co.uk>
Date:   Wed Jun 25 18:47:24 2014 +0100

    First public release

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1be6b44
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,42 @@
+*~
+*.o
+*.lo
+.deps
+.libs
+Makefile
+Makefile.in
+aclocal.m4
+autom4te.cache
+config.guess
+config.h
+config.log
+config.status
+config.sub
+configure
+depcomp
+examples/.dirstamp
+examples/lib1
+install-sh
+lib6502-jit*
+lib6502-jit*
+libtool
+ltmain.sh
+m4/libtool.m4
+m4/ltoptions.m4
+m4/ltsugar.m4
+m4/ltversion.m4
+m4/lt~obsolete.m4
+missing
+run6502
+stamp-h1
+test/.dirstamp
+test/*.mc
+test/basic-callback
+test/call-illegal-callback-modify-code
+test/irq-nmi
+test/setjmp-trick
+test/stack-code-brk
+test/stack-code-jsr
+test/write-callback-modify-code
+test/z-self-modify-1.mc
+test/z-self-modify-1.out
diff --git a/AddressRange.cpp b/AddressRange.cpp
new file mode 100644
index 0000000..1ec95cb
--- /dev/null
+++ b/AddressRange.cpp
@@ -0,0 +1,42 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "AddressRange.h"
+
+#include <assert.h>
+
+#include "const.h"
+
+AddressRange::AddressRange(uint16_t addr)
+: range_begin_(addr), range_end_(static_cast<uint32_t>(addr) + 1)
+{   // The end is computed in 32 bits, so addr == 0xffff yields 0x10000, not 0.
+}
+
+AddressRange::AddressRange(uint32_t range_begin, uint32_t range_end)
+: range_begin_(range_begin), range_end_(range_end)
+{
+    assert(range_begin_ < memory_size); // begin lies below memory_size
+    assert(range_end_ <= (memory_size + 0xff)); // end may overshoot by <= 0xff
+    assert(range_begin_ < range_end_); // the range is never empty
+}
+
+bool AddressRange::all_memory() const
+{
+    // Only the canonical [0, memory_size) form is detected. A wrapped range
+    // like [0x1, 0x10002) also covers everything, but missing it is harmless.
+    const bool starts_at_zero = (range_begin_ == 0);
+    return starts_at_zero && (range_end_ == memory_size);
+}
diff --git a/AddressRange.h b/AddressRange.h
new file mode 100644
index 0000000..f03744a
--- /dev/null
+++ b/AddressRange.h
@@ -0,0 +1,101 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+// An AddressRange represents a contiguous range of addresses in the emulated
+// memory, expressed as a half-open interval ("begin" is included, "end" is
+// excluded). To allow convenient handling of cases where addresses wrap around
+// at the top of memory, end may be as large as 0x100ff; this allows the
+// effective address range of an instruction like LDA &ffff,Y to be represented.
+// (The "largest" address accessed is &00fe, and since the interval is half-open
+// end needs to allow a value one larger.)
+
+#ifndef ADDRESSRANGE_H
+#define ADDRESSRANGE_H
+
+#include <stdint.h>
+
+class AddressRange
+{
+public:
+    // Convenience constructor; equivalent to AddressRange(addr, addr + 1) but
+    // with no need to worry about whether addr + 1 will wrap to 0.
+    AddressRange(uint16_t addr);
+
+    AddressRange(uint32_t range_begin, uint32_t range_end); // [begin, end)
+
+    uint32_t range_begin() const // first address covered (inclusive)
+    {
+        return range_begin_;
+    }
+
+    uint32_t range_end() const // one past the last address; may be > 0xffff
+    {
+        return range_end_;
+    }
+
+    // Return true iff AddressRange covers the whole of memory.
+    bool all_memory() const;
+
+    class const_iterator // forward-only; just enough for a simple for loop
+    {
+    friend class AddressRange;
+
+    public:
+        uint16_t operator*() const
+        {
+            // Truncating down to 16 bits gives exactly the behaviour we
+            // require if this is a range which uses values >= 0x10000 to
+            // indicate wrapping around to the start of memory.
+            return static_cast<uint16_t>(v_);
+        }
+
+        const_iterator &operator++()
+        {
+            ++v_;
+            return *this;
+        }
+
+        bool operator!=(const const_iterator &rhs) const
+        {
+            return v_ != rhs.v_; // compared as 32-bit, i.e. before truncation
+        }
+
+    private:
+        const_iterator(uint32_t v) // only AddressRange creates iterators
+        : v_(v)
+        {
+        }
+
+        uint32_t v_;
+    };
+
+    const_iterator begin() const
+    {
+        return const_iterator(range_begin_);
+    }
+
+    const_iterator end() const
+    {
+        return const_iterator(range_end_);
+    }
+
+private:
+    uint32_t range_begin_;
+    uint32_t range_end_;
+};
+
+#endif
diff --git a/AddressSet.cpp b/AddressSet.cpp
new file mode 100644
index 0000000..971ba5a
--- /dev/null
+++ b/AddressSet.cpp
@@ -0,0 +1,95 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "AddressSet.h"
+
+#include <assert.h>
+#include <sstream>
+#include <stddef.h>
+
+#include "AddressRange.h"
+#include "util.h"
+
+void AddressSet::insert(uint16_t address)
+{
+    set_.insert(address); // a std::set, so repeated addresses are ignored
+}
+
+void AddressSet::insert(const AddressRange &range)
+{
+    const AddressRange::const_iterator last = range.end();
+    for (AddressRange::const_iterator pos = range.begin(); pos != last; ++pos)
+    {
+        set_.insert(*pos); // *pos is 16-bit, so wrapped ranges hit low memory
+    }
+}
+
+namespace
+{
+    // Render the half-open range [range_start, range_end) as hex text: a
+    // single address gives "0xNNNN", anything longer "0xNNNN-0xMMMM".
+    std::string dump_range(uint32_t range_start, uint32_t range_end)
+    {
+        const uint32_t range_last = range_end - 1;
+
+        std::stringstream text;
+        text << std::hex << std::setfill('0');
+        text << "0x" << std::setw(4) << range_start;
+
+        if (range_last != range_start)
+        {
+            // The upper bound is printed inclusively; that reads more
+            // naturally in a dump than the half-open form used in the code.
+            text << "-" << "0x" << std::setw(4) << range_last;
+        }
+        return text.str();
+    }
+}
+
+// Render the set as text for logging. Each run of consecutive addresses goes
+// on its own line, prefixed with spaces(indent) and formatted by dump_range();
+// an empty set gives an empty string.
+std::string AddressSet::dump(int indent) const
+{
+    std::stringstream s;
+
+    // std::set iterates in ascending order, so the members of a run are
+    // always adjacent in the iteration.
+    AddressSet::const_iterator it = set_.begin();
+    while (it != set_.end())
+    {
+        // Open a run at the current element...
+        const uint32_t run_first = *it;
+        uint32_t run_last = run_first;
+        ++it;
+
+        // ...and grow it while the next element follows on directly. The
+        // comparison is done in 32 bits, so 0xffff + 1 cannot wrap to 0.
+        while ((it != set_.end()) && (*it == (run_last + 1)))
+        {
+            run_last = *it;
+            ++it;
+        }
+
+        // The run is complete. dump_range() expects a half-open interval,
+        // hence the "+ 1" on the last address.
+        s << spaces(indent) << 
+             dump_range(run_first, run_last + 1) << "\n";
+    }
+
+    return s.str();
+}
diff --git a/AddressSet.h b/AddressSet.h
new file mode 100644
index 0000000..d9d8ef4
--- /dev/null
+++ b/AddressSet.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef ADDRESSSET_H
+#define ADDRESSSET_H
+
+#include <set>
+#include <stdint.h>
+#include <string>
+
+class AddressRange;
+
+// A set of 16-bit emulated-memory addresses, with support for inserting whole
+// AddressRanges and for dumping the contents in a readable form.
+class AddressSet
+{
+private:
+    // std::set may not be the ideal representation, but it is simple and
+    // clean; keep it unless profiling shows it to be a problem.
+    typedef std::set<uint16_t> Container;
+
+public:
+    typedef Container::const_iterator const_iterator;
+
+    AddressSet()
+    {
+    }
+
+    // Add a single address; one that is already present is left as it is.
+    void insert(uint16_t address);
+
+    // Add every address visited by iterating over 'range'.
+    void insert(const AddressRange &range);
+
+    // Read-only iteration over the stored addresses; the underlying std::set
+    // yields them in ascending order.
+    const_iterator begin() const { return set_.begin(); }
+
+    const_iterator end() const { return set_.end(); }
+
+    // Number of distinct addresses stored.
+    Container::size_type size() const { return set_.size(); }
+
+    // Text listing of the contents: one line per run of consecutive
+    // addresses, each line indented using 'indent'.
+    std::string dump(int indent) const;
+
+private:
+    Container set_;
+};
+
+#endif
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..2cf8818
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,60 @@
+TL;DR: If you're redistributing this you should read through the text below and
+examine the headers on the individual files, but basically the C/C++ source
+code (with the exception of valgrind.h, which can be removed if necessary) was
+all written by Ian Piumarta or Steven Flintham and is licensed under the "MIT
+(X11 flavour)" licence at the bottom of this file, just as lib6502 itself is.
+The autotools infrastructure support is GPL licensed but has exceptions for use
+(as is the case here) in autoconfigured packages.
+
+
+
+valgrind.h has its own license; see the comments at the top of that file.
+
+build-aux/tap-driver.sh (used as part of "make check") is GPLv2 licensed with
+an exception (which I believe applies to this package) allowing distribution
+under "the same distribution terms that you use for the rest of that program".
+See the comments at the top of that file for more details.
+
+m4/boost.m4 (used to autoconfigure the build against the Boost libraries) is
+GPLv3 licensed with an exception (which I believe applies to this package)
+allowing distribution under "terms of your choice". See the comments at the top
+of that file for more details.
+
+The text below is from Ian Piumarta's lib6502's COPYING file. lib6502-jit
+contains almost all of the code and documentation from lib6502 itself.
+
+As the author of the remaining parts of lib6502-jit, I am granting the same
+permissions and have added my own copyright notice, but the text below is
+otherwise unchanged. 
+
+-- Steven Flintham
+
+
+
+Distasteful though it is for me to have to induce from afar any perturbation
+into your pursuit of happiness, this MIT (X11 flavour) license is at least
+relatively benign.  Investigation into copyright stupidity reveals that it is
+effectively impossible to dedicate (formally) any software to the public
+domain (the only sure path to this most enlightened status being to leave the
+software to expire naturally from its 25-, 50-, 75- or whatever-year copyright
+rot).  I fear this is not going to change before the revolution comes.  In the
+meantime the only way I can *guarantee* you any rights at all to this software
+would (unfortunately) appear to be...
+
+  Copyright (c) 2005 Ian Piumarta
+  Copyright (c) 2014 Steven Flintham
+
+  All rights reserved.
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the 'Software'), to
+  deal in the Software without restriction, including without limitation the
+  rights to use, copy, modify, merge, publish, distribute, and/or sell copies
+  of the Software, and to permit persons to whom the Software is furnished to
+  do so, provided that the above copyright notice(s) and this permission
+  notice appear in all copies or substantial portions of the Software.
+
+  Inclusion of the above copyright notice(s) and this permission notice in
+  supporting documentation would be appreciated, but is not required.
+
+  THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
diff --git a/CREDITS b/CREDITS
new file mode 100644
index 0000000..c7d726b
--- /dev/null
+++ b/CREDITS
@@ -0,0 +1,33 @@
+lib6502-jit implements (nearly) the same API as Ian Piumarta's lib6502
+(http://www.piumarta.com/software/lib6502/) and includes virtually all of
+lib6502's code and documentation with only minor modifications; the lib6502
+emulation code is used to implement the interpreted and hybrid emulation modes
+in lib6502-jit. The contents of the examples and man directories are almost
+verbatim copies of those in lib6502. Thanks to Ian for making lib6502
+available. Please do not send bug reports regarding lib6502-jit to Ian!
+
+This distribution itself doesn't contain any LLVM code, but obviously without
+the LLVM project lib6502-jit could not exist.
+
+valgrind.h is taken from Valgrind (http://valgrind.org/).
+
+build-aux/tap-driver.sh is part of GNU Automake and was taken from
+https://raw.githubusercontent.com/kergoth/automake/master/lib/tap-driver.sh.
+
+m4/boost.m4 (used to autoconfigure the build against the Boost libraries) is
+taken from https://github.com/tsuna/boost.m4.
+
+While I'd be lying if I said I enjoyed working with Autotools, I am grateful
+for the work people have put in to make it possible to build packages portably
+on a range of different platforms.
+
+The technique (but not the code) used to translate a JITted function's machine
+code into assembly in Function::dump_machine_code() is taken from the libjit
+(https://www.gnu.org/software/libjit/) dump_object_code() function.
+
+The algorithm used to implement ADC/SBC in decimal mode is taken from
+http://www.6502.org/tutorials/decimal_mode.html. The test program on the same
+page was used to validate the implementation.
+
+Klaus Dormann's "6502 functional test" and "65C02 extended opcodes test" were
+used to validate the behaviour of the emulation.
diff --git a/Function.cpp b/Function.cpp
new file mode 100644
index 0000000..d766bec
--- /dev/null
+++ b/Function.cpp
@@ -0,0 +1,417 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "Function.h"
+
+#include <errno.h>
+#include <sstream>
+#include <stdexcept>
+#include <unistd.h>
+#include "valgrind.h"
+
+#include "const.h"
+#include "LLVMStuff.h"
+#include "M6502Internal.h"
+#include "Registers.h"
+#include "util.h"
+
+// Note that we call update_memory_snapshot() after invoking callbacks here, but
+// not before. It would be correct to do so, but it's not necessary. Firstly, we
+// arrange that the memory snapshot is kept up-to-date during execution under
+// our control (i.e. not involving callbacks), so it isn't necessary. Secondly,
+// even if it were necessary, it would be redundant, since any actions needed
+// as a result of the update can wait until after the callback is called and the
+// call after the callback would perform them.
+
+namespace
+{
+    // We have the callback_pc argument to allow us to special-case the
+    // contents of the PC register for lib6502 compatibility. Without this
+    // we would always pass registers.pc, which is "address of the next
+    // instruction to execute if the callback doesn't intervene" in PC;
+    // this agrees with lib6502 for JMP (absolute and indirect) but not for JSR
+    // or BRK.
+    uint16_t handle_call_callback(M6502 *mpu, uint16_t callback_pc, 
+                                  uint8_t opcode)
+    {
+        Registers &registers = mpu->internal->registers_;
+        uint16_t default_next_pc = registers.pc;
+        if (mpu->callbacks->call[registers.pc] != 0) // callback registered?
+        {
+            registers.pc = callback_pc; // the PC value the callback will see
+            registers.to_M6502_Registers(mpu);
+            TRACE("Call callback, mpu " << mpu << ", address 0x" << std::hex << 
+                  std::setfill('0') << std::setw(4) << default_next_pc << 
+                  ", data 0x" << std::setw(2) << static_cast<int>(opcode));
+            uint16_t address = default_next_pc;
+            if (opcode == opcode_brk)
+            {
+                address = callback_pc - 2; // lib6502 does this
+            }
+            int callback_result = 
+                mpu->callbacks->call[default_next_pc](mpu, address, opcode);
+            TRACE("Callback returned 0x" << std::hex << std::setfill('0') <<
+                  std::setw(4) << callback_result);
+            registers.from_M6502_Registers(mpu);
+            mpu->internal->function_manager_.update_memory_snapshot();
+            if (callback_result != 0) // non-zero result replaces the next PC
+            {
+                return callback_result;
+            }
+        }
+        return default_next_pc; // no callback, or the callback returned 0
+    }
+
+    // Read the 16-bit PC held on the stack, skipping 'offset' bytes above S.
+    // S is 8 bits wide, so the indexing wraps within 0x100-0x1ff.
+    uint16_t get_stacked_pc(M6502 *mpu, int offset)
+    {
+        uint8_t sp = mpu->internal->registers_.s;
+        if (offset > 0)
+        {
+            sp = static_cast<uint8_t>(sp + static_cast<unsigned>(offset));
+        }
+        const uint8_t low_at = static_cast<uint8_t>(sp + 1);
+        const uint8_t high_at = static_cast<uint8_t>(sp + 2);
+        const uint8_t pc_low = mpu->memory[0x100 + low_at];
+        const uint8_t pc_high = mpu->memory[0x100 + high_at];
+        return pc_low | (pc_high << 8);
+    }
+
+    // Flag bytes_pushed stack bytes above S as modified, then run the callback.
+    uint16_t handle_push_and_control_transfer_opcode(
+        M6502 *mpu, uint16_t callback_pc, uint8_t opcode, int bytes_pushed)
+    {
+        assert(bytes_pushed >= 2);
+
+        const uint8_t sp = mpu->internal->registers_.s;
+        for (int n = 0; n < bytes_pushed; ++n)
+        {
+            const uint8_t slot = static_cast<uint8_t>(sp + 1u + n);
+            mpu->internal->function_manager_.code_modified_at(0x100 + slot);
+        }
+        return handle_call_callback(mpu, callback_pc, opcode);
+    }
+}
+
+Function::Function(
+    M6502 *mpu, uint16_t address, const AddressSet &code_range, 
+    const AddressSet &optimistic_writes, llvm::Function *llvm_function)
+: mpu_(mpu),
+  llvm_stuff_(mpu->internal->llvm_stuff_),
+  address_(address),
+  code_range_(code_range),
+  optimistic_writes_(optimistic_writes),
+  llvm_function_(llvm_function),
+  jitted_function_(reinterpret_cast<Function::JitFunction>(
+    llvm_stuff_.execution_engine_->getPointerToFunction(llvm_function)))
+{ // NOTE(review): jitted_function_ needs llvm_stuff_ declared before it
+    llvm_stuff_.execution_engine_->runJITOnFunction(llvm_function_, &mci_);
+}
+
+Function::~Function()
+{
+    TRACE("Destructor for Function at address " << std::hex << 
+          std::setfill('0') << std::setw(4) << address_);
+    // Tell Valgrind to discard its translations of the JITted code range.
+    VALGRIND_DISCARD_TRANSLATIONS(mci_.address(), mci_.size());
+    llvm_function_->eraseFromParent(); // drop the LLVM function itself
+}
+
+void Function::handle_complex_result(FunctionBuilder::Result result) const
+{
+    Registers &registers = mpu_->internal->registers_;
+
+    switch (result) // execute() never passes result_control_transfer_direct
+    {
+        case FunctionBuilder::result_control_transfer_direct:
+            CANT_HAPPEN("Direct case reached handle_complex_result()");
+
+        case FunctionBuilder::result_control_transfer_indirect:
+            registers.pc = handle_call_callback(mpu_, registers.pc, 
+                                                registers.data);
+            break;
+
+        case FunctionBuilder::result_brk:
+            registers.pc = handle_push_and_control_transfer_opcode(
+                mpu_, get_stacked_pc(mpu_, 1), opcode_brk, 3);
+            break;
+
+        case FunctionBuilder::result_jsr_complex:
+            registers.pc = handle_push_and_control_transfer_opcode(
+                mpu_, get_stacked_pc(mpu_, 0) + 1, opcode_jsr, 2);
+            break;
+
+        case FunctionBuilder::result_illegal_instruction:
+        {
+            registers.to_M6502_Registers(mpu_);
+            TRACE("Illegal instruction callback, mpu " << mpu_ << 
+                  ", address 0x" << std::hex << std::setfill('0') << 
+                  std::setw(4) << registers.addr << ", data 0x" << 
+                  std::setw(2) << static_cast<int>(registers.data));
+            uint16_t new_pc = 
+                mpu_->callbacks->illegal_instruction[registers.data](
+                    mpu_, registers.addr, registers.data);
+            TRACE("Callback returned 0x" << std::hex << std::setfill('0') <<
+                  std::setw(4) << new_pc);
+            registers.from_M6502_Registers(mpu_);
+            mpu_->internal->function_manager_.update_memory_snapshot();
+            if (new_pc != 0) // a zero result leaves registers.pc as it is
+            {
+                registers.pc = new_pc;
+            }
+            break;
+        }
+
+        case FunctionBuilder::result_write_to_code:
+            TRACE("Code modified at 0x" << std::hex << std::setfill('0') << 
+                  std::setw(4) << registers.addr);
+            mpu_->internal->function_manager_.code_modified_at(registers.addr);
+            break;
+
+        case FunctionBuilder::result_write_callback:
+        {
+            TRACE("Write callback at 0x" << std::hex << std::setfill('0') <<
+                  std::setw(4) << registers.addr << " with data 0x" << 
+                  std::setw(4) << static_cast<int>(registers.data));
+            // We *don't* invoke Registers.{to,from}_M6502Registers() before
+            // and after the callback. We could do this, but lib6502 itself
+            // (and therefore the lib6502 code used for interpreting in
+            // lib6502-jit) doesn't do that, so this could be confusing
+            // for client code. (For example, a callback might be written
+            // to rely on this, it would work if called from compiled code
+            // but wouldn't work if called from interpreted mode. So its
+            // behaviour in hybrid mode would be random.)
+            (void) mpu_->callbacks->write[registers.addr](
+                mpu_, registers.addr, registers.data);
+            mpu_->internal->function_manager_.update_memory_snapshot();
+            break;
+        }
+
+        case FunctionBuilder::result_invalid_bounds:
+            CANT_HAPPEN("Invalid bounds inside Function for address 0x" <<
+                        std::hex << std::setfill('0') << std::setw(4) <<
+                        address_);
+
+        default:
+            CANT_HAPPEN("Unknown result " << result << " from JIT function");
+    }
+}
+
+#ifdef LOG
+
+namespace
+{
+    std::string indent(int n, const std::string &s) // helper for dump_all()
+    {
+        std::string prefix = spaces(n);
+        return apply_prefix(prefix, s);
+    }
+}
+
+std::string Function::dump_all() const
+{
+    const std::string pad = spaces(1); // indent for the section headings
+    std::stringstream s;
+    s << "Function at 0x" << std::hex << std::setfill('0') << std::setw(4) <<
+         address_ << ":\n";
+    s << pad << "Code range:\n" << code_range_.dump(2) << "\n";
+    s << pad << "Optimistic writes at:\n" << optimistic_writes_.dump(2) << "\n";
+    s << pad << "6502 machine code:\n" << indent(2, disassembly_) << "\n";
+    s << pad << "Unoptimised IR:\n" << indent(2, unoptimised_ir_) << "\n";
+    s << pad << "Optimised IR:\n" << indent(2, optimised_ir_) << "\n";
+    s << pad << "Host machine code:\n" << indent(2, dump_machine_code());
+    return s.str();
+}
+
+#endif
+
+namespace
+{
+    // Scope guard: runs close_fn on the handle when destroyed, unless close()
+    // was called first. close() must be called at most once.
+    template <class Handle, class CloseFnType, CloseFnType close_fn>
+    class AutoClose : boost::noncopyable
+    {
+    public:
+        AutoClose(Handle h)
+        : pending_(true), handle_(h)
+        {
+        }
+
+        ~AutoClose()
+        {
+            if (pending_) close_fn(handle_); // failure here is unreportable
+        }
+
+        // Close immediately, handing close_fn's result back to the caller.
+        int close()
+        {
+            pending_ = false;
+            return close_fn(handle_);
+        }
+
+    private:
+        bool pending_;
+        Handle handle_;
+    };
+
+    typedef int (*FdClose)(int);
+    typedef AutoClose<int, FdClose, ::close> FdAutoClose;
+    typedef int (*PopenClose)(FILE *);
+    typedef AutoClose<FILE *, PopenClose, ::pclose> PopenAutoClose;
+}
+
+#ifdef LOG
+
+std::string Function::dump_machine_code() const
+{
+    try
+    {
+        // What a performance! The basic idea of outputting .bytes directives,
+        // assembling those and then disassembling the result is taken from
+        // libjit's dump_object_code(); the implementation is not copied.
+
+        char as_output_file[] = "/tmp/lib6502-jit-XXXXXX";
+
+        errno = 0; // so fail_errno_or() can tell if errno was ever set
+
+        // mkstemp() creates a unique filename and opens it. We unlink the file
+        // immediately so it has no name; this minimises (but does not
+        // eliminate; we might be killed between mkstemp() and unlink()) the
+        // chance of the file being left lying around. Since we need a name for
+        // the 'as' and 'objdump' commands, we use /dev/fd/nn to refer to it
+        // afterwards.
+        int fd = mkstemp(as_output_file);
+        if (fd == -1)
+        {
+            fail_errno_or("mkstemp() failed");
+        }
+        FdAutoClose auto_close_fd(fd); // closes fd on every exit path
+        if (unlink(as_output_file) == -1)
+        {
+            fail_errno_or("unlink() failed");
+        }
+
+        {
+            std::stringstream as_command;
+            as_command << "as -o /dev/fd/" << fd << " 2>/dev/null";
+            FILE *f = popen(as_command.str().c_str(), "w");
+            if (f == 0)
+            {
+                fail_errno_or("popen() failed (for 'as')");
+            }
+            PopenAutoClose auto_close_f(f); // pclose()s on every exit path
+            unsigned char *p = static_cast<unsigned char *>(mci_.address());
+            unsigned char *end = p + mci_.size();
+            for (; p < end; ++p)
+            {
+                if (fprintf(f, ".byte %d\n", *p) < 0) // one directive per byte
+                {
+                    fail("Error writing to 'as' pipe");
+                }
+            }
+            if (auto_close_f.close() != 0) // non-zero pclose() result
+            {
+                fail_errno_or("Error closing 'as' pipe");
+            }
+        }
+
+        if (lseek(fd, 0, SEEK_SET) == static_cast<off_t>(-1)) // rewind fd
+        {
+            fail_errno_or("Error seeking on temporary file");
+        }
+
+        std::stringstream objdump_command;
+        // As far as I can tell, there's no guarantee how mci_.address() [a
+        // pointer type] will be represented in the stringstream, but in
+        // practice this code is not very portable anyway and this is the least
+        // of our worries...
+        objdump_command << "objdump --adjust-vma=" << 
+                           mci_.address() << " -d /dev/fd/" << fd << " 2>&1";
+        FILE *g = popen(objdump_command.str().c_str(), "r");
+        if (g == 0)
+        {
+            fail_errno_or("popen() failed (for 'objdump')");
+        }
+        PopenAutoClose auto_close_g(g);
+
+        std::stringstream code;
+        char buffer[1024];
+        size_t bytes_read;
+        while ((bytes_read = fread(buffer, 1, sizeof(buffer), g)) > 0)
+        {
+            code << std::string(buffer, bytes_read);
+        }
+        if (ferror(g)) // fread() returning 0 may mean error rather than EOF
+        {
+            fail("Error reading from 'objdump' pipe");
+        }
+        if (auto_close_g.close() != 0)
+        {
+            fail_errno_or("Error closing 'objdump' pipe");
+        }
+        if (auto_close_fd.close() != 0)
+        {
+            fail_errno_or("Error closing temporary file");
+        }
+
+        return code.str();
+    }
+    catch (std::exception &e)
+    {
+        // Dumping out the generated machine code is decidedly not critical, so
+        // we don't allow the exception to propagate.
+        return std::string("Unable to dump machine code: ") + e.what();
+    }
+}
+
+void Function::fail(const std::string &error) const
+{
+    throw std::runtime_error(error); // caught in dump_machine_code()
+}
+
+namespace
+{
+    // strerror_r() exists in two flavours: the GNU one returns the message,
+    // the XSI one returns 0 on success and fills the caller's buffer. These
+    // overloads let the compiler pick whichever matches the platform.
+    inline const char *strerror_text(const char *result, const char *)
+    {
+        return result;
+    }
+    inline const char *strerror_text(int result, const char *buffer)
+    {
+        return (result == 0) ? buffer : 0;
+    }
+}
+
+// Throw via fail(): the errno description if errno is set, otherwise 'error'.
+void Function::fail_errno_or(const std::string &error) const
+{
+    if (errno == 0)
+    {
+        fail(error);
+    }
+    char buffer[1024]; // only reached with errno != 0, since fail() throws
+    const char *text = 
+        strerror_text(strerror_r(errno, buffer, sizeof(buffer)), buffer);
+    fail((text != 0) ? text : 
+         "Error occurred, and strerror() probably failed as well");
+}
+
+#endif
diff --git a/Function.h b/Function.h
new file mode 100644
index 0000000..63fd6e8
--- /dev/null
+++ b/Function.h
@@ -0,0 +1,121 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef FUNCTION_H
+#define FUNCTION_H
+
+#include <boost/shared_ptr.hpp>
+#include <boost/utility.hpp>
+#include "llvm/CodeGen/MachineCodeInfo.h"
+#include "llvm/IR/Value.h"
+#include <string>
+
+#include "AddressSet.h"
+#include "FunctionBuilder.h"
+#include "lib6502.h"
+
+struct LLVMStuff;
+
+// A Function wraps the JIT-compiled code for the 6502 code starting at one
+// address. FunctionBuilder::build() creates these.
+class Function : boost::noncopyable
+{
+public:
+    Function(M6502 *mpu, uint16_t address, const AddressSet &code_range, 
+             const AddressSet &optimistic_writes, 
+             llvm::Function *llvm_function);
+    ~Function();
+
+    uint16_t address() const
+    {
+        return address_;
+    }
+
+    const AddressSet &code_range() const
+    {
+        return code_range_;
+    }
+
+    const AddressSet &optimistic_writes() const
+    {
+        return optimistic_writes_;
+    }
+
+    // Call the JITted code. A result of result_control_transfer_direct needs
+    // no further action here; anything else is passed on to
+    // handle_complex_result().
+    void execute() const
+    {
+        FunctionBuilder::Result result = 
+            static_cast<FunctionBuilder::Result>((*jitted_function_)());
+        if (result != FunctionBuilder::result_control_transfer_direct)
+        {
+            handle_complex_result(result);
+        }
+    }
+
+    // The following members are only available when LOG is defined.
+    #ifdef LOG
+        void set_disassembly(const std::string &s)
+        {
+            disassembly_ = s;
+        }
+
+        void set_unoptimised_ir(const std::string &s)
+        {
+            unoptimised_ir_ = s;
+        }
+
+        void set_optimised_ir(const std::string &s)
+        {
+            optimised_ir_ = s;
+        }
+
+        std::string dump_all() const;
+
+        std::string dump_machine_code() const;
+    #endif
+
+private:
+    void handle_complex_result(FunctionBuilder::Result result) const;
+
+    #ifdef LOG
+        void fail(const std::string &error) const;
+        void fail_errno_or(const std::string &error) const;
+    #endif
+
+    M6502 *mpu_;
+    LLVMStuff &llvm_stuff_;
+    uint16_t address_;
+    AddressSet code_range_;
+    AddressSet optimistic_writes_;
+    llvm::Function *llvm_function_;
+    llvm::MachineCodeInfo mci_;
+
+    // Type of the JITted code's entry point; execute() converts the int it
+    // returns to a FunctionBuilder::Result.
+    typedef int (*JitFunction)();
+    JitFunction jitted_function_;
+    
+    #ifdef LOG
+        std::string disassembly_;
+        std::string unoptimised_ir_;
+        std::string optimised_ir_;
+    #endif
+};
+
+#endif
diff --git a/FunctionBuilder.cpp b/FunctionBuilder.cpp
new file mode 100644
index 0000000..9d7efb8
--- /dev/null
+++ b/FunctionBuilder.cpp
@@ -0,0 +1,3571 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "FunctionBuilder.h"
+
+// Throughout this file we must be careful to avoid incorrect wrap-around
+// handling; for example, it's wrong to do memory[pc + 2] because if pc is
+// 0xffff this will access off the end of memory. We must always use uint16_t
+// intermediate values to get the right wrapping behaviour. Similar
+// considerations apply when using zero-page addressing; we must ensure we wrap
+// around at 0xff.
+
+#include "config.h"
+
+#include <algorithm>
+#include <assert.h>
+#include <iomanip>
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/JIT.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/TypeBuilder.h"
+
+#if defined HAVE_LLVM_ANALYSIS_VERIFIER_H
+    #include "llvm/Analysis/Verifier.h"
+#elif defined HAVE_LLVM_IR_VERIFIER_H
+    #include "llvm/IR/Verifier.h"
+#else
+    #error Need LLVM Verifier.h
+#endif
+
+#include "llvm/PassManager.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <sstream>
+
+#include "AddressRange.h"
+#include "const.h"
+#include "Function.h"
+#include "LLVMStuff.h"
+#include "M6502Internal.h"
+#include "Registers.h"
+#include "util.h"
+
+
+
+namespace llvm
+{
+    // Teach TypeBuilder about M6502. Only a named struct type is created;
+    // none of its members are described.
+    template<bool xcompile> class TypeBuilder<M6502, xcompile>
+    {
+    public:
+        static StructType *get(LLVMContext &context)
+        {
+            static StructType *t = StructType::create(context, "M6502");
+            return t;
+        }
+    };
+
+    // Teach TypeBuilder about Registers by listing the type of each of its
+    // members; the comments alongside give the corresponding member
+    // names.
+    template<bool xcompile> class TypeBuilder<Registers, xcompile>
+    {
+    public:
+        static StructType *get(LLVMContext &context)
+        {
+            typedef TypeBuilder<types::i<8>, xcompile> U8;
+            typedef TypeBuilder<types::i<16>, xcompile> U16;
+            typedef TypeBuilder<JitBool, xcompile> Flag;
+            static StructType *t = StructType::create("Registers",
+                U8::get(context), U8::get(context),     // a, x
+                U8::get(context), U8::get(context),     // y, s
+                Flag::get(context), Flag::get(context), // flag_n, flag_v
+                Flag::get(context), Flag::get(context), // flag_d, flag_i
+                Flag::get(context), Flag::get(context), // flag_z, flag_c
+                U16::get(context), U16::get(context),   // pc, addr
+                U8::get(context),                       // data
+                NULL);
+            return t;
+        }
+    };
+}
+
+namespace
+{
+    const std::string hex_prefix = "&";
+
+    // Returns true if 'callbacks' has a non-null entry for at least one of
+    // the addresses in 'bounds', false otherwise. The scan stops at the
+    // first such entry.
+    bool callback_in_bounds(const M6502_Callback *callbacks, 
+                            const AddressRange &bounds)
+    {
+        AddressRange::const_iterator pos = bounds.begin();
+        while ((pos != bounds.end()) && (callbacks[*pos] == 0))
+        {
+            ++pos;
+        }
+        return pos != bounds.end();
+    }
+}
+
+
+
+// BoundedAddress contains an llvm::Value of type i16 which refers to
+// an address in the emulated memory. It additionally contains a range of
+// possible addresses which the llvm::Value can evaluate to (derived from the
+// addressing mode which created it). This is used to optimise the generated
+// code.
+
+class FunctionBuilder::BoundedAddress
+{
+public:
+    // Use the widest possible bounds. That can never be wrong, but it
+    // leaves the least scope for optimisation, so the other constructor
+    // is preferable whenever tighter bounds are known.
+    BoundedAddress(FunctionBuilder &fb, llvm::Value *addr);
+
+    // Use the bounds supplied by the caller.
+    BoundedAddress(FunctionBuilder &fb, llvm::Value *addr, 
+                   const AddressRange &bounds);
+
+    // The i16 llvm::Value which evaluates to an address in the emulated
+    // memory.
+    llvm::Value *addr() const { return addr_; }
+
+    // The range of addresses which addr() may evaluate to.
+    const AddressRange &bounds() const { return bounds_; }
+
+    // Write the bounds to 's' as a half-open interval in hex. The text is
+    // built in a separate stream first so the hex and fill settings are
+    // not applied to 's' itself.
+    friend
+    std::ostream &operator<<(std::ostream &s, const BoundedAddress &ba)
+    {
+        const AddressRange &range = ba.bounds();
+        std::stringstream text;
+        text << "[0x" << std::hex << std::setfill('0');
+        text << std::setw(4) << range.range_begin();
+        text << ", 0x" << std::setw(4) << range.range_end() << ")";
+        return s << text.str();
+    }
+
+private:
+    llvm::Value *addr_;
+    AddressRange bounds_;
+};
+
+FunctionBuilder::BoundedAddress::BoundedAddress(
+    FunctionBuilder &fb, llvm::Value *addr)
+: addr_(addr), bounds_(0, memory_size) // i.e. any address at all
+{
+    assert(fb.i16_type_ == addr->getType());
+}
+
+// Construct with caller-supplied bounds. Unless NDEBUG is defined the claim is
+// checked: immediately for a constant address, by generated code otherwise.
+FunctionBuilder::BoundedAddress::BoundedAddress(
+    FunctionBuilder &fb, llvm::Value *addr, const AddressRange &bounds)
+: addr_(addr), bounds_(bounds)
+{
+    assert(addr->getType() == fb.i16_type_);
+
+    // Everything below exists only to verify the bounds, so it is compiled
+    // out along with assert() when NDEBUG is defined.
+#ifndef NDEBUG
+    llvm::ConstantInt *constant = llvm::dyn_cast<llvm::ConstantInt>(addr);
+    if (constant != 0)
+    {
+        // A constant address can be checked here and now; the bounds must
+        // cover exactly that one address.
+        uint16_t value = constant->getLimitedValue();
+        assert(value == bounds.range_begin());
+        assert(value == (bounds.range_end() - 1));
+        return;
+    }
+
+    // Otherwise we can't check here, so generate code to check at runtime.
+    // It compares 'addr' with the first address in the bounds and then, if
+    // that doesn't settle things, with the last.
+    llvm::BasicBlock *maybe_ok = llvm::BasicBlock::Create(
+        fb.context_, "bounds_maybe_ok_block", fb.llvm_function_);
+    llvm::BasicBlock *not_ok = 
+        llvm::BasicBlock::Create(fb.context_, "bounds_not_ok");
+    llvm::BasicBlock *ok = llvm::BasicBlock::Create(fb.context_, "bounds_ok");
+
+    // Where to go when the first comparison succeeds or fails.
+    llvm::BasicBlock *at_or_above_begin;
+    llvm::BasicBlock *below_begin;
+    if (bounds.range_end() <= memory_size)
+    {
+        TRACE("Generating bounds check code for non-wrapped case");
+        // The address must not be past the end of the bounds either.
+        at_or_above_begin = maybe_ok;
+        below_begin = not_ok;
+    }
+    else
+    {
+        TRACE("Generating bounds check code for wrapped case");
+        // Anything from range_begin() up to the top of memory is fine;
+        // lower addresses are fine only if the bounds wrap round that far.
+        at_or_above_begin = ok;
+        below_begin = maybe_ok;
+    }
+
+    llvm::Value *first_check = fb.builder_.CreateICmpUGE(
+        addr, fb.constant_u16(bounds.range_begin()));
+    fb.builder_.CreateCondBr(first_check, at_or_above_begin, below_begin);
+
+    fb.builder_.SetInsertPoint(maybe_ok);
+    // In the wrapped case we want range_end() - 1 truncated to 16 bits here.
+    llvm::Value *second_check = fb.builder_.CreateICmpULE(
+        addr, fb.constant_u16(bounds.range_end() - 1));
+    fb.builder_.CreateCondBr(second_check, ok, not_ok);
+
+    fb.llvm_function_->getBasicBlockList().push_back(not_ok);
+    fb.builder_.SetInsertPoint(not_ok);
+    fb.return_invalid_bounds();
+
+    // Leave the builder in the block reached when the address is in bounds.
+    fb.llvm_function_->getBasicBlockList().push_back(ok);
+    fb.builder_.SetInsertPoint(ok);
+#endif
+}
+
+
+
+FunctionBuilder::FunctionBuilder(
+    M6502 *mpu, const uint8_t *ct_memory, JitBool *code_at_address, 
+    uint16_t address)
+: built_(false),
+  mpu_(mpu),
+  code_at_address_(code_at_address),
+  address_(address),
+  ct_memory_(ct_memory),
+  callbacks_(*(mpu->callbacks)),
+  instructions_(0),
+  max_instructions_(std::max(1, mpu->internal->max_instructions_)),
+  context_(llvm::getGlobalContext()),
+  native_int_type_(llvm::TypeBuilder<int, false>::get(context_)),
+  callback_type_(llvm::TypeBuilder<M6502_Callback, false>::get(context_)),
+  i1_type_(llvm::TypeBuilder<llvm::types::i<1>, false>::get(context_)),
+  i8_type_(llvm::TypeBuilder<llvm::types::i<8>, false>::get(context_)),
+  i16_type_(llvm::TypeBuilder<llvm::types::i<16>, false>::get(context_)),
+  i32_type_(llvm::TypeBuilder<llvm::types::i<32>, false>::get(context_)),
+  i64_type_(llvm::TypeBuilder<llvm::types::i<64>, false>::get(context_)),
+  jit_bool_type_(llvm::TypeBuilder<JitBool, false>::get(context_)),
+  builder_(mpu_->internal->llvm_stuff_.builder_),
+  address_block_(),
+  code_generated_for_address_()
+{
+    // The LLVM function is named "x" followed by the start address in hex.
+    llvm::FunctionType *type = llvm::TypeBuilder<int(), false>::get(context_);
+    std::stringstream label;
+    label << "x" << std::hex << std::setw(4) << std::setfill('0') << address_;
+    llvm_function_ = llvm::Function::Create(
+        type, llvm::Function::PrivateLinkage, label.str(), 
+        mpu_->internal->llvm_stuff_.module_.get());
+
+    // Everything emitted by the rest of this constructor forms the prologue.
+    llvm::BasicBlock *prologue = 
+        llvm::BasicBlock::Create(context_, "prologue", llvm_function_);
+    builder_.SetInsertPoint(prologue);
+
+    // constant_ptr() presumably turns each host pointer into an LLVM constant.
+    mpu_llvm_ = constant_ptr(mpu, "mpu");
+    code_at_address_llvm_ = constant_ptr(code_at_address, "code_at_address");
+    registers_ = constant_ptr(&(mpu->internal->registers_), "registers");
+    read_callbacks_ = constant_ptr(callbacks_.read, "read_callbacks");
+    write_callbacks_ = constant_ptr(callbacks_.write, "write_callbacks");
+    call_callbacks_ = constant_ptr(callbacks_.call, "call_callbacks");
+    memory_base_ = constant_ptr(mpu->memory, "memory");
+
+    function_result_ = 
+        builder_.CreateAlloca(native_int_type_, 0, "function_result");
+
+    // Copy each register from Registers into a local variable. The epilogue
+    // copies back only the ones which get modified, and the LLVM optimiser
+    // can then remove loads of values which turn out not to be used.
+    initialise_i8_reg(a_     , 0, "a");
+    initialise_i8_reg(x_     , 1, "x");
+    initialise_i8_reg(y_     , 2, "y");
+    initialise_i8_reg(s_     , 3, "s");
+    initialise_jb_reg(flag_n_, 4, "flag_n");
+    initialise_jb_reg(flag_v_, 5, "flag_v");
+    initialise_jb_reg(flag_d_, 6, "flag_d");
+    initialise_jb_reg(flag_i_, 7, "flag_i");
+    initialise_jb_reg(flag_z_, 8, "flag_z");
+    initialise_jb_reg(flag_c_, 9, "flag_c");
+
+    pc_ = builder_.CreateAlloca(i16_type_, 0, "pc");
+    llvm::Value *pc_field = builder_.CreateStructGEP(registers_, 10);
+    builder_.CreateStore(builder_.CreateLoad(pc_field, false, "pc"), pc_);
+
+    // Scratch space for the value produced by a read callback; it is
+    // deliberately left uninitialised.
+    read_callback_result_ = 
+        builder_.CreateAlloca(i8_type_, 0, "read_callback_result");
+
+    // Scratch space for the ADC/SBC implementation; also left uninitialised.
+    p_tmp_ = builder_.CreateAlloca(i8_type_, 0, "p_tmp");
+    l_tmp_ = builder_.CreateAlloca(i8_type_, 0, "l_tmp");
+    s_tmp_ = builder_.CreateAlloca(i16_type_, 0, "s_tmp");
+    t_tmp_ = builder_.CreateAlloca(i16_type_, 0, "t_tmp");
+
+    // Left unattached for now; build() adds it to the function at the end.
+    epilogue_ = llvm::BasicBlock::Create(context_, "epilogue");
+}
+
+// The Register objects are initialised using these functions instead of
+// constructors mainly because we need a builder_ with an associated BasicBlock
+// to initialise a Register, and we don't have that when the FunctionBuilder
+// object is first constructed.
+
+void FunctionBuilder::initialise_i8_reg(
+    Register &r, int structure_index, const std::string &name)
+{
+    // Give the register an i8 stack slot seeded from its Registers field.
+    llvm::Value *slot = builder_.CreateAlloca(i8_type_, 0, name);
+    llvm::Value *field = 
+        builder_.CreateStructGEP(registers_, structure_index);
+    builder_.CreateStore(builder_.CreateLoad(field, false, name), slot);
+    r.modified_ = false;
+    r.v_ = slot;
+}
+
+void FunctionBuilder::initialise_jb_reg(
+    Register &r, int structure_index, const std::string &name)
+{
+    // Give the flag a JitBool stack slot seeded from its Registers field.
+    llvm::Value *slot = builder_.CreateAlloca(jit_bool_type_, 0, name);
+    llvm::Value *field = 
+        builder_.CreateStructGEP(registers_, structure_index);
+    builder_.CreateStore(builder_.CreateLoad(field, false, name), slot);
+    r.modified_ = false;
+    r.v_ = slot;
+}
+
+void FunctionBuilder::ensure_address_block_created(uint16_t addr)
+{
+    llvm::BasicBlock *&block = address_block_[addr];
+    if (block == 0)
+    {
+        std::stringstream name;
+        name << "l" << std::hex << std::setw(4) << std::setfill('0') << addr;
+        block = llvm::BasicBlock::Create(context_, name.str(), llvm_function_);
+    }
+}
+
+// Translate the 6502 code reachable from address_ (up to the instruction
+// limit) into the LLVM function, optimise it and wrap the result in a
+// Function. This may only be called once per FunctionBuilder.
+boost::shared_ptr<Function> FunctionBuilder::build()
+{
+    // This can't be invoked twice on the same FunctionBuilder object;
+    // at present, for example, attempts to insert into 'epilogue_' crash
+    // (presumably because it's been used to generate code already). There
+    // is no reason to do this and I'm not going to convolute things to make
+    // this pointless case work. Even asserting that this doesn't happen
+    // seems like overkill, but let's do it anyway.
+    assert(!built_);
+
+    // While it doesn't strictly matter, the fact that pending_ is a std::set
+    // means it will internally sort the addresses. This makes it more likely
+    // that multiple backward jumps will only result in one stretch of code
+    // being produced, since the furthest jump backwards will be JITted first.
+    pending_.insert(address_);
+    while (!pending_.empty())
+    {
+        // We take addresses to JIT at from pending_ to start with, and when
+        // there's no "better" address...
+        uint16_t ct_pc = *(pending_.begin());
+
+        // ... but if we can continue JITting where we left off, we prefer
+        // to do that. Since each block of code emitted by build_at() is
+        // independent, this doesn't alter the behaviour of the generated
+        // code, but it avoids gratuitous discontinuities in the generated
+        // code compared with the source machine code.
+        do
+        {
+            pending_.erase(ct_pc);
+            uint16_t new_ct_pc = build_at(ct_pc);
+            if (new_ct_pc == ct_pc)
+            {
+                // build_at() did no work.
+            }
+            else if (new_ct_pc > ct_pc)
+            {
+                code_range_.insert(AddressRange(ct_pc, new_ct_pc));
+            }
+            else
+            {
+                // PC wrapped around during the translation.
+                uint32_t range_end = new_ct_pc;
+                range_end += memory_size;
+                code_range_.insert(AddressRange(ct_pc, range_end));
+            }
+            ct_pc = new_ct_pc;
+        }
+        while (pending_.find(ct_pc) != pending_.end());
+    }
+
+    // Set up the optimisation passes to run over the finished function.
+    LLVMStuff &llvm_stuff = mpu_->internal->llvm_stuff_;
+    llvm::FunctionPassManager fpm(llvm_stuff.module_.get());
+
+#ifdef HAVE_LLVM_DATA_LAYOUT_PASS
+    fpm.add(new llvm::DataLayoutPass(llvm_stuff.module_.get()));
+#else
+    fpm.add(
+        new llvm::DataLayout(*llvm_stuff.execution_engine_->getDataLayout()));
+#endif
+    fpm.add(llvm::createBasicAliasAnalysisPass());
+    fpm.add(llvm::createPromoteMemoryToRegisterPass());
+    fpm.add(llvm::createInstructionCombiningPass());
+    fpm.add(llvm::createReassociatePass());
+    fpm.add(llvm::createGVNPass());
+    fpm.add(llvm::createCFGSimplificationPass());
+    fpm.doInitialization();
+
+    // We could have passed llvm_function_ to BasicBlock::Create() earlier
+    // and then we wouldn't need to do this push_back() here, but doing
+    // this means the epilogue appears at the end of the IR. It makes no
+    // functional difference but it seems slightly more logical to read.
+    llvm_function_->getBasicBlockList().push_back(epilogue_);
+
+    // Epilogue: copy the registers which the translated code modifies back
+    // into Registers (the PC is always copied back) and return the result.
+    builder_.SetInsertPoint(epilogue_);
+    if (a_.modified_)
+    {
+        builder_.CreateStore(builder_.CreateLoad(a_.v_), 
+                             builder_.CreateStructGEP(registers_, 0));
+    }
+    if (x_.modified_)
+    {
+        builder_.CreateStore(builder_.CreateLoad(x_.v_), 
+                             builder_.CreateStructGEP(registers_, 1));
+    }
+    if (y_.modified_)
+    {
+        builder_.CreateStore(builder_.CreateLoad(y_.v_), 
+                             builder_.CreateStructGEP(registers_, 2));
+    }
+    if (s_.modified_)
+    {
+        builder_.CreateStore(builder_.CreateLoad(s_.v_), 
+                             builder_.CreateStructGEP(registers_, 3));
+    }
+    if (flag_n_.modified_)
+    {
+        builder_.CreateStore(register_load(flag_n_), 
+                             builder_.CreateStructGEP(registers_, 4));
+    }
+    if (flag_v_.modified_)
+    {
+        builder_.CreateStore(register_load(flag_v_), 
+                             builder_.CreateStructGEP(registers_, 5));
+    }
+    if (flag_d_.modified_)
+    {
+        builder_.CreateStore(register_load(flag_d_), 
+                             builder_.CreateStructGEP(registers_, 6));
+    }
+    if (flag_i_.modified_)
+    {
+        builder_.CreateStore(register_load(flag_i_), 
+                             builder_.CreateStructGEP(registers_, 7));
+    }
+    if (flag_z_.modified_)
+    {
+        builder_.CreateStore(register_load(flag_z_), 
+                             builder_.CreateStructGEP(registers_, 8));
+    }
+    if (flag_c_.modified_)
+    {
+        builder_.CreateStore(register_load(flag_c_), 
+                             builder_.CreateStructGEP(registers_, 9));
+    }
+    builder_.CreateStore(builder_.CreateLoad(pc_), 
+                         builder_.CreateStructGEP(registers_, 10));
+
+    builder_.CreateRet(builder_.CreateLoad(function_result_));
+
+    #ifdef LOG
+        std::string unoptimised_ir;
+        {
+            llvm::raw_string_ostream s(unoptimised_ir);
+            llvm_function_->print(s);
+            s.str(); // flushes the text into unoptimised_ir
+        }
+    #endif
+    // verifyFunction() returns true if it finds the IR to be broken. Not all
+    // LLVM versions abort inside the call by default, so check the result.
+    const bool ir_broken = llvm::verifyFunction(*llvm_function_);
+    assert(!ir_broken);
+    (void) ir_broken; // otherwise unused if NDEBUG is defined
+
+    fpm.run(*llvm_function_);
+    #ifdef LOG
+        std::string optimised_ir;
+        {
+            llvm::raw_string_ostream s(optimised_ir);
+            llvm_function_->print(s);
+            s.str(); // flushes the text into optimised_ir
+        }
+    #endif
+
+    // Wrap the optimised LLVM function in a Function object for the caller.
+    boost::shared_ptr<Function> f(
+        new Function(mpu_, address_, code_range_, optimistic_writes_, 
+                     llvm_function_));
+    #ifdef LOG
+        f->set_disassembly(disassembly_.str());
+        f->set_unoptimised_ir(unoptimised_ir);
+        f->set_optimised_ir(optimised_ir);
+    #endif
+
+    built_ = true;
+    return f;
+}
+
+// This translates a linear stream of 6502 instructions into LLVM IR. The
+// generation stops either when we've translated enough 6502 instructions
+// or when we hit an instruction which unconditionally transfers control
+// elsewhere. Branch targets found during the translation are added to pending_
+// for further consideration; at a minimum, address_block[] entries with
+// associated code to transfer control to those addresses must be generated
+// for each of these before terminating the build process for the function.
+//
+// The address of the first byte not translated is returned.
+uint16_t FunctionBuilder::build_at(uint16_t ct_pc)
+{
+    TRACE("Translating linear stream of instructions at 0x" << std::hex <<
+          std::setfill('0') << std::setw(4) << ct_pc);
+
+    const uint16_t original_ct_pc = ct_pc;
+    // If we already translated this stretch of code, we don't need to do
+    // anything at all.
+    if (code_generated_for_address_[ct_pc])
+    {
+        TRACE("Already translated this linear stream");
+        return ct_pc;
+    }
+
+    while (true)
+    {
+        TRACE("Translating at 0x" << std::hex << std::setfill('0') << 
+              std::setw(4) << ct_pc << ", opcode 0x" << std::setw(2) <<
+              static_cast<int>(ct_memory_[ct_pc]));
+
+        const uint16_t this_opcode_at = ct_pc;
+
+        if (code_generated_for_address_[ct_pc])
+        {
+            // We already translated this instruction, so we can stop
+            // translating and just jump there. Since this is just linear
+            // flow of control from the perspective of the 6502 code, this
+            // cannot trigger a call callback.
+            TRACE("Already translated this instruction");
+            if (builder_.GetInsertBlock()->getTerminator() == 0)
+            {
+                control_transfer_to(constant_u16(ct_pc), opcode_implicit);
+            }
+            break;
+        }
+
+        // Each instruction forms its own basic block (since we build up the
+        // IR as we go, we can't know where we might want to branch into,
+        // so we cannot merge multiple instructions into a single basic
+        // block). Basic blocks must end with a terminator, so if there isn't
+        // already a terminator at the end of the previous instruction's basic
+        // block, we insert an unconditional branch to this instruction's
+        // basic block. If there is already a terminator, we stop translating
+        // this stream of instructions unless this is the first instruction
+        // in this linear sequence; this way we avoid generating unreachable
+        // code if the previous instruction (for example) returned some kind
+        // of status code to our caller. (If the following instruction is
+        // reachable in some other way, it will be translated separately -
+        // as the first instruction in a linear sequence - because it will
+        // be present in pending.)
+        bool insert_block_has_terminator = 
+            (builder_.GetInsertBlock()->getTerminator() != 0);
+        if (insert_block_has_terminator && (ct_pc != original_ct_pc))
+        {
+            TRACE("Not translating as not first instruction in linear stream "
+                  "and previous instruction's basic block has a terminator");
+            break;
+        }
+        ensure_address_block_created(ct_pc);
+        if (!insert_block_has_terminator)
+        {
+            builder_.CreateBr(address_block_[ct_pc]);
+        }
+        builder_.SetInsertPoint(address_block_[ct_pc]);
+
+        // Note that we only set this flag for the opcode byte, not the
+        // whole length of the instruction. Apart from being easiest,
+        // this is actually correct. Someone might do LDA #<opcode for
+        // LDA #>:STA <opcode for RTS> or something weird like that and
+        // interleave instructions.
+        code_generated_for_address_[ct_pc] = true;
+
+        if (instructions_ >= max_instructions_)
+        {
+            TRACE("Translated maximum number of instructions");
+            // We must *not* use control_transfer_to() here; it would see
+            // that we have set code_generated_for_address_ and generate a
+            // branch to here, i.e. an infinite loop. It is correct that we
+            // have set code_generated_for_address_ since we must set that
+            // if we generate a corresponding address_block entry and we must
+            // do that so that any branches to this address can be resolved.
+            return_control_transfer_direct(constant_u16(ct_pc));
+            break;
+        }
+        ++instructions_;
+
+        uint8_t opcode = ct_memory_[ct_pc];
+        if (opcode == opcode_brk)
+        {
+            disassemble1(ct_pc, "BRK");
+
+            llvm::Value *new_pc_low = memory_read(abs(0xfffe));
+            llvm::Value *new_pc_high = memory_read(abs(0xffff));
+            llvm::Value *new_pc = create_u16(new_pc_low, new_pc_high);
+
+            // Because BRK pushes three bytes onto the stack, we devolve
+            // responsibility for checking for code living on the stack
+            // being modified to our caller (by returning result_brk), so
+            // we use push*raw() here. (We don't support optimistic writes;
+            // BRK isn't performance critical so there's no payoff for the
+            // extra complexity.)
+ 
+            uint16_t pc_to_stack = this_opcode_at + 2;
+            push_u16_raw(pc_to_stack);
+
+            llvm::Value *p = flag_byte();
+            p = builder_.CreateOr(p, constant_u8(flagB | flagX));
+            push_u8_raw(p);
+
+            register_store(constant_jb(jit_bool_true), flag_i_);
+            register_store(constant_jb(jit_bool_false), flag_d_);
+
+            return_brk(new_pc);
+        }
+        else if (opcode == 0x01)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ORA (", operand, ",X)");
+            ora(memory_read(
+                zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x02)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x03)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x04)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "TSB ", operand);
+            memory_op(&FunctionBuilder::tsb, zp(operand), ct_pc);
+        }
+        else if (opcode == 0x05)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ORA ", operand);
+            ora(memory_read(zp(operand)));
+        }
+        else if (opcode == 0x06)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ASL ", operand);
+            memory_op(&FunctionBuilder::asl, zp(operand), ct_pc);
+        }
+        else if (opcode == 0x07)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x08)
+        {
+            disassemble1(ct_pc, "PHP");
+
+            llvm::Value *p = flag_byte();
+            p = builder_.CreateOr(p, constant_u8(flagB | flagX));
+            push_u8(p, ct_pc);
+        }
+        else if (opcode == 0x09)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ORA #", operand);
+            ora(constant_u8(operand));
+        }
+        else if (opcode == 0x0a)
+        {
+            disassemble1(ct_pc, "ASL A");
+            register_op(&FunctionBuilder::asl, a_);
+        }
+        else if (opcode == 0x0b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x0c)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "TSB ", operand);
+            memory_op(&FunctionBuilder::tsb, abs(operand), ct_pc);
+        }
+        else if (opcode == 0x0d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ORA ", operand);
+            ora(memory_read(abs(operand)));
+        }
+        else if (opcode == 0x0e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ASL ", operand);
+            memory_op(&FunctionBuilder::asl, abs(operand), ct_pc);
+        }
+        else if (opcode == 0x0f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bpl)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BPL ", target);
+            pending_.insert(target);
+            branch(flag_n_, false, target);
+        }
+        else if (opcode == 0x11)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ORA (", operand, "),Y");
+            ora(memory_read(
+                zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0x12)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ORA (", operand, ")");
+            ora(memory_read(
+                zp_post_index(constant_u8(operand), constant_u8(0))));
+        }
+        else if (opcode == 0x13)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x14)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "TRB ", operand);
+            memory_op(&FunctionBuilder::trb, zp(operand), ct_pc);
+        }
+        else if (opcode == 0x15)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ORA ", operand, ",X");
+            ora(memory_read(zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x16)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ASL ", operand, ",X");
+            memory_op(&FunctionBuilder::asl, 
+                      zp_index(constant_u8(operand), register_load(x_)), ct_pc);
+        }
+        else if (opcode == 0x17)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x18)
+        {
+            disassemble1(ct_pc, "CLC");
+            register_store(constant_jb(jit_bool_false), flag_c_);
+        }
+        else if (opcode == 0x19)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ORA ", operand, ",Y");
+            ora(memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0x1a)
+        {
+            disassemble1(ct_pc, "INC A");
+            register_op(&FunctionBuilder::inc, a_);
+        }
+        else if (opcode == 0x1b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x1c)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "TRB ", operand);
+            memory_op(&FunctionBuilder::trb, abs(operand), ct_pc);
+        }
+        else if (opcode == 0x1d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ORA ", operand, ",X");
+            ora(memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0x1e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ASL ", operand, ",X");
+            memory_op(
+                &FunctionBuilder::asl, 
+                abs_index(constant_u16(operand), register_load(x_)), 
+                ct_pc);
+        }
+        else if (opcode == 0x1f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_jsr)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "JSR ", operand);
+            uint16_t mangled_return_addr = ct_pc - 1;
+
+            // We are pushing two bytes onto the stack here and possibly
+            // requiring our caller to handle the control transfer, so the
+            // standard mechanisms for handling writes to code and control
+            // transfer aren't enough. control_transfer_to() contains special
+            // logic for JSR and we just use push_u16_raw() here.
+            push_u16_raw(mangled_return_addr);
+
+            // We generally want to translate the subroutine code into
+            // this function, so control_transfer_to() can perform the
+            // control transfer with a simple branch. However, if there is
+            // a call callback, control_transfer_to() will have to arrange
+            // a control transfer via the generated function's caller. It
+            // would be strictly harmless for us to translate the subroutine
+            // code anyway, as it will just never be executed, but doing so
+            // is pointless and makes the generated IR less readable (it
+            // has a superficially buggy appearance, since it will show a
+            // translation of possibly junk code at the callback address
+            // which may never actually execute).
+            bool is_call_callback = (callbacks_.call[operand] != 0);
+            if (!is_call_callback)
+            {
+                pending_.insert(operand);
+
+                // We can predict that the RTS in the subroutine we are
+                // about to call will return to the immediately following
+                // instruction.  (This is not guaranteed; the subroutine
+                // might fiddle with the stack. If that happens the "code"
+                // at ct_pc might be junk, but that's an acceptable risk;
+                // we will translate it but it will never be executed, and
+                // any stream of bytes can be translated even if the code
+                // is nonsense.)
+                pending_.insert(ct_pc);
+                predicted_rts_targets_[operand].insert(ct_pc);
+            }
+
+            control_transfer_to(constant_u16(operand), opcode);
+        }
+        else if (opcode == 0x21)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "AND (", operand, ",X)");
+            And(memory_read(
+                zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x22)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x23)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x24)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "BIT ", operand);
+            bit(memory_read(zp(operand)));
+        }
+        else if (opcode == 0x25)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "AND ", operand);
+            And(memory_read(zp(operand)));
+        }
+        else if (opcode == 0x26)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ROL ", operand);
+            memory_op(&FunctionBuilder::rol, zp(operand), ct_pc);
+        }
+        else if (opcode == 0x27)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x28)
+        {
+            disassemble1(ct_pc, "PLP");
+            pop_flags();
+        }
+        else if (opcode == 0x29)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "AND #", operand);
+            And(constant_u8(operand));
+        }
+        else if (opcode == 0x2a)
+        {
+            disassemble1(ct_pc, "ROL A");
+            register_op(&FunctionBuilder::rol, a_);
+        }
+        else if (opcode == 0x2b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x2c)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "BIT ", operand);
+            bit(memory_read(abs(operand)));
+        }
+        else if (opcode == 0x2d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "AND ", operand);
+            And(memory_read(abs(operand)));
+        }
+        else if (opcode == 0x2e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ROL ", operand);
+            memory_op(&FunctionBuilder::rol, abs(operand), ct_pc);
+        }
+        else if (opcode == 0x2f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bmi)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BMI ", target);
+            pending_.insert(target);
+            branch(flag_n_, true, target);
+        }
+        else if (opcode == 0x31)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "AND (", operand, "),Y");
+            And(memory_read(
+                zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0x32)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "AND (", operand, ")");
+            And(memory_read(
+                zp_post_index(constant_u8(operand), constant_u8(0))));
+        }
+        else if (opcode == 0x33)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x34)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "BIT ", operand, ",X");
+            bit(memory_read(zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x35)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "AND ", operand, ",X");
+            And(memory_read(zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x36)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ROL ", operand, ",X");
+            memory_op(&FunctionBuilder::rol, 
+                      zp_index(constant_u8(operand), register_load(x_)), ct_pc);
+        }
+        else if (opcode == 0x37)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x38)
+        {
+            disassemble1(ct_pc, "SEC");
+            register_store(constant_jb(jit_bool_true), flag_c_);
+        }
+        else if (opcode == 0x39)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "AND ", operand, ",Y");
+            And(memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0x3a)
+        {
+            disassemble1(ct_pc, "DEC A");
+            register_op(&FunctionBuilder::dec, a_);
+        }
+        else if (opcode == 0x3b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x3c)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "BIT ", operand, ",X");
+            bit(memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0x3d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "AND ", operand, ",X");
+            And(memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0x3e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ROL ", operand, ",X");
+            memory_op(
+                &FunctionBuilder::rol, 
+                abs_index(constant_u16(operand), register_load(x_)), 
+                ct_pc);
+        }
+        else if (opcode == 0x3f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_rti)
+        {
+            disassemble1(ct_pc, "RTI");
+            pop_flags();
+            llvm::Value *new_pc = pop_u16();
+            control_transfer_to(new_pc, opcode);
+        }
+        else if (opcode == 0x41)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "EOR (", operand, ",X)");
+            eor(memory_read(
+                zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x42)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x43)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x44)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x45)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "EOR ", operand);
+            eor(memory_read(zp(operand)));
+        }
+        else if (opcode == 0x46)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LSR ", operand);
+            memory_op(&FunctionBuilder::lsr, zp(operand), ct_pc);
+        }
+        else if (opcode == 0x47)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x48)
+        {
+            disassemble1(ct_pc, "PHA");
+            push_u8(register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x49)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "EOR #", operand);
+            eor(constant_u8(operand));
+        }
+        else if (opcode == 0x4a)
+        {
+            disassemble1(ct_pc, "LSR A");
+            register_op(&FunctionBuilder::lsr, a_);
+        }
+        else if (opcode == 0x4b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_jmp_abs)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "JMP ", operand);
+            pending_.insert(operand);
+            control_transfer_to(constant_u16(operand), opcode);
+        }
+        else if (opcode == 0x4d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "EOR ", operand);
+            eor(memory_read(abs(operand)));
+        }
+        else if (opcode == 0x4e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LSR ", operand);
+            memory_op(&FunctionBuilder::lsr, abs(operand), ct_pc);
+        }
+        else if (opcode == 0x4f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bvc)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BVC ", target);
+            pending_.insert(target);
+            branch(flag_v_, false, target);
+        }
+        else if (opcode == 0x51)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "EOR (", operand, "),Y");
+            eor(memory_read(
+                zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0x52)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "EOR (", operand, ")");
+            eor(memory_read(
+                zp_post_index(constant_u8(operand), constant_u8(0))));
+        }
+        else if (opcode == 0x53)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x54)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x55)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "EOR ", operand, ",X");
+            eor(memory_read(zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x56)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LSR ", operand, ",X");
+            memory_op(&FunctionBuilder::lsr, 
+                      zp_index(constant_u8(operand), register_load(x_)), ct_pc);
+        }
+        else if (opcode == 0x57)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x58)
+        {
+            disassemble1(ct_pc, "CLI");
+            register_store(constant_jb(jit_bool_false), flag_i_);
+        }
+        else if (opcode == 0x59)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "EOR ", operand, ",Y");
+            eor(memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0x5a)
+        {
+            disassemble1(ct_pc, "PHY");
+            push_u8(register_load(y_), ct_pc);
+        }
+        else if (opcode == 0x5b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x5c)
+        {
+            illegal_instruction(ct_pc, 3);
+        }
+        else if (opcode == 0x5d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "EOR ", operand, ",X");
+            eor(memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0x5e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LSR ", operand, ",X");
+            memory_op(
+                &FunctionBuilder::lsr, 
+                abs_index(constant_u16(operand), register_load(x_)), 
+                ct_pc);
+        }
+        else if (opcode == 0x5f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_rts)
+        {
+            disassemble1(ct_pc, "RTS");
+            llvm::Value *new_pc = check_predicted_rts(original_ct_pc);
+            control_transfer_to(new_pc, opcode);
+        }
+        else if (opcode == 0x61)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ADC (", operand, ",X)");
+            adc(memory_read(
+                zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x62)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x63)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x64)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STZ ", operand);
+            memory_write(zp(operand), constant_u8(0), ct_pc);
+        }
+        else if (opcode == 0x65)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ADC ", operand);
+            adc(memory_read(zp(operand)));
+        }
+        else if (opcode == 0x66)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ROR ", operand);
+            memory_op(&FunctionBuilder::ror, zp(operand), ct_pc);
+        }
+        else if (opcode == 0x67)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x68)
+        {
+            disassemble1(ct_pc, "PLA");
+            llvm::Value *data = pop_u8();
+            register_store(data, a_);
+            set_nz(data);
+        }
+        else if (opcode == 0x69)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ADC #", operand);
+            adc(constant_u8(operand));
+        }
+        else if (opcode == 0x6a)
+        {
+            disassemble1(ct_pc, "ROR A");
+            register_op(&FunctionBuilder::ror, a_);
+        }
+        else if (opcode == 0x6b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_jmp_ind_abs)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "JMP (", operand, ")");
+            llvm::Value *low_byte = memory_read_untrapped(abs(operand));
+            // We're emulating the 65C02 here so we don't wrap if operand
+            // is of the form &xxFF. (Unless xx is FF, of course.)
+            uint16_t high_byte_at = operand + 1;
+            llvm::Value *high_byte = memory_read_untrapped(abs(high_byte_at));
+            llvm::Value *new_pc = create_u16(low_byte, high_byte);
+            control_transfer_to(new_pc, opcode);
+        }
+        else if (opcode == 0x6d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ADC ", operand);
+            adc(memory_read(abs(operand)));
+        }
+        else if (opcode == 0x6e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ROR ", operand);
+            memory_op(&FunctionBuilder::ror, abs(operand), ct_pc);
+        }
+        else if (opcode == 0x6f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bvs)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BVS ", target);
+            pending_.insert(target);
+            branch(flag_v_, true, target);
+        }
+        else if (opcode == 0x71)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ADC (", operand, "),Y");
+            adc(memory_read(
+                zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0x72)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ADC (", operand, ")");
+            adc(memory_read(
+                zp_post_index(constant_u8(operand), constant_u8(0))));
+        }
+        else if (opcode == 0x73)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x74)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STZ ", operand, ",X");
+            memory_write(zp_index(constant_u8(operand), register_load(x_)), 
+                         constant_u8(0), ct_pc);
+        }
+        else if (opcode == 0x75)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ADC ", operand, ",X");
+            adc(memory_read(zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x76)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ROR ", operand, ",X");
+            memory_op(&FunctionBuilder::ror, 
+                      zp_index(constant_u8(operand), register_load(x_)), ct_pc);
+        }
+        else if (opcode == 0x77)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x78)
+        {
+            disassemble1(ct_pc, "SEI");
+            register_store(constant_jb(jit_bool_true), flag_i_);
+        }
+        else if (opcode == 0x79)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ADC ", operand, ",Y");
+            adc(memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0x7a)
+        {
+            disassemble1(ct_pc, "PLY");
+            llvm::Value *data = pop_u8();
+            register_store(data, y_);
+            set_nz(data);
+        }
+        else if (opcode == 0x7b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_jmp_indx_abs)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "JMP (", operand, ",X)");
+            llvm::Value *low_byte_at = 
+                builder_.CreateAdd(
+                    constant_u16(operand), 
+                    zext_i16(register_load(x_)));
+            llvm::Value *high_byte_at = 
+                builder_.CreateAdd(low_byte_at, constant_u16(1));
+            llvm::Value *low_byte = 
+                memory_read_untrapped(BoundedAddress(*this, low_byte_at));
+            llvm::Value *high_byte = 
+                memory_read_untrapped(BoundedAddress(*this, high_byte_at));
+            llvm::Value *new_pc = create_u16(low_byte, high_byte);
+            control_transfer_to(new_pc, opcode);
+        }
+        else if (opcode == 0x7d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ADC ", operand, ",X");
+            adc(memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0x7e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ROR ", operand, ",X");
+            memory_op(
+                &FunctionBuilder::ror, 
+                abs_index(constant_u16(operand), register_load(x_)), 
+                ct_pc);
+        }
+        else if (opcode == 0x7f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bra)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BRA ", target);
+            pending_.insert(target);
+            control_transfer_to(constant_u16(target), opcode);
+        }
+        else if (opcode == 0x81)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STA (", operand, ",X)");
+            memory_write(zp_pre_index(constant_u8(operand), register_load(x_)), 
+                         register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x82)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x83)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x84)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STY ", operand);
+            memory_write(zp(operand), register_load(y_), ct_pc);
+        }
+        else if (opcode == 0x85)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STA ", operand);
+            memory_write(zp(operand), register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x86)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STX ", operand);
+            memory_write(zp(operand), register_load(x_), ct_pc);
+        }
+        else if (opcode == 0x87)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x88)
+        {
+            disassemble1(ct_pc, "DEY");
+            register_op(&FunctionBuilder::dec, y_);
+        }
+        else if (opcode == 0x89)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "BIT #", operand);
+            // Note that unlike other BIT opcodes, this one only affects
+            // the Z flag.
+            llvm::Value *tmp = 
+                builder_.CreateAnd(register_load(a_), constant_u8(operand));
+            set_z(tmp);
+        }
+        else if (opcode == 0x8a)
+        {
+            disassemble1(ct_pc, "TXA");
+            transfer(x_, a_);
+        }
+        else if (opcode == 0x8b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x8c)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STY ", operand);
+            memory_write(abs(operand), register_load(y_), ct_pc);
+        }
+        else if (opcode == 0x8d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STA ", operand);
+            memory_write(abs(operand), register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x8e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STX ", operand);
+            memory_write(abs(operand), register_load(x_), ct_pc);
+        }
+        else if (opcode == 0x8f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bcc)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BCC ", target);
+            pending_.insert(target);
+            branch(flag_c_, false, target);
+        }
+        else if (opcode == 0x91)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STA (", operand, "),Y");
+            memory_write(zp_post_index(constant_u8(operand), register_load(y_)), 
+                         register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x92)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STA (", operand, ")");
+            memory_write(zp_post_index(constant_u8(operand), constant_u8(0)), 
+                         register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x93)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x94)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STY ", operand, ",X");
+            memory_write(zp_index(constant_u8(operand), register_load(x_)), 
+                         register_load(y_), ct_pc);
+        }
+        else if (opcode == 0x95)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STA ", operand, ",X");
+            memory_write(zp_index(constant_u8(operand), register_load(x_)), 
+                         register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x96)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STX ", operand, ",Y");
+            memory_write(zp_index(constant_u8(operand), register_load(y_)), 
+                         register_load(x_), ct_pc);
+        }
+        else if (opcode == 0x97)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x98)
+        {
+            disassemble1(ct_pc, "TYA");
+            transfer(y_, a_);
+        }
+        else if (opcode == 0x99)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STA ", operand, ",Y");
+            memory_write(abs_index(constant_u16(operand), register_load(y_)), 
+                         register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x9a)
+        {
+            disassemble1(ct_pc, "TXS");
+            // We don't use transfer() even though we do for TSX; TXS doesn't
+            // set any flags.
+            register_store(register_load(x_), s_);
+        }
+        else if (opcode == 0x9b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x9c)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STZ ", operand);
+            memory_write(abs(operand), constant_u8(0), ct_pc);
+        }
+        else if (opcode == 0x9d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STA ", operand, ",X");
+            memory_write(abs_index(constant_u16(operand), register_load(x_)), 
+                         register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x9e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STZ ", operand, ",X");
+            memory_write(abs_index(constant_u16(operand), register_load(x_)), 
+                         constant_u8(0), ct_pc);
+        }
+        else if (opcode == 0x9f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xa0)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDY #", operand);
+            ld(y_, constant_u8(operand));
+        }
+        else if (opcode == 0xa1)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDA (", operand, ",X)");
+            ld(a_, memory_read(
+                zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xa2)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDX #", operand);
+            ld(x_, constant_u8(operand));
+        }
+        else if (opcode == 0xa3)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xa4)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDY ", operand);
+            ld(y_, memory_read(zp(operand)));
+        }
+        else if (opcode == 0xa5)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDA ", operand);
+            ld(a_, memory_read(zp(operand)));
+        }
+        else if (opcode == 0xa6)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDX ", operand);
+            ld(x_, memory_read(zp(operand)));
+        }
+        else if (opcode == 0xa7)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xa8)
+        {
+            disassemble1(ct_pc, "TAY");
+            transfer(a_, y_);
+        }
+        else if (opcode == 0xa9)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDA #", operand);
+            ld(a_, constant_u8(operand));
+        }
+        else if (opcode == 0xaa)
+        {
+            disassemble1(ct_pc, "TAX");
+            transfer(a_, x_);
+        }
+        else if (opcode == 0xab)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xac)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDY ", operand);
+            ld(y_, memory_read(abs(operand)));
+        }
+        else if (opcode == 0xad)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDA ", operand);
+            ld(a_, memory_read(abs(operand)));
+        }
+        else if (opcode == 0xae)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDX ", operand);
+            ld(x_, memory_read(abs(operand)));
+        }
+        else if (opcode == 0xaf)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bcs)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BCS ", target);
+            pending_.insert(target);
+            branch(flag_c_, true, target);
+        }
+        else if (opcode == 0xb1)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDA (", operand, "),Y");
+            ld(a_, memory_read(
+                zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0xb2)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDA (", operand, ")");
+            ld(a_, memory_read(
+                zp_post_index(constant_u8(operand), constant_u8(0))));
+        }
+        else if (opcode == 0xb3)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xb4)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDY ", operand, ",X");
+            ld(y_, memory_read(
+                zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xb5)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDA ", operand, ",X");
+            ld(a_, memory_read(
+                zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xb6)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDX ", operand, ",Y");
+            ld(x_, memory_read(
+                zp_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0xb7)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xb8)
+        {
+            disassemble1(ct_pc, "CLV");
+            register_store(constant_jb(jit_bool_false), flag_v_);
+        }
+        else if (opcode == 0xb9)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDA ", operand, ",Y");
+            ld(a_, memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0xba)
+        {
+            disassemble1(ct_pc, "TSX");
+            transfer(s_, x_);
+        }
+        else if (opcode == 0xbb)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xbc)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDY ", operand, ",X");
+            ld(y_, memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0xbd)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDA ", operand, ",X");
+            ld(a_, memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0xbe)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDX ", operand, ",Y");
+            ld(x_, memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0xbf)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xc0)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CPY #", operand);
+            cmp(register_load(y_), constant_u8(operand));
+        }
+        else if (opcode == 0xc1)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CMP (", operand, ",X)");
+            cmp(register_load(a_), 
+                memory_read(
+                    zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xc2)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0xc3)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xc4)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CPY ", operand);
+            cmp(register_load(y_), memory_read(zp(operand)));
+        }
+        else if (opcode == 0xc5)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CMP ", operand);
+            cmp(register_load(a_), memory_read(zp(operand)));
+        }
+        else if (opcode == 0xc6)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "DEC ", operand);
+            memory_op(&FunctionBuilder::dec, zp(operand), ct_pc);
+        }
+        else if (opcode == 0xc7)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xc8)
+        {
+            disassemble1(ct_pc, "INY");
+            register_op(&FunctionBuilder::inc, y_);
+        }
+        else if (opcode == 0xc9)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CMP #", operand);
+            cmp(register_load(a_), constant_u8(operand));
+        }
+        else if (opcode == 0xca)
+        {
+            disassemble1(ct_pc, "DEX");
+            register_op(&FunctionBuilder::dec, x_);
+        }
+        else if (opcode == 0xcb)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xcc)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "CPY ", operand);
+            cmp(register_load(y_), memory_read(abs(operand)));
+        }
+        else if (opcode == 0xcd)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "CMP ", operand);
+            cmp(register_load(a_), memory_read(abs(operand)));
+        }
+        else if (opcode == 0xce)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "DEC ", operand);
+            memory_op(&FunctionBuilder::dec, abs(operand), ct_pc);
+        }
+        else if (opcode == 0xcf)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bne)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BNE ", target);
+            pending_.insert(target);
+            branch(flag_z_, false, target);
+        }
+        else if (opcode == 0xd1)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CMP (", operand, "),Y");
+            cmp(register_load(a_), 
+                memory_read(
+                    zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0xd2)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CMP (", operand, ")");
+            cmp(register_load(a_), 
+                memory_read(
+                    zp_post_index(constant_u8(operand), constant_u8(0))));
+        } 
+        else if (opcode == 0xd3)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xd4)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0xd5)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CMP ", operand, ",X");
+            cmp(register_load(a_), 
+                memory_read(
+                    zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xd6)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "DEC ", operand, ",X");
+            memory_op(&FunctionBuilder::dec, 
+                      zp_index(constant_u8(operand), register_load(x_)), ct_pc);
+        }
+        else if (opcode == 0xd7)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xd8)
+        {
+            disassemble1(ct_pc, "CLD");
+            register_store(constant_jb(jit_bool_false), flag_d_);
+        }
+        else if (opcode == 0xd9)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "CMP ", operand, ",Y");
+            cmp(register_load(a_), 
+                memory_read(
+                    abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0xda)
+        {
+            disassemble1(ct_pc, "PHX");
+            push_u8(register_load(x_), ct_pc);
+        }
+        else if (opcode == 0xdb)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xdc)
+        {
+            illegal_instruction(ct_pc, 3);
+        }
+        else if (opcode == 0xdd)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "CMP ", operand, ",X");
+            cmp(register_load(a_), 
+                memory_read(
+                    abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0xde)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "DEC ", operand, ",X");
+            memory_op(
+                &FunctionBuilder::dec, 
+                abs_index(constant_u16(operand), register_load(x_)), 
+                ct_pc);
+        }
+        else if (opcode == 0xdf)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xe0)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CPX #", operand);
+            cmp(register_load(x_), constant_u8(operand));
+        }
+        else if (opcode == 0xe1)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "SBC (", operand, ",X)");
+            sbc(memory_read(
+                zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xe2)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0xe3)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xe4)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CPX ", operand);
+            cmp(register_load(x_), memory_read(zp(operand)));
+        }
+        else if (opcode == 0xe5)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "SBC ", operand);
+            sbc(memory_read(zp(operand)));
+        }
+        else if (opcode == 0xe6)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "INC ", operand);
+            memory_op(&FunctionBuilder::inc, zp(operand), ct_pc);
+        }
+        else if (opcode == 0xe7)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xe8)
+        {
+            disassemble1(ct_pc, "INX");
+            register_op(&FunctionBuilder::inc, x_);
+        }
+        else if (opcode == 0xe9)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "SBC #", operand);
+            sbc(constant_u8(operand));
+        }
+        else if (opcode == 0xea)
+        {
+            disassemble1(ct_pc, "NOP");
+        }
+        else if (opcode == 0xeb)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xec)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "CPX ", operand);
+            cmp(register_load(x_), memory_read(abs(operand)));
+        }
+        else if (opcode == 0xed)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "SBC ", operand);
+            sbc(memory_read(abs(operand)));
+        }
+        else if (opcode == 0xee)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "INC ", operand);
+            memory_op(&FunctionBuilder::inc, abs(operand), ct_pc);
+        }
+        else if (opcode == 0xef)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_beq)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BEQ ", target);
+            pending_.insert(target);
+            branch(flag_z_, true, target);
+        }
+        else if (opcode == 0xf1)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "SBC (", operand, "),Y");
+            sbc(memory_read(
+                zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0xf2)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "SBC (", operand, ")");
+            sbc(memory_read(
+                zp_post_index(constant_u8(operand), constant_u8(0))));
+        }
+        else if (opcode == 0xf3)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xf4)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0xf5)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "SBC ", operand, ",X");
+            sbc(memory_read(zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xf6)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "INC ", operand, ",X");
+            memory_op(&FunctionBuilder::inc, 
+                      zp_index(constant_u8(operand), register_load(x_)), ct_pc);
+        }
+        else if (opcode == 0xf7)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xf8)
+        {
+            disassemble1(ct_pc, "SED");
+            register_store(constant_jb(jit_bool_true), flag_d_);
+        }
+        else if (opcode == 0xf9)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "SBC ", operand, ",Y");
+            sbc(memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0xfa)
+        {
+            disassemble1(ct_pc, "PLX");
+            llvm::Value *data = pop_u8();
+            register_store(data, x_);
+            set_nz(data);
+        }
+        else if (opcode == 0xfb)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xfc)
+        {
+            illegal_instruction(ct_pc, 3);
+        }
+        else if (opcode == 0xfd)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "SBC ", operand, ",X");
+            sbc(memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0xfe)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "INC ", operand, ",X");
+            memory_op(
+                &FunctionBuilder::inc, 
+                abs_index(constant_u16(operand), register_load(x_)), 
+                ct_pc);
+        }
+        else if (opcode == 0xff)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else
+        {
+            CANT_HAPPEN("Unknown opcode 0x" << std::hex << opcode);
+        }
+    }
+
+    return ct_pc;
+}
+
+// Fetch the single-byte operand that immediately follows the opcode stored
+// at opcode_at. The operand address is computed in 16 bits, so an opcode at
+// 0xffff takes its operand from 0x0000.
+uint8_t FunctionBuilder::operand8(uint16_t opcode_at)
+{
+    const uint16_t operand_at = static_cast<uint16_t>(opcode_at + 1);
+    return ct_memory_[operand_at];
+}
+
+// Fetch the two-byte operand that follows the opcode stored at opcode_at.
+// The byte at opcode_at + 1 supplies the low half and the byte at
+// opcode_at + 2 the high half; both addresses are computed in 16 bits and
+// therefore wrap at the top of memory.
+uint16_t FunctionBuilder::operand16(uint16_t opcode_at)
+{
+    const uint16_t low_at = static_cast<uint16_t>(opcode_at + 1);
+    const uint16_t high_at = static_cast<uint16_t>(opcode_at + 2);
+    const uint8_t lo = ct_memory_[low_at];
+    const uint8_t hi = ct_memory_[high_at];
+    return static_cast<uint16_t>((hi << 8) | lo);
+}
+
+// Wrap a compile-time bool as an LLVM integer constant of type i1_type_.
+llvm::Value *FunctionBuilder::constant_i1(bool c)
+{
+    llvm::Value *const as_constant = llvm::ConstantInt::get(i1_type_, c);
+    return as_constant;
+}
+
+// Wrap a compile-time byte as an LLVM integer constant of type i8_type_.
+llvm::Value *FunctionBuilder::constant_u8(uint8_t c)
+{
+    llvm::Value *const as_constant = llvm::ConstantInt::get(i8_type_, c);
+    return as_constant;
+}
+
+// Wrap a compile-time 16-bit value as an LLVM integer constant of type
+// i16_type_.
+llvm::Value *FunctionBuilder::constant_u16(uint16_t c)
+{
+    llvm::Value *const as_constant = llvm::ConstantInt::get(i16_type_, c);
+    return as_constant;
+}
+
+// Wrap a compile-time 32-bit value as an LLVM integer constant of type
+// i32_type_.
+llvm::Value *FunctionBuilder::constant_u32(uint32_t c)
+{
+    llvm::Value *const as_constant = llvm::ConstantInt::get(i32_type_, c);
+    return as_constant;
+}
+
+// Wrap a compile-time 64-bit value as an LLVM integer constant of type
+// i64_type_.
+llvm::Value *FunctionBuilder::constant_u64(uint64_t c)
+{
+    llvm::Value *const as_constant = llvm::ConstantInt::get(i64_type_, c);
+    return as_constant;
+}
+
+// Wrap a compile-time int as an LLVM integer constant of type
+// native_int_type_.
+llvm::Value *FunctionBuilder::constant_i(int c)
+{
+    llvm::Value *const as_constant =
+        llvm::ConstantInt::get(native_int_type_, c);
+    return as_constant;
+}
+
+// Wrap a compile-time JitBool as an LLVM integer constant of type
+// jit_bool_type_.
+llvm::Value *FunctionBuilder::constant_jb(JitBool c)
+{
+    llvm::Value *const as_constant = llvm::ConstantInt::get(jit_bool_type_, c);
+    return as_constant;
+}
+
+// Widen an i1 value to jit_bool_type_ by zero-extension. The argument must
+// already have type i1_type_ (checked by assertion).
+llvm::Value *FunctionBuilder::convert_i1_to_jb(llvm::Value *v)
+{
+    assert(v->getType() == i1_type_);
+    llvm::Value *const widened = builder_.CreateZExt(v, jit_bool_type_);
+    return widened;
+}
+
+// Treat an i8 value as a JitBool. No instructions are emitted: the value is
+// handed back untouched, so any non-zero byte counts as true. The argument
+// must have type i8_type_ (checked by assertion).
+llvm::Value *FunctionBuilder::convert_i8_to_jb(llvm::Value *v)
+{
+    llvm::Value *const unchanged = v;
+    assert(unchanged->getType() == i8_type_);
+    return unchanged;
+}
+
+// Turn an i16 value into a JitBool: the value is compared against zero and
+// the resulting i1 is widened via convert_i1_to_jb(). The argument must have
+// type i16_type_ (checked by assertion).
+llvm::Value *FunctionBuilder::convert_i16_to_jb(llvm::Value *v)
+{
+    assert(v->getType() == i16_type_);
+    llvm::Value *const zero = constant_u16(0);
+    llvm::Value *const is_nonzero = builder_.CreateICmpNE(v, zero);
+    return convert_i1_to_jb(is_nonzero);
+}
+
+// JitBool values should be tested via jit_bool_is_*() and not directly;
+// this is because they use a 0=false, non-0=true representation. It's not
+// correct to assume they are either 0 or 1.
+
+// Emit a comparison producing an i1 which is set when the JitBool v is
+// non-zero. The argument must have type jit_bool_type_ (checked by
+// assertion).
+llvm::Value *FunctionBuilder::jit_bool_is_true(llvm::Value *v)
+{
+    assert(v->getType() == jit_bool_type_);
+    llvm::Value *const zero = constant_u8(0);
+    llvm::Value *const is_nonzero = builder_.CreateICmpNE(v, zero);
+    return is_nonzero;
+}
+
+// Emit a comparison producing an i1 which is set when the JitBool v is
+// exactly zero. The argument must have type jit_bool_type_ (checked by
+// assertion).
+llvm::Value *FunctionBuilder::jit_bool_is_false(llvm::Value *v)
+{
+    assert(v->getType() == jit_bool_type_);
+    llvm::Value *const zero = constant_u8(0);
+    llvm::Value *const is_zero = builder_.CreateICmpEQ(v, zero);
+    return is_zero;
+}
+
+// Zero-extend an i1 value to i8_type_, giving a byte that is 0 or 1. The
+// argument must have type i1_type_ (checked by assertion).
+llvm::Value *FunctionBuilder::convert_i1_to_i8(llvm::Value *v)
+{
+    assert(v->getType() == i1_type_);
+    llvm::Value *const as_byte = builder_.CreateZExt(v, i8_type_);
+    return as_byte;
+}
+
+// Emit a zero-extension of v to i16_type_.
+llvm::Value *FunctionBuilder::zext_i16(llvm::Value *v)
+{
+    llvm::Value *const extended = builder_.CreateZExt(v, i16_type_);
+    return extended;
+}
+
+// Emit a zero-extension of v to i32_type_.
+llvm::Value *FunctionBuilder::zext_i32(llvm::Value *v)
+{
+    llvm::Value *const extended = builder_.CreateZExt(v, i32_type_);
+    return extended;
+}
+
+// Emit a sign-extension of v to i16_type_.
+llvm::Value *FunctionBuilder::sext_i16(llvm::Value *v)
+{
+    llvm::Value *const extended = builder_.CreateSExt(v, i16_type_);
+    return extended;
+}
+
+// Emit a truncation of v to i8_type_, keeping only its low 8 bits.
+llvm::Value *FunctionBuilder::trunc_i8(llvm::Value *v)
+{
+    llvm::Value *const low_bits = builder_.CreateTrunc(v, i8_type_);
+    return low_bits;
+}
+
+// Combine two values into one 16-bit value: both are zero-extended to i16,
+// high_byte is shifted left by 8 and the two halves are ORed together.
+llvm::Value *FunctionBuilder::create_u16(
+    llvm::Value *low_byte, llvm::Value *high_byte)
+{
+    llvm::Value *const low_u16 = zext_i16(low_byte);
+    llvm::Value *const high_u16 = zext_i16(high_byte);
+    llvm::Value *const high_shifted = builder_.CreateShl(high_u16, 8);
+    return builder_.CreateOr(low_u16, high_shifted);
+}
+
+// Emit a load from the storage location held in r.v_ and return the loaded
+// value.
+llvm::Value *FunctionBuilder::register_load(const Register &r)
+{
+    llvm::Value *const loaded = builder_.CreateLoad(r.v_);
+    return loaded;
+}
+
+// Emit a store of v to the storage location held in r.v_, and record on the
+// Register object that it has been written by setting r.modified_.
+void FunctionBuilder::register_store(llvm::Value *v, Register &r)
+{
+    llvm::Value *const slot = r.v_;
+    builder_.CreateStore(v, slot);
+    r.modified_ = true;
+}
+
+// Apply a read-modify-write operation to a register: the current value of r
+// is loaded, passed through the member function 'op', and the value 'op'
+// returns is stored back into r.
+void FunctionBuilder::register_op(OpFn op, Register &r)
+{
+    llvm::Value *const before = register_load(r);
+    llvm::Value *const after = (this->*op)(before);
+    register_store(after, r);
+}
+
+// Apply a read-modify-write operation to memory: the value at 'ba' is read,
+// passed through the member function 'op', and the value 'op' returns is
+// written back to 'ba'. next_opcode_at is forwarded unchanged to
+// memory_write().
+void FunctionBuilder::memory_op(
+    OpFn op, const BoundedAddress &ba, uint16_t next_opcode_at)
+{
+    llvm::Value *const before = memory_read(ba);
+    llvm::Value *const after = (this->*op)(before);
+    memory_write(ba, after, next_opcode_at);
+}
+
+// Emit IR for ADC with operand 'data'.
+//
+// Code for both variants is generated: the emitted code tests flag_d_ and
+// runs the adc_binary() sequence when it is clear or the adc_decimal()
+// sequence otherwise, and both paths then continue in a common block, which
+// is left as the builder's insert point.
+void FunctionBuilder::adc(llvm::Value *data)
+{
+    // The join block is created detached and only appended to the function
+    // once the conditional branch into the two arms has been emitted.
+    llvm::BasicBlock *const join =
+        llvm::BasicBlock::Create(context_, "done_adc");
+    llvm::BasicBlock *const binary_arm =
+        llvm::BasicBlock::Create(context_, "adc_binary", llvm_function_);
+    llvm::BasicBlock *const decimal_arm =
+        llvm::BasicBlock::Create(context_, "adc_decimal", llvm_function_);
+
+    llvm::Value *const d_flag = register_load(flag_d_);
+    llvm::Value *const binary_mode = jit_bool_is_false(d_flag);
+    builder_.CreateCondBr(binary_mode, binary_arm, decimal_arm);
+    llvm_function_->getBasicBlockList().push_back(join);
+
+    builder_.SetInsertPoint(binary_arm);
+    adc_binary(data);
+    builder_.CreateBr(join);
+
+    builder_.SetInsertPoint(decimal_arm);
+    adc_decimal(data);
+    builder_.CreateBr(join);
+
+    builder_.SetInsertPoint(join);
+}
+
+// Emit IR for binary-mode ADC with operand 'data'.
+//
+// A receives the low 8 bits of A + data + carry and set_nz() is applied to
+// it. C receives bit 8 of the sum formed from zero-extended operands. V is
+// set when the sign subsequently read back from flag_n_ disagrees with the
+// sign of the sum formed from sign-extended operands.
+void FunctionBuilder::adc_binary(llvm::Value *data)
+{
+    llvm::Value *const carry_in =
+        zext_i16(jit_bool_is_true(register_load(flag_c_)));
+
+    // Sum with zero-extended operands; bit 8 is the carry out.
+    llvm::Value *const a_unsigned = zext_i16(register_load(a_));
+    llvm::Value *const data_unsigned = zext_i16(data);
+    llvm::Value *const unsigned_partial =
+        builder_.CreateAdd(a_unsigned, data_unsigned);
+    llvm::Value *const unsigned_sum =
+        builder_.CreateAdd(unsigned_partial, carry_in);
+
+    // Sum with sign-extended operands; its sign is the true signed result.
+    llvm::Value *const a_signed =
+        builder_.CreateSExt(register_load(a_), i16_type_);
+    llvm::Value *const data_signed = builder_.CreateSExt(data, i16_type_);
+    llvm::Value *const signed_partial =
+        builder_.CreateAdd(a_signed, data_signed);
+    llvm::Value *const signed_sum =
+        builder_.CreateAdd(signed_partial, carry_in);
+
+    llvm::Value *const result = trunc_i8(unsigned_sum);
+    register_store(result, a_);
+    set_nz(result);
+
+    llvm::Value *const carry_out_bit =
+        builder_.CreateAnd(unsigned_sum, constant_u16(0x100));
+    register_store(convert_i16_to_jb(carry_out_bit), flag_c_);
+
+    // flag_n_ is read back after set_nz() has been applied to the result.
+    llvm::Value *const sign_of_result =
+        jit_bool_is_true(register_load(flag_n_));
+    llvm::Value *const sign_of_signed_sum =
+        builder_.CreateICmpSLT(signed_sum, constant_u16(0));
+    llvm::Value *const overflow =
+        builder_.CreateXor(sign_of_result, sign_of_signed_sum);
+    register_store(convert_i1_to_jb(overflow), flag_v_);
+}
+
+void FunctionBuilder::adc_decimal(llvm::Value *data)
+{
+    // Emit IR for decimal-mode ADC of 'data' into A, updating A, V and C.
+    // This algorithm taken from http://www.6502.org/tutorials/decimal_mode.html
+    llvm::Value *carry = jit_bool_is_true(register_load(flag_c_));
+    // l_tmp_ = (A & 0x0f) + (data & 0x0f) + carry, i.e. the low-nibble sum.
+    builder_.CreateStore(
+        builder_.CreateAdd(
+            builder_.CreateAdd(
+                builder_.CreateAnd(
+                    register_load(a_),
+                    constant_u8(0x0f)),
+                builder_.CreateAnd(
+                    data,
+                    constant_u8(0x0f))),
+            convert_i1_to_i8(carry)),
+        l_tmp_);
+    // A low-nibble sum >= 0x0a is routed through adjust_l before l_done.
+    llvm::BasicBlock *adjust_l_block = 
+        llvm::BasicBlock::Create(context_, "adjust_l", llvm_function_);
+    llvm::BasicBlock *l_done_block = 
+        llvm::BasicBlock::Create(context_, "l_done", llvm_function_);
+    builder_.CreateCondBr(
+        builder_.CreateICmpUGE(
+            builder_.CreateLoad(l_tmp_), 
+            constant_u8(0x0a)),
+        adjust_l_block, l_done_block);
+    // adjust_l: l_tmp_ = ((l_tmp_ + 0x06) & 0x0f) + 0x10.
+    builder_.SetInsertPoint(adjust_l_block);
+    builder_.CreateStore(
+        builder_.CreateAdd(
+            builder_.CreateAnd(
+                builder_.CreateAdd(
+                    builder_.CreateLoad(l_tmp_),
+                    constant_u8(0x06)),
+                constant_u8(0x0f)),
+            constant_u8(0x10)),
+        l_tmp_);
+    builder_.CreateBr(l_done_block);
+
+    builder_.SetInsertPoint(l_done_block);
+    // s_tmp_ = (A & 0xf0) + (data & 0xf0) + l_tmp_, all zero-extended to i16.
+    llvm::Value *a_and_0xf0 =
+        builder_.CreateAnd(
+            register_load(a_),
+            constant_u8(0xf0));
+    llvm::Value *data_and_0xf0 =
+        builder_.CreateAnd(
+            data,
+            constant_u8(0xf0));
+
+    builder_.CreateStore(
+        builder_.CreateAdd(
+            builder_.CreateAdd(
+                zext_i16(a_and_0xf0),
+                zext_i16(data_and_0xf0)),
+            zext_i16(builder_.CreateLoad(l_tmp_))),
+        s_tmp_);    
+    // If s_tmp_ >= 0xa0, adjust_s adds 0x60 to it before s_done.
+    llvm::BasicBlock *adjust_s_block = 
+        llvm::BasicBlock::Create(context_, "adjust_s", llvm_function_);
+    llvm::BasicBlock *s_done_block = 
+        llvm::BasicBlock::Create(context_, "s_done", llvm_function_);
+    builder_.CreateCondBr(
+        builder_.CreateICmpUGE(
+            builder_.CreateLoad(s_tmp_), 
+            constant_u16(0xa0)),
+        adjust_s_block, s_done_block);
+
+    builder_.SetInsertPoint(adjust_s_block);
+    builder_.CreateStore(
+        builder_.CreateAdd(
+            builder_.CreateLoad(s_tmp_),
+            constant_u16(0x60)),
+        s_tmp_);
+    builder_.CreateBr(s_done_block);
+    // t_tmp_ repeats the sum with sign-extended high nibbles; it decides V.
+    builder_.SetInsertPoint(s_done_block);
+    builder_.CreateStore(
+        builder_.CreateAdd(
+            builder_.CreateAdd(
+                sext_i16(a_and_0xf0),
+                sext_i16(data_and_0xf0)),
+            zext_i16(builder_.CreateLoad(l_tmp_))),
+        t_tmp_);
+    // V is stored as true, then cleared unless t_tmp_ < -128 or t_tmp_ > 127.
+    llvm::BasicBlock *v_not_done_block = 
+        llvm::BasicBlock::Create(context_, "v_not_done", llvm_function_);
+    llvm::BasicBlock *v_false_block = 
+        llvm::BasicBlock::Create(context_, "v_false", llvm_function_);
+    llvm::BasicBlock *v_done_block = 
+        llvm::BasicBlock::Create(context_, "v_done", llvm_function_);
+    register_store(constant_jb(jit_bool_true), flag_v_);
+    builder_.CreateCondBr(
+        builder_.CreateICmpSLT(
+            builder_.CreateLoad(t_tmp_), 
+            constant_u16(-128)),
+        v_done_block, v_not_done_block);
+    builder_.SetInsertPoint(v_not_done_block);
+    builder_.CreateCondBr(
+        builder_.CreateICmpSGT(
+            builder_.CreateLoad(t_tmp_), 
+            constant_u16(127)),
+        v_done_block, v_false_block);
+    builder_.SetInsertPoint(v_false_block);
+    register_store(constant_jb(jit_bool_false), flag_v_);
+    builder_.CreateBr(v_done_block);
+    builder_.SetInsertPoint(v_done_block);
+    // A = low 8 bits of s_tmp_; set_nz() on the new A; C = (s_tmp_ >= 0x100).
+    register_store(trunc_i8(builder_.CreateLoad(s_tmp_)), a_);
+    set_nz(register_load(a_));
+    register_store(
+        convert_i1_to_jb(
+            builder_.CreateICmpUGE(
+                builder_.CreateLoad(s_tmp_),
+                constant_u16(0x100))),
+        flag_c_);
+}
+
+// Emit IR for AND: A becomes A & data, and set_nz() is applied to the new
+// value.
+void FunctionBuilder::And(llvm::Value *data)
+{
+    llvm::Value *const accumulator = register_load(a_);
+    llvm::Value *const masked = builder_.CreateAnd(accumulator, data);
+    register_store(masked, a_);
+    set_nz(masked);
+}
+
+// Emit IR for ASL on 'data' and return the shifted value. The carry flag
+// receives bit 7 of the input (stored as a JitBool, so non-zero means set),
+// the input is shifted left by one, and set_nz() is applied to the result.
+llvm::Value *FunctionBuilder::asl(llvm::Value *data)
+{
+    llvm::Value *const top_bit = builder_.CreateAnd(data, constant_u8(0x80));
+    register_store(convert_i8_to_jb(top_bit), flag_c_);
+
+    llvm::Value *const shifted = builder_.CreateShl(data, 1);
+    set_nz(shifted);
+    return shifted;
+}
+
+// Emit IR for BIT with operand 'data'. N receives bit 7 of data and V
+// receives bit 6 of data (both stored as JitBools, so non-zero means set);
+// set_z() is then applied to A & data. A itself is not written.
+void FunctionBuilder::bit(llvm::Value *data)
+{
+    llvm::Value *const bit7 = builder_.CreateAnd(data, constant_u8(0x80));
+    register_store(convert_i8_to_jb(bit7), flag_n_);
+
+    llvm::Value *const bit6 = builder_.CreateAnd(data, constant_u8(0x40));
+    register_store(convert_i8_to_jb(bit6), flag_v_);
+
+    llvm::Value *const accumulator = register_load(a_);
+    llvm::Value *const masked = builder_.CreateAnd(accumulator, data);
+    set_z(masked);
+}
+
+// Emit a conditional branch on 'flag'.
+//
+// Control transfers to the block for 'target' when the flag's truth value
+// equals branch_if; otherwise it falls into a freshly created
+// "branch_not_taken" block, which is left as the builder's insert point.
+void FunctionBuilder::branch(Register &flag, bool branch_if, uint16_t target)
+{
+    llvm::BasicBlock *const fall_through = 
+        llvm::BasicBlock::Create(context_, "branch_not_taken", llvm_function_);
+    ensure_address_block_created(target);
+
+    llvm::Value *const flag_is_set = jit_bool_is_true(register_load(flag));
+    llvm::BasicBlock *const taken = address_block_[target];
+
+    // Pick which successor is used when the flag is set and which when it
+    // is clear, according to the sense of the branch.
+    llvm::BasicBlock *const when_set = branch_if ? taken : fall_through;
+    llvm::BasicBlock *const when_clear = branch_if ? fall_through : taken;
+    builder_.CreateCondBr(flag_is_set, when_set, when_clear);
+
+    builder_.SetInsertPoint(fall_through);
+}
+
+// Emit IR for a compare of register value 'r' against 'data': set_nz() is
+// applied to r - data, and C is set when r >= data as unsigned values.
+// Neither operand is modified.
+void FunctionBuilder::cmp(llvm::Value *r, llvm::Value *data)
+{
+    llvm::Value *const difference = builder_.CreateSub(r, data);
+    set_nz(difference);
+
+    llvm::Value *const no_borrow = builder_.CreateICmpUGE(r, data);
+    register_store(convert_i1_to_jb(no_borrow), flag_c_);
+}
+
+// Emit IR subtracting one from 'data', apply set_nz() to the result and
+// return it.
+llvm::Value *FunctionBuilder::dec(llvm::Value *data)
+{
+    llvm::Value *const one = constant_u8(1);
+    llvm::Value *const decremented = builder_.CreateSub(data, one);
+    set_nz(decremented);
+    return decremented;
+}
+
+// Emit IR for EOR: A becomes A ^ data, and set_nz() is applied to the new
+// value.
+void FunctionBuilder::eor(llvm::Value *data)
+{
+    llvm::Value *const accumulator = register_load(a_);
+    llvm::Value *const toggled = builder_.CreateXor(accumulator, data);
+    register_store(toggled, a_);
+    set_nz(toggled);
+}
+
+// Emit IR adding one to 'data', apply set_nz() to the result and return it.
+llvm::Value *FunctionBuilder::inc(llvm::Value *data)
+{
+    llvm::Value *const one = constant_u8(1);
+    llvm::Value *const incremented = builder_.CreateAdd(data, one);
+    set_nz(incremented);
+    return incremented;
+}
+
+// Emit IR for a register load instruction: 'data' is stored into register r
+// and set_nz() is applied to it.
+void FunctionBuilder::ld(Register &r, llvm::Value *data)
+{
+    llvm::Value *const loaded = data;
+    register_store(loaded, r);
+    set_nz(loaded);
+}
+
+// Emit IR for LSR on 'data' and return the shifted value. The carry flag
+// receives bit 0 of the input, the input is shifted right by one with zero
+// fill, and set_nz() is applied to the result.
+llvm::Value *FunctionBuilder::lsr(llvm::Value *data)
+{
+    llvm::Value *const bottom_bit = builder_.CreateAnd(data, constant_u8(0x1));
+    register_store(convert_i8_to_jb(bottom_bit), flag_c_);
+
+    llvm::Value *const shifted = builder_.CreateLShr(data, 1);
+    set_nz(shifted);
+    return shifted;
+}
+
+// Emit IR for ORA: A becomes A | data, and set_nz() is applied to the new
+// value.
+void FunctionBuilder::ora(llvm::Value *data)
+{
+    llvm::Value *const accumulator = register_load(a_);
+    llvm::Value *const merged = builder_.CreateOr(accumulator, data);
+    register_store(merged, a_);
+    set_nz(merged);
+}
+
+// Emit IR which pops one byte from the stack and unpacks it into the
+// individual flag registers. Each flag receives the popped byte masked with
+// its own bit (flagN, flagV, flagD, flagI, flagZ, flagC in that order); the
+// masked byte is used directly as a JitBool, so non-zero means set.
+void FunctionBuilder::pop_flags()
+{
+    llvm::Value *const status = pop_u8();
+
+    llvm::Value *const n_bit = builder_.CreateAnd(status, constant_u8(flagN));
+    register_store(convert_i8_to_jb(n_bit), flag_n_);
+
+    llvm::Value *const v_bit = builder_.CreateAnd(status, constant_u8(flagV));
+    register_store(convert_i8_to_jb(v_bit), flag_v_);
+
+    llvm::Value *const d_bit = builder_.CreateAnd(status, constant_u8(flagD));
+    register_store(convert_i8_to_jb(d_bit), flag_d_);
+
+    llvm::Value *const i_bit = builder_.CreateAnd(status, constant_u8(flagI));
+    register_store(convert_i8_to_jb(i_bit), flag_i_);
+
+    llvm::Value *const z_bit = builder_.CreateAnd(status, constant_u8(flagZ));
+    register_store(convert_i8_to_jb(z_bit), flag_z_);
+
+    llvm::Value *const c_bit = builder_.CreateAnd(status, constant_u8(flagC));
+    register_store(convert_i8_to_jb(c_bit), flag_c_);
+}
+
+// Emit IR which pops one byte from the stack: S is incremented first, and
+// the byte at 'stack' indexed by the new S is then read with
+// memory_read_untrapped() and returned.
+llvm::Value *FunctionBuilder::pop_u8()
+{
+    llvm::Value *const s_before = register_load(s_);
+    llvm::Value *const s_after = builder_.CreateAdd(s_before, constant_u8(1));
+    register_store(s_after, s_);
+
+    const BoundedAddress &slot = abs_index(constant_u16(stack), s_after);
+    return memory_read_untrapped(slot);
+}
+
+
+// Emit IR which pops a 16-bit value from the stack. The first byte popped
+// is the low half and the second is the high half; the two are combined
+// with create_u16().
+llvm::Value *FunctionBuilder::pop_u16()
+{
+    llvm::Value *const first_popped = pop_u8();
+    llvm::Value *const second_popped = pop_u8();
+    llvm::Value *const combined = create_u16(first_popped, second_popped);
+    return combined;
+}
+
+// Emit IR which pushes 'data' onto the stack using memory_write_raw(): the
+// byte is written at 'stack' indexed by the current S, and S is then
+// decremented by one.
+void FunctionBuilder::push_u8_raw(llvm::Value *data)
+{
+    llvm::Value *const s_for_write = register_load(s_);
+    const BoundedAddress &slot = abs_index(constant_u16(stack), s_for_write);
+    memory_write_raw(slot, data);
+
+    llvm::Value *const s_before = register_load(s_);
+    llvm::Value *const s_after = builder_.CreateSub(s_before, constant_u8(1));
+    register_store(s_after, s_);
+}
+
+// Emit IR which pushes the compile-time constant 'u' onto the stack via
+// push_u8_raw(), high byte first and low byte second.
+void FunctionBuilder::push_u16_raw(uint16_t u)
+{
+    const uint8_t upper = static_cast<uint8_t>(u >> 8);
+    const uint8_t lower = static_cast<uint8_t>(u & 0xff);
+
+    push_u8_raw(constant_u8(upper));
+    push_u8_raw(constant_u8(lower));
+}
+
+// Emit IR which pushes 'data' onto the stack: the byte goes to 'stack'
+// indexed by the current S, and S is decremented by one.
+//
+// Because the push may invalidate code living on the stack, the write may
+// generate instructions which return control to the caller to deal with
+// that. Within the opcode being translated, therefore, no further
+// code-generating functions should be called after this one; this is also
+// why the store of the new S is emitted ahead of the memory write.
+void FunctionBuilder::push_u8(llvm::Value *data, uint16_t next_opcode_at)
+{
+    llvm::Value *const s_before = register_load(s_);
+    const BoundedAddress &slot = abs_index(constant_u16(stack), s_before);
+
+    llvm::Value *const s_after = builder_.CreateSub(s_before, constant_u8(1));
+    register_store(s_after, s_);
+
+    memory_write_untrapped(slot, data, next_opcode_at);
+}
+
+// Emit IR for ROL on 'data' and return the rotated value. The old carry is
+// captured as a 0/1 byte before C is overwritten with bit 7 of the input;
+// the result is the input shifted left by one with the old carry in bit 0,
+// and set_nz() is applied to it.
+llvm::Value *FunctionBuilder::rol(llvm::Value *data)
+{
+    llvm::Value *const carry_in = 
+        convert_i1_to_i8(jit_bool_is_true(register_load(flag_c_)));
+
+    llvm::Value *const top_bit = builder_.CreateAnd(data, constant_u8(0x80));
+    register_store(convert_i8_to_jb(top_bit), flag_c_);
+
+    llvm::Value *const shifted = builder_.CreateShl(data, 1);
+    llvm::Value *const rotated = builder_.CreateOr(shifted, carry_in);
+    set_nz(rotated);
+    return rotated;
+}
+
+// Emit IR for ROR on 'data' and return the rotated value. The old carry is
+// captured and moved up to bit 7 before C is overwritten with bit 0 of the
+// input; the result is the input shifted right by one with the old carry in
+// bit 7, and set_nz() is applied to it.
+llvm::Value *FunctionBuilder::ror(llvm::Value *data)
+{
+    llvm::Value *const carry_in = 
+        convert_i1_to_i8(jit_bool_is_true(register_load(flag_c_)));
+    llvm::Value *const carry_in_bit7 = builder_.CreateShl(carry_in, 7);
+
+    llvm::Value *const bottom_bit = builder_.CreateAnd(data, constant_u8(0x1));
+    register_store(convert_i8_to_jb(bottom_bit), flag_c_);
+
+    llvm::Value *const shifted = builder_.CreateLShr(data, 1);
+    llvm::Value *const rotated = builder_.CreateOr(shifted, carry_in_bit7);
+    set_nz(rotated);
+    return rotated;
+}
+
+// Emit IR for SBC with operand 'data'.
+//
+// Code for both variants is generated: the emitted code tests flag_d_ and
+// runs the sbc_binary() sequence when it is clear or the sbc_decimal()
+// sequence otherwise, and both paths then continue in a common block, which
+// is left as the builder's insert point.
+void FunctionBuilder::sbc(llvm::Value *data)
+{
+    // The join block is created detached and only appended to the function
+    // once the conditional branch into the two arms has been emitted.
+    llvm::BasicBlock *const join =
+        llvm::BasicBlock::Create(context_, "done_sbc");
+    llvm::BasicBlock *const binary_arm =
+        llvm::BasicBlock::Create(context_, "sbc_binary", llvm_function_);
+    llvm::BasicBlock *const decimal_arm =
+        llvm::BasicBlock::Create(context_, "sbc_decimal", llvm_function_);
+
+    llvm::Value *const d_flag = register_load(flag_d_);
+    llvm::Value *const binary_mode = jit_bool_is_false(d_flag);
+    builder_.CreateCondBr(binary_mode, binary_arm, decimal_arm);
+    llvm_function_->getBasicBlockList().push_back(join);
+
+    builder_.SetInsertPoint(binary_arm);
+    sbc_binary(data);
+    builder_.CreateBr(join);
+
+    builder_.SetInsertPoint(decimal_arm);
+    sbc_decimal(data);
+    builder_.CreateBr(join);
+
+    builder_.SetInsertPoint(join);
+}
+
+// Emit IR for binary-mode SBC with operand 'data'.
+//
+// The borrow is 1 when C is clear and 0 otherwise. A receives the low 8
+// bits of A - data - borrow and set_nz() is applied to it; C is set when
+// bit 8 of the 16-bit result is clear. The overflow handling is delegated
+// to sbc_overflow().
+void FunctionBuilder::sbc_binary(llvm::Value *data)
+{
+    llvm::Value *const borrow_in = 
+        zext_i16(jit_bool_is_false(register_load(flag_c_)));
+
+    // This has to happen while A still holds its original value.
+    sbc_overflow(data, borrow_in);
+
+    llvm::Value *const a_wide = zext_i16(register_load(a_));
+    llvm::Value *const data_wide = zext_i16(data);
+    llvm::Value *const partial = builder_.CreateSub(a_wide, data_wide);
+    llvm::Value *const difference = builder_.CreateSub(partial, borrow_in);
+
+    llvm::Value *const result = trunc_i8(difference);
+    register_store(result, a_);
+    set_nz(result);
+
+    // Bit 8 of the wide result is set exactly when a borrow occurred.
+    llvm::Value *const borrow_bit =
+        builder_.CreateAnd(difference, constant_u16(0x100));
+    llvm::Value *const no_borrow =
+        builder_.CreateICmpEQ(borrow_bit, constant_u16(0));
+    register_store(convert_i1_to_jb(no_borrow), flag_c_);
+}
+
+void FunctionBuilder::sbc_decimal(llvm::Value *data)
+{
+    // Decimal-mode (BCD) SBC. The subtraction is first done in binary, both
+    // on the low nibbles alone (kept in l_tmp_) and on the whole bytes
+    // widened to 16 bits (kept in s_tmp_); the 16-bit value is then
+    // corrected by 0x60 and/or 0x06 depending on the signs of those two
+    // intermediate results before being truncated back into a.
+    //
+    // C and V are taken from the uncorrected binary subtraction; N and Z
+    // are taken from the final, corrected accumulator value.
+    llvm::Value *borrow = jit_bool_is_false(register_load(flag_c_));
+    llvm::Value *borrow_16 = zext_i16(borrow);
+
+    // sbc_overflow() reads a, so it must run before a is overwritten.
+    sbc_overflow(data, borrow_16);
+
+    // l_tmp_ = (a & 0x0f) - (data & 0x0f) - borrow in 8 bits; its sign later
+    // decides whether the 0x06 correction is applied.
+    llvm::Value *a_low =
+        builder_.CreateAnd(register_load(a_), constant_u8(0x0f));
+    llvm::Value *data_low = builder_.CreateAnd(data, constant_u8(0x0f));
+    llvm::Value *low_diff = builder_.CreateSub(a_low, data_low);
+    llvm::Value *borrow_8 = convert_i1_to_i8(borrow);
+    builder_.CreateStore(builder_.CreateSub(low_diff, borrow_8), l_tmp_);
+
+    // s_tmp_ = a - data - borrow in 16 bits; its sign later decides whether
+    // the 0x60 correction is applied.
+    llvm::Value *a_16 = zext_i16(register_load(a_));
+    llvm::Value *data_16 = zext_i16(data);
+    llvm::Value *full_diff = builder_.CreateSub(a_16, data_16);
+    builder_.CreateStore(builder_.CreateSub(full_diff, borrow_16), s_tmp_);
+
+    // Carry comes from the uncorrected binary result: it is set exactly
+    // when bit 8 of s_tmp_ is clear.
+    llvm::Value *bit8 =
+        builder_.CreateAnd(builder_.CreateLoad(s_tmp_), constant_u16(0x100));
+    llvm::Value *no_borrow = builder_.CreateICmpEQ(bit8, constant_u16(0));
+    register_store(convert_i1_to_jb(no_borrow), flag_c_);
+
+    // First correction: applied when the 16-bit result is negative.
+    llvm::BasicBlock *s_adjust1_block =
+        llvm::BasicBlock::Create(context_, "s_adjust1", llvm_function_);
+    llvm::BasicBlock *done_s_adjust1_block =
+        llvm::BasicBlock::Create(context_, "done_s_adjust1", llvm_function_);
+    llvm::Value *s_negative =
+        builder_.CreateICmpSLT(builder_.CreateLoad(s_tmp_), constant_u16(0));
+    builder_.CreateCondBr(s_negative, s_adjust1_block, done_s_adjust1_block);
+
+    builder_.SetInsertPoint(s_adjust1_block);
+    // s_tmp_ -= 0x60, which leaves the low nibble untouched.
+    llvm::Value *s_minus_60 =
+        builder_.CreateSub(builder_.CreateLoad(s_tmp_), constant_u16(0x60));
+    builder_.CreateStore(s_minus_60, s_tmp_);
+    builder_.CreateBr(done_s_adjust1_block);
+
+    builder_.SetInsertPoint(done_s_adjust1_block);
+
+    // Second correction: applied when the low-nibble result is negative.
+    llvm::BasicBlock *s_adjust2_block =
+        llvm::BasicBlock::Create(context_, "s_adjust2", llvm_function_);
+    llvm::BasicBlock *done_s_adjust2_block =
+        llvm::BasicBlock::Create(context_, "done_s_adjust2", llvm_function_);
+    llvm::Value *l_negative =
+        builder_.CreateICmpSLT(builder_.CreateLoad(l_tmp_), constant_u8(0));
+    builder_.CreateCondBr(l_negative, s_adjust2_block, done_s_adjust2_block);
+
+    builder_.SetInsertPoint(s_adjust2_block);
+    // s_tmp_ -= 0x06.
+    llvm::Value *s_minus_06 =
+        builder_.CreateSub(builder_.CreateLoad(s_tmp_), constant_u16(0x06));
+    builder_.CreateStore(s_minus_06, s_tmp_);
+    builder_.CreateBr(done_s_adjust2_block);
+
+    builder_.SetInsertPoint(done_s_adjust2_block);
+
+    // The low byte of the corrected value becomes the new accumulator, and
+    // N and Z are then set from it.
+    register_store(trunc_i8(builder_.CreateLoad(s_tmp_)), a_);
+    set_nz(register_load(a_));
+}
+
+void FunctionBuilder::sbc_overflow(
+    llvm::Value *data, llvm::Value *borrow_16)
+{
+    // Sets V for a - data - borrow. The subtraction is repeated on
+    // sign-extended 16-bit operands so the true signed result is available;
+    // V is set when bit 7 of that result (the sign an 8-bit register would
+    // show) disagrees with the sign of the full 16-bit value.
+    llvm::Value *minuend = sext_i16(register_load(a_));
+    llvm::Value *subtrahend = sext_i16(data);
+    llvm::Value *difference = builder_.CreateSub(minuend, subtrahend);
+    llvm::Value *result_s16 = builder_.CreateSub(difference, borrow_16);
+
+    llvm::Value *bit7 = builder_.CreateAnd(result_s16, constant_u16(0x80));
+    llvm::Value *sign_8 = builder_.CreateICmpNE(bit7, constant_u16(0));
+    llvm::Value *sign_16 =
+        builder_.CreateICmpSLT(result_s16, constant_u16(0));
+
+    llvm::Value *overflowed = builder_.CreateXor(sign_8, sign_16);
+    register_store(convert_i1_to_jb(overflowed), flag_v_);
+}
+
+void FunctionBuilder::transfer(const Register &from, Register &to)
+{
+    // Copy one register into another, setting N and Z from the value moved.
+    llvm::Value *value = builder_.CreateLoad(from.v_);
+    register_store(value, to);
+    set_nz(value);
+}
+
+llvm::Value *FunctionBuilder::trb(llvm::Value *data)
+{
+    // Test-and-reset-bits helper: sets Z according to (data & a) and returns
+    // data with every bit that is set in a cleared. Nothing is written to
+    // memory here; the result is simply returned to the caller.
+    llvm::Value *tested_bits = builder_.CreateAnd(data, register_load(a_));
+    set_z(tested_bits);
+
+    llvm::Value *inverted_a =
+        builder_.CreateXor(register_load(a_), constant_u8(0xff));
+    return builder_.CreateAnd(data, inverted_a);
+}
+
+llvm::Value *FunctionBuilder::tsb(llvm::Value *data)
+{
+    // Test-and-set-bits helper: sets Z according to (data & a) and returns
+    // data with every bit that is set in a also set. Nothing is written to
+    // memory here; the result is simply returned to the caller.
+    llvm::Value *tested_bits = builder_.CreateAnd(data, register_load(a_));
+    set_z(tested_bits);
+
+    return builder_.CreateOr(data, register_load(a_));
+}
+
+void FunctionBuilder::set_nz(llvm::Value *data)
+{
+    register_store(convert_i8_to_jb(builder_.CreateAnd(data, 0x80)), flag_n_);
+    set_z(data); // N was taken from bit 7 of data above; set_z() handles Z
+}
+
+void FunctionBuilder::set_z(llvm::Value *data)
+{
+    llvm::Value *is_zero = builder_.CreateICmpEQ(data, constant_u8(0));
+    register_store(convert_i1_to_jb(is_zero), flag_z_); // Z = (data == 0)
+}
+
+llvm::Value *FunctionBuilder::flag_byte()
+{
+    // Build the status byte in p_tmp_ by ORing in a mask per true flag.
+    const Register *const flags[] =
+        { &flag_n_, &flag_v_, &flag_d_, &flag_i_, &flag_z_, &flag_c_ };
+    const uint8_t masks[] = { flagN, flagV, flagD, flagI, flagZ, flagC };
+    builder_.CreateStore(constant_u8(0), p_tmp_);
+    for (size_t i = 0; i < sizeof(masks) / sizeof(masks[0]); ++i)
+    {
+        flag_byte_bit(*flags[i], masks[i]);
+    }
+    return builder_.CreateLoad(p_tmp_);
+}
+
+void FunctionBuilder::flag_byte_bit(const Register &flag_reg, uint8_t flag_bit)
+{
+    // Emit IR equivalent to: if (flag_reg is true) p_tmp_ |= flag_bit;
+    llvm::BasicBlock *or_in_block =
+        llvm::BasicBlock::Create(context_, "bit_set", llvm_function_);
+    llvm::BasicBlock *join_block =
+        llvm::BasicBlock::Create(context_, "bit_done", llvm_function_);
+    llvm::Value *flag_is_true = jit_bool_is_true(register_load(flag_reg));
+    builder_.CreateCondBr(flag_is_true, or_in_block, join_block);
+
+    builder_.SetInsertPoint(or_in_block);
+    llvm::Value *current = builder_.CreateLoad(p_tmp_);
+    builder_.CreateStore(builder_.CreateOr(current, flag_bit), p_tmp_);
+    builder_.CreateBr(join_block);
+    builder_.SetInsertPoint(join_block);
+}
+
+void FunctionBuilder::illegal_instruction(uint16_t &ct_pc, int bytes)
+{
+    // Handles an undefined opcode occupying 'bytes' bytes at ct_pc. It is
+    // added to the disassembly (which also advances ct_pc past it) and, if
+    // an illegal instruction callback is registered for the opcode, code is
+    // generated to return result_illegal_instruction to our caller. With no
+    // callback the instruction generates no code at all.
+    const uint16_t opcode_at = ct_pc;
+    const uint8_t opcode = ct_memory_[opcode_at];
+
+    std::stringstream text;
+    text << "illegal " << hex_prefix << std::hex << std::setw(2) <<
+            std::setfill('0') << static_cast<int>(opcode) << " ";
+    const std::string mnemonic = text.str();
+
+    if (bytes == 1)
+    {
+        disassemble1(ct_pc, mnemonic);
+    }
+    else if (bytes == 2)
+    {
+        uint8_t operand8_unused;
+        disassemble2(ct_pc, mnemonic, operand8_unused);
+    }
+    else if (bytes == 3)
+    {
+        uint16_t operand16_unused;
+        disassemble3(ct_pc, mnemonic, operand16_unused);
+    }
+    else
+    {
+        CANT_HAPPEN("Invalid byte count (ct_pc 0x" << std::hex << ct_pc <<
+                    ", " << std::dec << "bytes " << bytes << ")");
+    }
+
+    // Illegal instructions are defined on the 65C02 to be no-ops, so nothing
+    // is generated unless a callback is registered.
+    if (callbacks_.illegal_instruction[opcode] != 0)
+    {
+        return_illegal_instruction(ct_pc, opcode_at, opcode);
+    }
+}
+
+FunctionBuilder::BoundedAddress FunctionBuilder::zp(uint8_t addr)
+{
+    // The llvm::Value is still a u16: it probably makes no difference, but
+    // memory addresses "are" 16 bits even if a real 6502 favours 8-bit ones.
+    const AddressRange bounds(addr);
+    return BoundedAddress(*this, constant_u16(addr), bounds);
+}
+
+FunctionBuilder::BoundedAddress FunctionBuilder::abs(uint16_t addr)
+{   // The address is a constant, so its bounds are just that one address.
+    return BoundedAddress(*this, constant_u16(addr), AddressRange(addr));
+}
+
+FunctionBuilder::BoundedAddress FunctionBuilder::abs_index(
+    llvm::Value *abs, llvm::Value *index)
+{
+    assert(abs->getType() == i16_type_);
+    assert(index->getType() == i8_type_);
+
+    // abs must be a compile-time constant. index is 8 bits wide, so the result
+    // lies in [abs, abs + 0x100); the end is 32 bits wide so it cannot wrap.
+    llvm::ConstantInt *abs_ci = llvm::cast<llvm::ConstantInt>(abs);
+    const uint16_t first = abs_ci->getLimitedValue();
+    const uint32_t past_last = static_cast<uint32_t>(first) + 0x100;
+    llvm::Value *effective = builder_.CreateAdd(abs, zext_i16(index));
+    return BoundedAddress(*this, effective, AddressRange(first, past_last));
+}
+
+FunctionBuilder::BoundedAddress FunctionBuilder::zp_index(
+    llvm::Value *zp, llvm::Value *index)
+{
+    assert(zp->getType() == i8_type_);
+    assert(index->getType() == i8_type_);
+    // The sum is formed in 8 bits, so it wraps within page zero.
+    llvm::Value *sum_u8 = builder_.CreateAdd(zp, index);
+    return BoundedAddress(*this, zext_i16(sum_u8), AddressRange(0, 0x100));
+}
+
+FunctionBuilder::BoundedAddress FunctionBuilder::zp_post_index(
+    llvm::Value *zp, llvm::Value *index)
+{
+    assert(zp->getType() == i8_type_);
+    assert(index->getType() == i8_type_);
+
+    // Read the 16-bit pointer at zp (zp + 1 wraps in 8 bits), then add index.
+    llvm::Value *base_low =
+        memory_read_untrapped(BoundedAddress(*this, zext_i16(zp)));
+    llvm::Value *zp_next = builder_.CreateAdd(zp, constant_u8(1));
+    llvm::Value *base_high =
+        memory_read_untrapped(BoundedAddress(*this, zext_i16(zp_next)));
+    llvm::Value *base = create_u16(base_low, base_high);
+    return BoundedAddress(*this, builder_.CreateAdd(base, zext_i16(index)));
+}
+
+FunctionBuilder::BoundedAddress FunctionBuilder::zp_pre_index(
+    llvm::Value *zp, llvm::Value *index)
+{
+    assert(zp->getType() == i8_type_);
+    assert(index->getType() == i8_type_);
+    // The pointer is read from zp + index and the next byte (8-bit wrap).
+    llvm::Value *ptr_low_at = builder_.CreateAdd(zp, index);
+    llvm::Value *ptr_high_at = builder_.CreateAdd(ptr_low_at, constant_u8(1));
+    llvm::Value *low =
+        memory_read_untrapped(BoundedAddress(*this, zext_i16(ptr_low_at)));
+    llvm::Value *high =
+        memory_read_untrapped(BoundedAddress(*this, zext_i16(ptr_high_at)));
+    return BoundedAddress(*this, create_u16(low, high));
+}
+
+llvm::Value *FunctionBuilder::check_predicted_rts(uint16_t subroutine_addr)
+{
+    llvm::Value *stacked_pc = pop_u16();
+    llvm::Value *new_pc = builder_.CreateAdd(stacked_pc, constant_u16(1));
+
+    // Returning new_pc straight away would be correct: our caller uses it
+    // to arrange a control transfer, and as it is only known at run time
+    // that transfer would have to be made by returning from the generated
+    // function. Instead we first compare it against some plausible guesses
+    // (currently never guaranteed to be correct); where a guess matches at
+    // run time the RTS becomes a branch within the generated function. This
+    // should save the overhead of leaving one function and entering another
+    // and may also allow the optimiser some additional leeway.
+    const AddressSet &guesses = predicted_rts_targets_[subroutine_addr];
+    TRACE("Generating predicted RTS code; " << guesses.size() << " target(s)");
+    AddressSet::const_iterator guess = guesses.begin();
+    while (guess != guesses.end())
+    {
+        llvm::Value *guessed_pc = constant_u16(*guess);
+        llvm::BasicBlock *hit_block =
+            llvm::BasicBlock::Create(context_, "prediction_correct",
+                                     llvm_function_);
+        llvm::BasicBlock *miss_block =
+            llvm::BasicBlock::Create(context_, "prediction_incorrect",
+                                     llvm_function_);
+        llvm::Value *hit = builder_.CreateICmpEQ(guessed_pc, new_pc);
+        builder_.CreateCondBr(hit, hit_block, miss_block);
+        builder_.SetInsertPoint(hit_block);
+        control_transfer_to(guessed_pc, opcode_rts);
+
+        // Later guesses, and the fallback, are only tried after a miss.
+        builder_.SetInsertPoint(miss_block);
+        ++guess;
+    }
+
+    return new_pc;
+}
+
+// Generate code to transfer control to 'target' on behalf of 'opcode'. If a
+// call callback applies (or a JSR's stack pushes overwrote JITted code) the
+// generated code returns to our caller. Otherwise it branches directly to the
+// target if that lies within this function, or returns to our caller if not.
+void FunctionBuilder::control_transfer_to(llvm::Value *target, uint8_t opcode)
+{
+    assert(target->getType() == i16_type_);
+
+    switch (opcode)
+    {
+        case opcode_rts:
+        case opcode_rti:
+        case opcode_bra:
+        case opcode_bcc:
+        case opcode_bcs:
+        case opcode_bvc:
+        case opcode_bvs:
+        case opcode_beq:
+        case opcode_bne:
+        case opcode_bmi:
+        case opcode_bpl:
+        case opcode_implicit:
+            // This control transfer never triggers a call callback.
+            break;
+
+        case opcode_jsr:
+        {
+            // A call callback may apply; the target is known at compile time.
+            llvm::ConstantInt *target_ci = 
+                llvm::cast<llvm::ConstantInt>(target);
+            uint16_t target16 = target_ci->getLimitedValue();
+            if (callbacks_.call[target16] != 0)
+            {
+                return_jsr_complex(target);
+                return;
+            }
+
+            // We also need to check if the two bytes pushed onto the stack by
+            // the JSR have invalidated any JITted code and return control to
+            // our caller if so. We work with a tmp_s i8 local so that this
+            // still works correctly if the stack pointer wrapped during the
+            // JSR pushes.
+            llvm::Value *tmp_s = 
+                builder_.CreateAdd(register_load(s_), constant_u8(1));
+            llvm::Value *stack_addr1 = 
+                builder_.CreateAdd(constant_u16(stack), zext_i16(tmp_s));
+            tmp_s = builder_.CreateAdd(tmp_s, constant_u8(1));
+            llvm::Value *stack_addr2 = 
+                builder_.CreateAdd(constant_u16(stack), zext_i16(tmp_s));
+
+            llvm::BasicBlock *code_not_modified_block = 
+                llvm::BasicBlock::Create(context_, "code_not_modified");
+            llvm::BasicBlock *code_addr1_not_modified_block = 
+                llvm::BasicBlock::Create(context_, "code_addr1_not_modified", 
+                                         llvm_function_);
+            llvm::BasicBlock *code_modified_block = 
+                llvm::BasicBlock::Create(context_, "code_modified", 
+                                         llvm_function_);
+
+            const AddressRange stack_range(stack, stack + 0x100);
+            llvm::Value *stack_addr1_is_code = 
+                is_code_at(BoundedAddress(*this, stack_addr1, stack_range));
+            builder_.CreateCondBr(stack_addr1_is_code, code_modified_block, 
+                                  code_addr1_not_modified_block);
+
+            builder_.SetInsertPoint(code_addr1_not_modified_block);
+            llvm::Value *stack_addr2_is_code = 
+                is_code_at(BoundedAddress(*this, stack_addr2, stack_range));
+            builder_.CreateCondBr(stack_addr2_is_code, code_modified_block, 
+                                  code_not_modified_block);
+
+            builder_.SetInsertPoint(code_modified_block);
+            return_jsr_complex(target);
+
+            llvm_function_->getBasicBlockList().push_back(
+                code_not_modified_block);
+            builder_.SetInsertPoint(code_not_modified_block);
+            break;
+        }
+
+        case opcode_jmp_abs:
+        {
+            // A call callback may apply; the target is known at compile time.
+            llvm::ConstantInt *target_ci = 
+                llvm::cast<llvm::ConstantInt>(target);
+            uint16_t target16 = target_ci->getLimitedValue();
+            if (callbacks_.call[target16] != 0)
+            {
+                return_control_transfer_indirect(target, opcode);
+                return;
+            }
+            break;
+        }
+
+        case opcode_jmp_ind_abs:
+        case opcode_jmp_indx_abs:
+        {
+            // This control transfer triggers a call callback if present. The
+            // target address is only known at run time.
+            assert(!llvm::isa<llvm::ConstantInt>(target));
+            llvm::Value *call_callback_addr = builder_.CreateGEP(
+                call_callbacks_, 
+                llvm::ArrayRef<llvm::Value *>(zext_i32(target)));
+            llvm::Value *call_callback = 
+                builder_.CreateLoad(call_callback_addr);
+            llvm::BasicBlock *call_callback_block = 
+                llvm::BasicBlock::Create(context_, "call_callback", 
+                                         llvm_function_);
+            llvm::BasicBlock *no_call_callback_block = 
+                llvm::BasicBlock::Create(context_, "no_call_callback", 
+                                         llvm_function_);
+            llvm::Value *call_callback_not_null = 
+                builder_.CreateIsNotNull(call_callback);
+            builder_.CreateCondBr(call_callback_not_null, call_callback_block, 
+                                  no_call_callback_block);
+
+            builder_.SetInsertPoint(call_callback_block);
+            return_control_transfer_indirect(target, opcode);
+
+            builder_.SetInsertPoint(no_call_callback_block);
+            break;
+        }
+
+        default:
+            CANT_HAPPEN("Unexpected opcode 0x" << std::hex << 
+                        static_cast<int>(opcode));
+    }
+
+    llvm::ConstantInt *target_ci = llvm::dyn_cast<llvm::ConstantInt>(target);
+    if ((target_ci != 0) && (
+            code_generated_for_address_[target_ci->getLimitedValue()] ||
+            (pending_.find(target_ci->getLimitedValue()) != pending_.end())))
+    {
+        ensure_address_block_created(target_ci->getLimitedValue());
+        // The target is within this function, so we can just branch there.
+        builder_.CreateBr(address_block_[target_ci->getLimitedValue()]);
+    }
+    else
+    {
+        // Not (knowably) within this function: get there via our caller.
+        return_control_transfer_direct(target);
+    }
+}
+
+// Every memory read must go through this function unless it is explicitly
+// exempt from read callbacks. A read callback is called directly when the
+// address is a compile-time constant with a callback registered, and via a
+// run-time table lookup when the address is variable and a callback exists
+// somewhere within its bounds; otherwise memory is simply read.
+llvm::Value *FunctionBuilder::memory_read(const BoundedAddress &ba)
+{
+    llvm::Value *addr = ba.addr();
+    llvm::ConstantInt *addr_ci = llvm::dyn_cast<llvm::ConstantInt>(addr);
+
+    if (addr_ci != 0)
+    {
+        // The address is known now, so whether a callback applies can be
+        // decided at compile time.
+        const uint16_t addr16 = addr_ci->getLimitedValue();
+        TRACE("Load at compile-time constant address 0x" << std::hex <<
+              std::setfill('0') << std::setw(4) << addr16);
+        if (callbacks_.read[addr16] == 0)
+        {
+            // Actually do the read from memory.
+            return memory_read_untrapped(ba);
+        }
+
+        TRACE("Read callback exists at constant address");
+        llvm::Value *callback =
+            constant_ptr(callbacks_.read[addr16], "read_callback");
+        return call_read_callback(callback, addr);
+    }
+
+    if (!callback_in_bounds(callbacks_.read, ba.bounds()))
+    {
+        TRACE("No read callback within address bounds");
+        // Actually do the read from memory.
+        return memory_read_untrapped(ba);
+    }
+
+    // A callback is registered somewhere the address might point, so look
+    // it up in the callback table at run time and call it if non-null.
+    TRACE("Read callback may exist; runtime check required");
+    llvm::Value *table_entry = builder_.CreateGEP(
+        read_callbacks_, llvm::ArrayRef<llvm::Value *>(zext_i32(addr)));
+    llvm::Value *read_callback = builder_.CreateLoad(table_entry);
+    llvm::BasicBlock *trapped_block =
+        llvm::BasicBlock::Create(context_, "read_callback", llvm_function_);
+    llvm::BasicBlock *untrapped_block =
+        llvm::BasicBlock::Create(context_, "no_read_callback", llvm_function_);
+    llvm::BasicBlock *done_block =
+        llvm::BasicBlock::Create(context_, "memory_read_done");
+    builder_.CreateCondBr(builder_.CreateIsNotNull(read_callback),
+                          trapped_block, untrapped_block);
+
+    // Both paths leave their result in read_callback_result_.
+    builder_.SetInsertPoint(trapped_block);
+    llvm::Value *trapped = call_read_callback(read_callback, ba.addr());
+    builder_.CreateStore(trapped, read_callback_result_);
+    builder_.CreateBr(done_block);
+
+    builder_.SetInsertPoint(untrapped_block);
+    llvm::Value *untrapped = memory_read_untrapped(ba);
+    builder_.CreateStore(untrapped, read_callback_result_);
+    builder_.CreateBr(done_block);
+
+    // done_block was created without a parent function; append it now so
+    // that it comes after the two blocks above.
+    llvm_function_->getBasicBlockList().push_back(done_block);
+    builder_.SetInsertPoint(done_block);
+    return builder_.CreateLoad(read_callback_result_);
+}
+
+llvm::Value *FunctionBuilder::memory_read_untrapped(const BoundedAddress &ba)
+{
+    llvm::Value *offset = zext_i32(ba.addr()); // index into memory_base_
+    llvm::ArrayRef<llvm::Value *> indices(offset);
+    return builder_.CreateLoad(builder_.CreateGEP(memory_base_, indices));
+}
+
+// Every memory write must go through this function unless it is explicitly
+// exempt from triggering write callbacks.
+//
+// The generated code may return to the caller with result_write_to_code or
+// result_write_callback, so this must be the last code-generation function
+// called when translating an opcode: code generated afterwards may not be
+// executed.
+void FunctionBuilder::memory_write(const BoundedAddress &ba,
+                                 llvm::Value *data, uint16_t next_opcode_at)
+{
+    llvm::Value *addr = ba.addr();
+    llvm::ConstantInt *addr_ci = llvm::dyn_cast<llvm::ConstantInt>(addr);
+
+    if (addr_ci != 0)
+    {
+        const uint16_t addr16 = addr_ci->getLimitedValue();
+        TRACE("Store at compile-time constant address 0x" << std::hex <<
+              std::setfill('0') << std::setw(4) << addr16);
+        if (callbacks_.write[addr16] != 0)
+        {
+            // A callback definitely applies, so return to our caller
+            // without generating the store.
+            TRACE("Write callback exists at constant address");
+            return_write_callback(next_opcode_at, addr, data);
+            return;
+        }
+    }
+    else if (!callback_in_bounds(callbacks_.write, ba.bounds()))
+    {
+        TRACE("No write callback within address bounds");
+    }
+    else
+    {
+        // A callback might apply; consult the callback table at run time.
+        TRACE("Write callback may exist; runtime check required");
+        llvm::Value *table_entry = builder_.CreateGEP(
+            write_callbacks_, llvm::ArrayRef<llvm::Value *>(zext_i32(addr)));
+        llvm::Value *write_callback = builder_.CreateLoad(table_entry);
+        llvm::BasicBlock *trapped_block =
+            llvm::BasicBlock::Create(context_, "write_callback",
+                                     llvm_function_);
+        llvm::BasicBlock *untrapped_block =
+            llvm::BasicBlock::Create(context_, "no_write_callback",
+                                     llvm_function_);
+        builder_.CreateCondBr(builder_.CreateIsNotNull(write_callback),
+                              trapped_block, untrapped_block);
+
+        builder_.SetInsertPoint(trapped_block);
+        return_write_callback(next_opcode_at, addr, data);
+
+        // Only the no-callback path continues on to the store below.
+        builder_.SetInsertPoint(untrapped_block);
+    }
+
+    // No callback applies on this path, so do the write ourselves.
+    memory_write_untrapped(ba, data, next_opcode_at);
+}
+
+// Emit a call to 'callback', passing it mpu_llvm_, 'addr' and 'data'.
+// Note that (like lib6502 proper) we don't externalise our registers before
+// invoking the (read/write) callback or internalise them afterwards, so
+// the callback doesn't see correct information if it examines the CPU state.
+llvm::Value *FunctionBuilder::call_callback(
+    llvm::Value *callback, llvm::Value *addr, llvm::Value *data)
+{
+    return builder_.CreateCall3(
+        callback, mpu_llvm_, addr, data, "callback_result");
+}
+
+llvm::Value *FunctionBuilder::call_read_callback(
+    llvm::Value *callback, llvm::Value *addr)
+{
+    llvm::Value *wide = call_callback(callback, addr, constant_u8(0));
+    return builder_.CreateTrunc(wide, i8_type_); // keep only the low 8 bits
+}
+
+// Store 'data' to memory without checking for write callbacks or for
+// modification of already JITted code.
+void FunctionBuilder::memory_write_raw(const BoundedAddress &ba,
+                                     llvm::Value *data)
+{
+    llvm::Value *offset = zext_i32(ba.addr());
+    llvm::ArrayRef<llvm::Value *> indices(offset);
+    builder_.CreateStore(data, builder_.CreateGEP(memory_base_, indices));
+}
+
+llvm::Value *FunctionBuilder::is_code_at(const BoundedAddress &ba)
+{
+    // If ba's bounds are limited and contain no known code, optimistically
+    // yield a constant false (recording the bounds); else test at run time.
+    const AddressRange &bounds = ba.bounds();
+    bool optimistic = !bounds.all_memory();
+    AddressRange::const_iterator it = bounds.begin();
+    while (optimistic && (it != bounds.end()))
+    {
+        const uint16_t candidate = *it;
+        if (code_at_address_[candidate])
+        {
+            TRACE("BoundedAddress " << ba <<
+                  " includes known code at 0x" << std::hex <<
+                  std::setfill('0') << std::setw(4) << candidate <<
+                  "; can't use optimistic write");
+            optimistic = false;
+        }
+        ++it;
+    }
+
+    if (!optimistic)
+    {
+        llvm::Value *flag_addr = builder_.CreateGEP(
+            code_at_address_llvm_,
+            llvm::ArrayRef<llvm::Value *>(zext_i32(ba.addr())));
+        return jit_bool_is_true(builder_.CreateLoad(flag_addr));
+    }
+    optimistic_writes_.insert(bounds);
+    return constant_i1(false);
+}
+
+// Store 'data' to memory, checking whether the store modified already
+// JITted code but not checking for write callbacks.
+//
+// The generated code may return to the caller with result_write_to_code,
+// so this must be the last code-generation function called when
+// translating an opcode: code generated afterwards may not be executed.
+void FunctionBuilder::memory_write_untrapped(
+    const BoundedAddress &ba, llvm::Value *data, uint16_t next_opcode_at)
+{
+    // Actually do the write, then see whether it modified JITted code.
+    memory_write_raw(ba, data);
+    llvm::Value *hit_code = is_code_at(ba);
+
+    // is_code_at() yields a constant false when it could rule code out at
+    // compile time. The optimiser would remove the dead branches in that
+    // case, but skipping them keeps the IR easier to read and perhaps helps
+    // the optimiser out.
+    llvm::ConstantInt *hit_ci = llvm::dyn_cast<llvm::ConstantInt>(hit_code);
+    const bool known_false = (hit_ci != 0) && !(hit_ci->getLimitedValue());
+    if (known_false)
+    {
+        return;
+    }
+
+    llvm::BasicBlock *modified_block =
+        llvm::BasicBlock::Create(context_, "code_modified", llvm_function_);
+    llvm::BasicBlock *unmodified_block =
+        llvm::BasicBlock::Create(context_, "code_not_modified", llvm_function_);
+    builder_.CreateCondBr(hit_code, modified_block, unmodified_block);
+
+    // If code was overwritten, return to our caller reporting the address.
+    builder_.SetInsertPoint(modified_block);
+    return_write_to_code(next_opcode_at, ba.addr());
+
+    // Otherwise carry on; the builder is left positioned in this block for
+    // whatever code is generated next.
+    builder_.SetInsertPoint(unmodified_block);
+}
+
+void FunctionBuilder::return_pc(Result result, llvm::Value *new_pc)
+{   // Generate a return to our caller, reporting 'result' and 'new_pc'.
+    builder_.CreateStore(constant_i(result), function_result_);
+    builder_.CreateStore(new_pc, pc_);
+    builder_.CreateBr(epilogue_); // the function is left via epilogue_
+}
+
+void FunctionBuilder::return_pc_addr(Result result, llvm::Value *new_pc, 
+                                     llvm::Value *addr)
+{   // Like return_pc(), but also stores 'addr' in field 11 of registers_.
+    builder_.CreateStore(constant_i(result), function_result_);
+    builder_.CreateStore(new_pc, pc_);
+    builder_.CreateStore(addr, builder_.CreateStructGEP(registers_, 11));
+    builder_.CreateBr(epilogue_);
+}
+
+void FunctionBuilder::return_pc_data(Result result, llvm::Value *new_pc, 
+                                     llvm::Value *data)
+{   // Like return_pc(), but also stores 'data' in field 12 of registers_.
+    builder_.CreateStore(constant_i(result), function_result_);
+    builder_.CreateStore(new_pc, pc_);
+    builder_.CreateStore(data, builder_.CreateStructGEP(registers_, 12));
+    builder_.CreateBr(epilogue_);
+}
+
+void FunctionBuilder::return_pc_addr_data(
+    Result result, llvm::Value *new_pc, llvm::Value *addr, llvm::Value *data)
+{   // Like return_pc(), with 'addr' and 'data' stored in registers_ too.
+    builder_.CreateStore(constant_i(result), function_result_);
+    builder_.CreateStore(new_pc, pc_);
+    builder_.CreateStore(addr, builder_.CreateStructGEP(registers_, 11));
+    builder_.CreateStore(data, builder_.CreateStructGEP(registers_, 12));
+    builder_.CreateBr(epilogue_);
+}
+
+void FunctionBuilder::return_control_transfer_direct(llvm::Value *new_pc)
+{   // Return to our caller so that it can continue execution at new_pc.
+    return_pc(result_control_transfer_direct, new_pc);
+}
+
+void FunctionBuilder::return_control_transfer_indirect(
+    llvm::Value *new_pc, uint8_t opcode)
+{
+    llvm::Value *opcode_u8 = constant_u8(opcode); // passed back as the data
+    return_pc_data(result_control_transfer_indirect, new_pc, opcode_u8);
+}
+
+void FunctionBuilder::return_brk(llvm::Value *new_pc)
+{   // Return to our caller with result_brk and the given new_pc.
+    return_pc(result_brk, new_pc);
+}
+
+void FunctionBuilder::return_jsr_complex(llvm::Value *new_pc)
+{   // Used by control_transfer_to() when a JSR must go via our caller.
+    return_pc(result_jsr_complex, new_pc);
+}
+
+void FunctionBuilder::return_illegal_instruction(
+    uint16_t new_pc, uint16_t opcode_at, uint8_t opcode)
+{   // 'opcode_at' is passed back as the addr and 'opcode' as the data.
+    return_pc_addr_data(result_illegal_instruction, constant_u16(new_pc), 
+                        constant_u16(opcode_at), constant_u8(opcode));
+}
+
+void FunctionBuilder::return_write_to_code(uint16_t new_pc, llvm::Value *addr)
+{   // 'addr' is passed back alongside the result and new_pc.
+    return_pc_addr(result_write_to_code, constant_u16(new_pc), addr);
+}
+
+void FunctionBuilder::return_write_callback(
+    uint16_t new_pc, llvm::Value *addr, llvm::Value *data)
+{
+    llvm::Value *resume_at = constant_u16(new_pc); // new PC as a constant
+    return_pc_addr_data(result_write_callback, resume_at, addr, data);
+}
+
+void FunctionBuilder::return_invalid_bounds()
+{   // Unlike the return_pc*() helpers, this leaves pc_ untouched.
+    builder_.CreateStore(constant_i(result_invalid_bounds), function_result_);
+    builder_.CreateBr(epilogue_);
+}
+
+void FunctionBuilder::disassemble1(uint16_t &addr, const std::string &s)
+{
+    disassemble_hex_dump(addr, 1); // address and byte columns
+    disassembly_ << s << "\n"; // the instruction text
+    ++addr; // step past the one-byte instruction
+}
+
+void FunctionBuilder::disassemble2(
+    uint16_t &addr, const std::string &prefix, uint8_t &operand,
+    const std::string &suffix)
+{
+    // Disassemble a two-byte instruction, returning its operand via 'operand'.
+    disassemble_hex_dump(addr, 2);
+    operand = operand8(addr);
+    disassembly_ << prefix << hex_prefix << std::setw(2) <<
+                    static_cast<int>(operand) << suffix;
+
+    // A bit of a special case: if the (multi-character) prefix ends in '#'
+    // and the operand is a printable character, show that character too.
+    const bool immediate = (prefix.length() > 1) && (*prefix.rbegin() == '#');
+    if (immediate && isprint(operand))
+    {
+        disassembly_ << " ('" << static_cast<char>(operand) << "')";
+    }
+    disassembly_ << "\n";
+    addr += 2;
+}
+
+void FunctionBuilder::disassemble3(
+    uint16_t &addr, const std::string &prefix, uint16_t &operand,
+    const std::string &suffix)
+{
+    disassemble_hex_dump(addr, 3);
+    operand = operand16(addr); // also handed back to the caller
+    disassembly_ << prefix << hex_prefix << std::setw(4) << operand;
+    disassembly_ << suffix << "\n";
+    addr += 3; // step past the instruction
+}
+
+void FunctionBuilder::disassemble_branch(
+    uint16_t &addr, const std::string &s, uint16_t &target)
+{
+    disassemble_hex_dump(addr, 2);
+    // The operand is a signed 8-bit displacement, measured from the PC
+    // *after* it has been moved past the branch instruction.
+    const uint8_t operand = operand8(addr);
+    const int displacement = (operand < 0x80) ? operand : (operand - 0x100);
+    addr += 2;
+    target = addr + displacement;
+    disassembly_ << s << hex_prefix << std::setw(4) << target << "\n";
+}
+
+void FunctionBuilder::disassemble_hex_dump(uint16_t addr, int bytes)
+{
+    // Address column, then 'bytes' byte values padded to three columns.
+    assert((bytes >= 0) && (bytes <= 3));
+    disassembly_ << std::hex << std::setw(4) << std::setfill('0') << addr << 
+                    " ";
+    for (int i = 0; i < bytes; ++i)
+    {
+        // Wrap to 16 bits so we never read beyond address 0xffff.
+        const uint16_t byte_at = static_cast<uint16_t>(addr + i);
+        disassembly_ << std::setw(2) << 
+                        static_cast<int>(ct_memory_[byte_at]) << " ";
+    }
+    for (int i = bytes; i < 3; ++i)
+    {
+        disassembly_ << "   ";
+    }
+}
diff --git a/FunctionBuilder.h b/FunctionBuilder.h
new file mode 100644
index 0000000..da2df8d
--- /dev/null
+++ b/FunctionBuilder.h
@@ -0,0 +1,364 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef FUNCTIONBUILDER_H
+#define FUNCTIONBUILDER_H
+
+#include <boost/shared_ptr.hpp>
+#include <boost/utility.hpp>
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypeBuilder.h"
+#include "llvm/IR/Value.h"
+#include <map>
+#include <set>
+#include <sstream>
+
+#include "AddressSet.h"
+#include "const.h"
+#include "JitBool.h"
+#include "lib6502.h"
+
+class Function;
+struct LLVMStuff;
+
+// Builds a Function object (via LLVM IR) for the 6502 code starting at a
+// given address; see the constructor and build() below.
+class FunctionBuilder : boost::noncopyable
+{
+public:
+    // Create a FunctionBuilder object which can be used to build a Function
+    // representing the code starting at 'address'. The Function object built
+    // will operate on the given M6502 object. The 'code_at_address' array
+    // will be used at compile time and at runtime to decide if writes to
+    // memory may invalidate already JITted code. The memory inside the M6502
+    // object will be used when the Function object executes, but ct_memory
+    // will be used at compile time to determine the instructions to compile;
+    // see FunctionManager for more on this.
+    FunctionBuilder(M6502 *mpu, const uint8_t *ct_memory,
+                    JitBool *code_at_address, uint16_t address);
+
+    boost::shared_ptr<Function> build();
+
+    // Status codes returned by the JITted function
+    enum Result
+    {
+        // Control has transferred to the address in registers.pc. No call
+        // callback should be invoked, either because the JITted function knows
+        // there is no applicable call callback or because the control transfer
+        // is via an instruction which does not trigger call callbacks.
+        result_control_transfer_direct,
+
+        // Control has transferred to the address in registers.pc via an
+        // instruction which is eligible for call callbacks. registers.data
+        // contains the opcode of the instruction which transferred
+        // control. The caller should check for an applicable call
+        // callback. registers.addr is *not* updated; the addr value for
+        // the callback is registers.pc.
+        result_control_transfer_indirect,
+
+        // A BRK instruction has just been executed and registers.pc updated
+        // to point to the BRK vector. The caller should check to see if the
+        // stack pushes implicitly performed by BRK have invalidated any
+        // already-JITted code and for a call callback on the BRK vector.
+        // Neither registers.addr nor registers.data are updated.
+        result_brk,
+
+        // A JSR instruction has just been executed and registers.pc
+        // updated to point to the destination address. One or both of the
+        // following may be true:
+        // - the stack pushes implicitly performed have invalidated some
+        //   already-JITted code
+        // - a call callback is registered on the destination address
+        // It is not guaranteed that either of these is the case, although in
+        // practice with this implementation at least one should be true. Not
+        // all JSR instructions will necessarily cause the JITted function to
+        // return this value, hence the result code is result_jsr_*complex*
+        // not just result_jsr. Neither registers.addr nor registers.data are
+        // updated.
+        result_jsr_complex,
+
+        // An illegal instruction has been executed and registers.pc updated to
+        // point to the following opcode. registers.addr contains the address
+        // of the illegal instruction and registers.data its opcode. The
+        // caller should check to see if a callback is registered.
+        result_illegal_instruction,
+
+        // A memory write has been executed which changed an address marked
+        // as holding code. registers.addr contains the address modified. The
+        // caller should invalidate any JITted functions for this address.
+        result_write_to_code,
+
+        // A memory write has occurred which triggers a write callback. Memory
+        // has not been updated. registers.addr and registers.data contain the
+        // address and the data being written respectively. The caller should
+        // invoke the write callback and check for writes to already-JITted
+        // code.
+        result_write_callback,
+
+        // Internal bounds generated for an instruction's address range were
+        // found to be invalid by self-checking code. This can only occur
+        // in debug builds and then only if there is a bug in FunctionBuilder.
+        result_invalid_bounds
+    };
+
+private:
+    uint16_t build_at(uint16_t ct_pc);
+
+    uint8_t operand8(uint16_t opcode_at);
+    uint16_t operand16(uint16_t opcode_at);
+
+    llvm::Value *constant_i1(bool c);
+    llvm::Value *constant_u8(uint8_t c);
+    llvm::Value *constant_u16(uint16_t c);
+    llvm::Value *constant_u32(uint32_t c);
+    llvm::Value *constant_u64(uint64_t c);
+
+    // Return an LLVM value of type T * holding the host address 'p'.
+    template <class T>
+    llvm::Value *constant_ptr(T *p, const std::string &name)
+    {
+        // uintptr_t is guaranteed to be able to hold a pointer, whereas
+        // unsigned long is narrower than a pointer on some 64-bit platforms
+        // (e.g. 64-bit Windows).
+        llvm::Value *v = constant_u64(reinterpret_cast<uintptr_t>(p));
+        // The name passed in never seems to be used, but maybe this will
+        // change in the future. It doesn't really do us any harm to pass
+        // it in anyway.
+        return builder_.CreateIntToPtr(
+            v, llvm::TypeBuilder<T *, false>::get(llvm::getGlobalContext()),
+            name);
+    }
+
+    llvm::Value *constant_i(int c);
+
+    llvm::Value *constant_jb(JitBool c);
+    llvm::Value *convert_i1_to_jb(llvm::Value *v);
+    llvm::Value *convert_i8_to_jb(llvm::Value *v);
+    llvm::Value *convert_i16_to_jb(llvm::Value *v);
+    llvm::Value *jit_bool_is_true(llvm::Value *v);
+    llvm::Value *jit_bool_is_false(llvm::Value *v);
+
+    llvm::Value *convert_i1_to_i8(llvm::Value *v);
+
+    llvm::Value *zext_i16(llvm::Value *v);
+    llvm::Value *zext_i32(llvm::Value *v);
+    llvm::Value *sext_i16(llvm::Value *v);
+    llvm::Value *trunc_i8(llvm::Value *v);
+    llvm::Value *create_u16(llvm::Value *low_byte, llvm::Value *high_byte);
+
+    struct Register
+    {
+        llvm::Value *v_;
+        bool modified_;
+    };
+    void initialise_i8_reg(Register &r, int structure_index,
+                           const std::string &name);
+    void initialise_jb_reg(Register &r, int structure_index,
+                           const std::string &name);
+
+    void ensure_address_block_created(uint16_t addr);
+
+    void return_pc(Result result, llvm::Value *new_pc);
+    void return_pc_addr(Result result, llvm::Value *new_pc, llvm::Value *addr);
+    void return_pc_data(Result result, llvm::Value *new_pc, llvm::Value *data);
+    void return_pc_addr_data(Result result, llvm::Value *new_pc,
+                             llvm::Value *addr, llvm::Value *data);
+    void return_control_transfer_direct(llvm::Value *new_pc);
+    void return_control_transfer_indirect(llvm::Value *new_pc, uint8_t opcode);
+    void return_brk(llvm::Value *new_pc);
+    void return_jsr_complex(llvm::Value *new_pc);
+    void return_illegal_instruction(uint16_t new_pc, uint16_t opcode_at,
+                                    uint8_t opcode);
+    void return_write_to_code(uint16_t new_pc, llvm::Value *addr);
+    void return_write_callback(uint16_t new_pc, llvm::Value *addr,
+                               llvm::Value *data);
+    void return_invalid_bounds();
+
+    class BoundedAddress;
+
+    llvm::Value *register_load(const Register &r);
+    void register_store(llvm::Value *v, Register &r);
+
+    typedef llvm::Value *(FunctionBuilder::*OpFn)(llvm::Value *data);
+    void register_op(OpFn op, Register &r);
+    void memory_op(OpFn op, const BoundedAddress &ba, uint16_t next_opcode_at);
+
+    llvm::Value *is_code_at(const BoundedAddress &addr);
+
+    void adc(llvm::Value *data);
+    void adc_llvm(llvm::Value *data);
+    void adc_binary(llvm::Value *data);
+    void adc_decimal(llvm::Value *data);
+    void adc_binary_llvm(llvm::Value *data);
+    void adc_decimal_llvm(llvm::Value *data);
+    void And(llvm::Value *data);
+    llvm::Value *asl(llvm::Value *data);
+    void bit(llvm::Value *data);
+    void branch(Register &flag, bool branch_if, uint16_t target);
+    void cmp(llvm::Value *r, llvm::Value *data);
+    void cmp_llvm(llvm::Value *r, llvm::Value *data);
+    llvm::Value *dec(llvm::Value *data);
+    void eor(llvm::Value *data);
+    llvm::Value *inc(llvm::Value *data);
+    void ld(Register &r, llvm::Value *data);
+    llvm::Value *lsr(llvm::Value *data);
+    void ora(llvm::Value *data);
+    void pop_flags();
+    llvm::Value *pop_u8();
+    llvm::Value *pop_u16();
+    void push_u8_raw(llvm::Value *data);
+    void push_u16_raw(uint16_t u);
+    void push_u8(llvm::Value *data, uint16_t next_opcode_at);
+    llvm::Value *rol(llvm::Value *data);
+    llvm::Value *ror(llvm::Value *data);
+    void sbc(llvm::Value *data);
+    void sbc_binary(llvm::Value *data);
+    void sbc_decimal(llvm::Value *data);
+    void sbc_overflow(llvm::Value *data,
+                      llvm::Value *borrow);
+    void transfer(const Register &from, Register &to);
+    llvm::Value *trb(llvm::Value *data);
+    llvm::Value *tsb(llvm::Value *data);
+
+    void set_nz(llvm::Value *data);
+    void set_z(llvm::Value *data);
+
+    llvm::Value *flag_byte();
+    void flag_byte_bit(const Register &flag_reg, uint8_t flag_bit);
+
+    void illegal_instruction(uint16_t &ct_pc, int bytes);
+
+    BoundedAddress zp(uint8_t addr);
+    BoundedAddress abs(uint16_t addr);
+    BoundedAddress abs_index(llvm::Value *abs,
+                           llvm::Value *index);
+    BoundedAddress zp_index(llvm::Value *zp,
+                             llvm::Value *r);
+    BoundedAddress zp_post_index(
+        llvm::Value *zp, llvm::Value *index);
+    BoundedAddress zp_pre_index(
+        llvm::Value *zp, llvm::Value *index);
+
+    llvm::Value *check_predicted_rts(uint16_t subroutine_addr);
+
+    // A special opcode used as the second argument to control_transfer_to
+    // when there is no explicit opcode causing the control transfer; this
+    // is just a documented way to signal that the control transfer is direct
+    // and cannot trigger a call callback.
+    enum {
+        opcode_implicit = 0xff
+    };
+    void control_transfer_to(llvm::Value *target, uint8_t opcode);
+
+    llvm::Value *memory_read(const BoundedAddress &ba);
+    llvm::Value *memory_read_untrapped(const BoundedAddress &ba);
+
+    void memory_write(const BoundedAddress &ba,
+                           llvm::Value *data, uint16_t next_opcode_at);
+    void memory_write_untrapped(const BoundedAddress &ba,
+                                llvm::Value *data, uint16_t next_opcode_at);
+    void memory_write_raw(const BoundedAddress &ba,
+                               llvm::Value *data);
+
+    llvm::Value *call_callback(
+        llvm::Value *callback, llvm::Value *addr,
+        llvm::Value *data);
+    llvm::Value *call_read_callback(
+        llvm::Value *callback, llvm::Value *addr);
+
+    void disassemble1(uint16_t &addr, const std::string &s);
+    void disassemble2(uint16_t &addr, const std::string &prefix,
+                      uint8_t &operand, const std::string &suffix = "");
+    void disassemble3(uint16_t &addr, const std::string &prefix,
+                      uint16_t &operand, const std::string &suffix = "");
+    void disassemble_branch(uint16_t &addr, const std::string &s,
+                            uint16_t &target);
+    void disassemble_hex_dump(uint16_t addr, int bytes);
+
+    bool built_;
+
+    M6502 *const mpu_;
+    JitBool *code_at_address_;
+    const uint16_t address_;
+    const uint8_t *const ct_memory_;
+    // callbacks_ is strictly redundant as it's available inside mpu, but
+    // it's convenient.
+    const M6502_Callbacks &callbacks_;
+
+    AddressSet code_range_;
+    AddressSet optimistic_writes_;
+
+    std::stringstream disassembly_;
+
+    int instructions_;
+    const int max_instructions_;
+
+    // This could be an AddressSet but since we "rely" on the order of
+    // iteration for pending_ it seems better to be explicit; we don't need
+    // any of the range-handling convenience of AddressSet here anyway.
+    std::set<uint16_t> pending_;
+
+    std::map<uint16_t, AddressSet> predicted_rts_targets_;
+
+    llvm::LLVMContext &context_;
+
+    llvm::Type *const native_int_type_;
+    llvm::PointerType *const callback_type_;
+    llvm::Type *const i1_type_;
+    llvm::Type *const i8_type_;
+    llvm::Type *const i16_type_;
+    llvm::Type *const i32_type_;
+    llvm::Type *const i64_type_;
+    llvm::Type *const jit_bool_type_;
+
+    llvm::IRBuilder<> &builder_;
+
+    llvm::Function *llvm_function_;
+
+    llvm::Value *registers_;
+    llvm::Value *code_at_address_llvm_;
+    llvm::Value *read_callbacks_;
+    llvm::Value *write_callbacks_;
+    llvm::Value *call_callbacks_;
+    llvm::Value *memory_base_;
+    llvm::Value *mpu_llvm_;
+
+    llvm::Value *function_result_;
+
+    // Note that address_block_ and code_generated_for_address_ aren't
+    // redundant; address_block_ elements are created (for example) when
+    // a branch means the corresponding address must have a BasicBlock
+    // created for use as a branch target, but that doesn't mean code has
+    // been generated for it yet.
+    llvm::BasicBlock *address_block_[memory_size];
+    bool code_generated_for_address_[memory_size];
+
+    Register a_;
+    Register x_;
+    Register y_;
+    Register s_;
+    Register flag_n_;
+    Register flag_v_;
+    Register flag_d_;
+    Register flag_i_;
+    Register flag_z_;
+    Register flag_c_;
+    llvm::Value *pc_;
+
+    llvm::Value *read_callback_result_;
+    llvm::Value *p_tmp_;
+    llvm::Value *l_tmp_;
+    llvm::Value *s_tmp_;
+    llvm::Value *t_tmp_;
+
+    llvm::BasicBlock *epilogue_;
+};
+
+#endif
diff --git a/FunctionManager.cpp b/FunctionManager.cpp
new file mode 100644
index 0000000..51f60b7
--- /dev/null
+++ b/FunctionManager.cpp
@@ -0,0 +1,310 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "FunctionManager.h"
+
+#include <functional>
+
+#include "Function.h"
+#include "FunctionBuilder.h"
+#include "M6502Internal.h"
+#include "Registers.h"
+#include "util.h"
+
+// Construct a FunctionManager for 'mpu'. No JIT thread is created here; see
+// build_function_lazy(). The memory snapshot, the function_for_address_ table
+// and the code_at_address_ flags are value-initialised, i.e. start out zeroed.
+FunctionManager::FunctionManager(M6502 *mpu)
+: jit_thread_idle_(true), work_available_(false),
+  // Only meaningful while work_available_ is set, but give it a defined value
+  // rather than leaving it indeterminate.
+  jit_thread_address_(0),
+  quit_(false), mpu_(mpu), memory_snapshot_(), function_for_address_(),
+  code_at_address_()
+{
+}
+
+// If a JIT thread was ever created, tell it to quit and wait for it to exit.
+FunctionManager::~FunctionManager()
+{
+    // A default-constructed thread id means the JIT thread was never
+    // started, so there is nothing to shut down.
+    if (jit_thread_.get_id() == boost::thread::id())
+    {
+        return;
+    }
+
+    TRACE("Notifying JIT thread to quit");
+    {
+        // quit_ is examined by the JIT thread while it holds this mutex.
+        boost::mutex::scoped_lock cv_lock(jit_thread_cv_mutex_);
+        quit_ = true;
+    }
+    jit_thread_cv_.notify_all();
+
+    TRACE("Joining with JIT thread");
+    jit_thread_.join();
+}
+
+// Return the current value of the jit_thread_idle_ flag, read while holding
+// the mutex which protects it.
+bool FunctionManager::jit_thread_idle()
+{
+    boost::mutex::scoped_lock idle_lock(jit_thread_idle_mutex_);
+    const bool idle = jit_thread_idle_;
+    return idle;
+}
+
+// Bring memory_snapshot_ into line with the emulated CPU's memory. Any
+// address flagged in code_at_address_ whose contents differ from the snapshot
+// is reported via code_modified_at() first. Must only be called while the JIT
+// thread is idle.
+void FunctionManager::update_memory_snapshot()
+{
+    assert(jit_thread_idle());
+
+    const uint8_t *const live_memory = mpu_->memory;
+    for (size_t addr = 0; addr < memory_size; ++addr)
+    {
+        const uint8_t current = live_memory[addr];
+        if (code_at_address_[addr] && (memory_snapshot_[addr] != current))
+        {
+            code_modified_at(addr);
+        }
+        memory_snapshot_[addr] = current;
+    }
+}
+
+// Build a Function for the code starting at 'address', taking the instructions
+// to compile from 'ct_memory', register it via add_function() and return it.
+Function *FunctionManager::build_function_internal(
+    uint16_t address, const uint8_t *ct_memory)
+{
+    // Build at the address we were asked for, not at registers.pc. The two
+    // need not agree (e.g. when the request comes from the JIT thread with
+    // jit_thread_address_), and build_function() looks the result up in
+    // function_for_address_[address] afterwards.
+    TRACE("Building Function for code at 0x" << std::hex << std::setfill('0') <<
+          std::setw(4) << address);
+    FunctionBuilder fb(mpu_, ct_memory, code_at_address_, address);
+    boost::shared_ptr<Function> f(fb.build());
+    add_function(f);
+    return f.get();
+}
+
+// Build the Function for the code starting at 'address', compiling from
+// 'ct_memory', and return it. The loop only exits once function_for_address_
+// holds a function for 'address', so the result is never null.
+//
+// Registering the new function marks its code range in code_at_address_. Any
+// existing function which performs optimistic writes to one of those
+// addresses is then destroyed so it will be regenerated. If the new function
+// was itself one of those writers it gets destroyed too, in which case we go
+// round the loop once more to rebuild it; the asserts document the
+// expectation that a second pass is always enough.
+Function *FunctionManager::build_function(uint16_t address, 
+                                          const uint8_t *ct_memory)
+{
+    Function *f;
+    int pass = 0;
+    do
+    {
+        // At most two passes are expected: the initial build and one rebuild.
+        assert(pass < 2);
+        ++pass;
+
+        f = build_function_internal(address, ct_memory);
+
+        bool f_is_optimistic_self_writer = false;
+        // This reference points inside *f, so it dangles once f is destroyed.
+        const AddressSet &code_range = f->code_range();
+        for (AddressSet::const_iterator it = code_range.begin();
+             it != code_range.end(); ++it)
+        {
+            uint16_t i = *it;
+            if (code_at_address_[i] && 
+                !optimistic_writers_for_address_[i].empty())
+            {
+                // There is now code at an address where optimistic writes are
+                // performed. Future code generation won't create optimistic
+                // writes there because code_at_address_[i] has now been set,
+                // but we need to destroy existing functions which perform
+                // that write so they will be regenerated.
+                const FunctionSet &optimistic_writers = 
+                    optimistic_writers_for_address_[i];
+                // This must be determined *before* the set is emptied below.
+                f_is_optimistic_self_writer = 
+                    (optimistic_writers.find(f) != optimistic_writers.end());
+                destroy_functions_in_set(optimistic_writers_for_address_[i]);
+                if (f_is_optimistic_self_writer)
+                {
+                    // destroy_functions_in_set() has now destroyed f, so a)
+                    // code_range is no longer a valid reference b) there's
+                    // no need to continue iterating over f's code range.
+                    break;
+                }
+
+            }
+        }
+
+        // We might just have destroyed the function we built, if it modified
+        // its own code, so we need to loop round if so.
+        f = function_for_address_[address];
+        if (f == 0)
+        {
+            assert(f_is_optimistic_self_writer);
+            TRACE("Rebuilding just-created function");
+        }
+    }
+    while (f == 0);
+
+    TRACE(f->dump_all());
+
+    return f;
+}
+
+// Ask the JIT thread to build a Function for the code at 'address' in the
+// background, creating the thread first if it doesn't exist yet. Must only be
+// called while the JIT thread is idle. This returns without waiting for the
+// build; jit_thread_idle() reports true again once it has finished.
+void FunctionManager::build_function_lazy(uint16_t address)
+{
+    assert(jit_thread_idle());
+
+    TRACE("Will build Function for address 0x" << std::hex <<
+          std::setfill('0') << std::setw(4) << address << " in background");
+
+    // We only create the JIT thread the first time it's needed; this avoids it
+    // existing if the library is being used in interpreted or compiled mode.
+    if (jit_thread_.get_id() == boost::thread::id())
+    {
+        TRACE("Creating JIT thread");
+        // boost::thread accepts a member function pointer plus the object to
+        // invoke it on directly, so there's no need for std::mem_fun (which
+        // is deprecated in C++11 and removed in C++17).
+        boost::thread t(&FunctionManager::build_function_thread, this);
+        jit_thread_.swap(t);
+    }
+
+    // Mark the thread busy before handing over the work so that callers of
+    // jit_thread_idle() can't see it as idle once the request is pending.
+    {
+        boost::mutex::scoped_lock lock(jit_thread_idle_mutex_);
+        jit_thread_idle_ = false;
+    }
+    {
+        boost::mutex::scoped_lock lock(jit_thread_cv_mutex_);
+        work_available_ = true;
+        jit_thread_address_ = address;
+    }
+    jit_thread_cv_.notify_all();
+}
+
+// Entry point of the JIT thread. Repeatedly waits for a build request (or a
+// request to quit), builds the Function for jit_thread_address_ and then
+// marks the thread idle again. Any std::exception escaping is passed to die().
+void FunctionManager::build_function_thread()
+{
+    try
+    {
+        TRACE("JIT thread started");
+        boost::mutex::scoped_lock cv_lock(jit_thread_cv_mutex_);
+        for (;;)
+        {
+            // Sleep until there is something to do; the condition is
+            // re-checked after every wake-up.
+            while (!(quit_ || work_available_))
+            {
+                TRACE("JIT thread waiting to be signalled");
+                jit_thread_cv_.wait(cv_lock);
+            }
+
+            if (quit_)
+            {
+                TRACE("JIT thread quitting");
+                return;
+            }
+
+            TRACE("JIT thread about to build Function at address 0x" <<
+                  std::hex << std::setfill('0') << std::setw(4) <<
+                  jit_thread_address_);
+            assert(work_available_);
+            assert(!jit_thread_idle_);
+
+            // We deliberately compile from memory_snapshot_ rather than
+            // mpu_->memory, even though update_memory_snapshot() "should"
+            // invalidate Function objects which depend on modified code
+            // before any of them are used. If the interpreter temporarily
+            // changes a location before it is translated and then restores
+            // the original value before update_memory_snapshot() is called,
+            // update_memory_snapshot() has no way to notice, yet the
+            // temporary value would have been compiled into the Function
+            // object. (See test/z-self-modify-2.xa; this breaks in hybrid
+            // mode if memory_snapshot_ isn't used here.)
+            build_function(jit_thread_address_, memory_snapshot_);
+            work_available_ = false;
+
+            // Held until the end of this loop iteration.
+            boost::mutex::scoped_lock idle_lock(jit_thread_idle_mutex_);
+            jit_thread_idle_ = true;
+        }
+    }
+    catch (std::exception &e)
+    {
+        die(e.what());
+    }
+}
+
+// Register 'f' with the manager: take ownership of it, index it by its start
+// address, record it against every address in its code range (flagging those
+// addresses as holding code) and against every address it writes to
+// optimistically.
+void FunctionManager::add_function(const boost::shared_ptr<Function> &f)
+{
+    Function *const raw = f.get();
+
+    function_for_address_[raw->address()] = raw;
+    function_for_address_owner_[raw->address()] = f;
+
+    const AddressSet &covered = raw->code_range();
+    AddressSet::const_iterator code_it = covered.begin();
+    while (code_it != covered.end())
+    {
+        const uint16_t code_addr = *code_it;
+        functions_covering_address_[code_addr].insert(raw);
+        code_at_address_[code_addr] = true;
+        ++code_it;
+    }
+
+    const AddressSet &written = raw->optimistic_writes();
+    AddressSet::const_iterator write_it = written.begin();
+    while (write_it != written.end())
+    {
+        const uint16_t write_addr = *write_it;
+        optimistic_writers_for_address_[write_addr].insert(raw);
+        ++write_it;
+    }
+}
+
+// Destroy every Function whose code range includes 'address' and refresh the
+// snapshot entry for that address from the emulated CPU's memory.
+void FunctionManager::code_modified_at(uint16_t address)
+{
+    // Callers don't always check code_at_address_[address] before calling
+    // us, and we could return straight away when it's false. In practice I
+    // doubt skipping that check has a significant impact on performance.
+
+    TRACE("Code modified at 0x" << std::hex << std::setfill('0') <<
+          std::setw(4) << address);
+
+    FunctionSet &stale_functions = functions_covering_address_[address];
+    destroy_functions_in_set(stale_functions);
+
+    // Updating memory_snapshot_ here stops the next call to
+    // update_memory_snapshot() from harmlessly-but-inefficiently destroying
+    // perfectly valid Function objects.
+    const uint8_t current_value = mpu_->memory[address];
+    memory_snapshot_[address] = current_value;
+}
+
+// Destroy every Function in 'function_set', leaving the set empty.
+void FunctionManager::destroy_functions_in_set(FunctionSet &function_set)
+{
+    // destroy_function() erases the function from function_set itself, which
+    // would invalidate any iterator we kept hold of; so instead of iterating
+    // we keep taking the first element until none are left.
+    for (;;)
+    {
+        if (function_set.empty())
+        {
+            return;
+        }
+        Function *const doomed = *function_set.begin();
+        destroy_function(doomed);
+    }
+}
+
+// Remove 'f' from all of the manager's bookkeeping and then release it, which
+// deletes the Function object.
+void FunctionManager::destroy_function(Function *f)
+{
+    const AddressSet &covered = f->code_range();
+    AddressSet::const_iterator code_it = covered.begin();
+    while (code_it != covered.end())
+    {
+        const uint16_t code_addr = *code_it;
+        size_t erased_count = functions_covering_address_[code_addr].erase(f);
+        ASSERT_EQUAL(erased_count, 1);
+        // code_at_address_[code_addr] is deliberately left set, even if no
+        // function covers the address any more: it records that code has
+        // been executed there. build_function() depends on this, because
+        // code_at_address_ selects between optimistic and non-optimistic
+        // writes; clearing it here would make a self-modifying function
+        // loop forever inside build_function(). Clearing it at the end of
+        // build_function() for addresses with an empty
+        // functions_covering_address_ set would be OK, but we currently
+        // don't.
+        ++code_it;
+    }
+
+    const AddressSet &written = f->optimistic_writes();
+    AddressSet::const_iterator write_it = written.begin();
+    while (write_it != written.end())
+    {
+        const uint16_t write_addr = *write_it;
+        size_t erased_count = optimistic_writers_for_address_[write_addr].erase(f);
+        ASSERT_EQUAL(erased_count, 1);
+        ++write_it;
+    }
+
+    assert(function_for_address_[f->address()] == f);
+    function_for_address_[f->address()] = 0;
+
+    // Releasing the owning pointer deletes the Function object, so this has
+    // to be the very last thing we do with f.
+    assert(function_for_address_owner_[f->address()].get() == f);
+    function_for_address_owner_[f->address()].reset();
+}
diff --git a/FunctionManager.h b/FunctionManager.h
new file mode 100644
index 0000000..141fe7a
--- /dev/null
+++ b/FunctionManager.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef FUNCTIONMANAGER_H
+#define FUNCTIONMANAGER_H
+
+#include <assert.h>
+#include <boost/shared_ptr.hpp>
+#include <boost/thread/condition_variable.hpp>
+#include <boost/thread/mutex.hpp>
+#include <boost/thread/thread.hpp>
+#include <boost/utility.hpp>
+#include <set>
+#include <stdint.h>
+
+#include "const.h"
+#include "JitBool.h"
+#include "lib6502.h"
+
+class Function;
+
+// Owns the Function objects built for an M6502's code, indexes them by start
+// address, and tracks which addresses they depend on so they can be destroyed
+// when that code changes. Can also build Function objects on a background
+// ("JIT") thread; see get_function_lazy().
+class FunctionManager : boost::noncopyable
+{
+public:
+    // explicit: an M6502 * should never silently convert to a
+    // FunctionManager.
+    explicit FunctionManager(M6502 *mpu);
+    ~FunctionManager();
+
+    bool jit_thread_idle();
+
+    void update_memory_snapshot();
+
+    // Return a Function object representing the code starting at 'address'; if
+    // one does not already exist it will be created. This never returns null.
+    Function *get_function(uint16_t address)
+    {
+        Function *const existing = function_for_address_[address];
+        if (existing == 0)
+        {
+            return build_function(address, mpu_->memory);
+        }
+        return existing;
+    }
+
+    // Return a Function object representing the code starting at 'address',
+    // if one is available, otherwise return null. When null is returned
+    // a background thread may be used to generate a Function object which
+    // can be returned if the request is repeated in the future.
+    //
+    // This function may only be called if the last call to jit_thread_idle()
+    // returned true and no call has been made to get_function_lazy() since
+    // jit_thread_idle() was called.
+    //
+    // Currently a background thread will *always* be invoked if null is
+    // returned, but this is not guaranteed. For example, we may wish to
+    // refuse to waste time building a Function object which we expect to
+    // be invalidated by self-modifying code shortly afterwards.
+    Function *get_function_lazy(uint16_t address)
+    {
+        // This assert() is perfectly correct, but it single-handedly destroys
+        // the performance of a debug build; it's just not *that* valuable.
+        // assert(jit_thread_idle());
+
+        Function *const existing = function_for_address_[address];
+        if (existing == 0)
+        {
+            build_function_lazy(address);
+        }
+        return existing;
+    }
+
+    void code_modified_at(uint16_t address);
+
+private:
+    void add_function(const boost::shared_ptr<Function> &f);
+
+    Function *build_function(uint16_t address, const uint8_t *ct_memory);
+    Function *build_function_internal(uint16_t address,
+                                      const uint8_t *ct_memory);
+
+    void build_function_lazy(uint16_t address);
+    void build_function_thread();
+
+    typedef std::set<Function *> FunctionSet;
+    void destroy_functions_in_set(FunctionSet &function_set);
+
+    void destroy_function(Function *f);
+
+    boost::thread jit_thread_;
+
+    boost::mutex jit_thread_idle_mutex_;
+    bool jit_thread_idle_;
+
+    boost::mutex jit_thread_cv_mutex_;
+    boost::condition_variable jit_thread_cv_;
+    bool work_available_;
+    uint16_t jit_thread_address_;
+    bool quit_;
+
+    M6502 *mpu_;
+
+    // A copy of the emulated CPU's memory, used to detect changes to already
+    // JITted code which happen in callbacks and to avoid problems with JITting
+    // while the interpreter is running (in hybrid mode).
+    uint8_t memory_snapshot_[memory_size];
+
+    // We maintain this array of shared_ptr's which actually own the
+    // Function objects.
+    boost::shared_ptr<Function> function_for_address_owner_[memory_size];
+
+    // We maintain a parallel array of raw pointers here so that we have
+    // the option to allow JITted code to access it.
+    Function *function_for_address_[memory_size];
+
+    // This tracks the Function objects which contain code generated based on
+    // individual addresses, i.e. the Function objects which are invalidated by
+    // a store to a given memory location.
+    FunctionSet functions_covering_address_[memory_size];
+
+    // This tracks the Function objects which perform optimistic writes to
+    // individual addresses, i.e. the Function objects which are invalidated if
+    // it turns out an address is in fact used to hold code.
+    FunctionSet optimistic_writers_for_address_[memory_size];
+
+    // This tracks whether we have ever executed code at a given address;
+    // destroying all the functions in the corresponding element of
+    // functions_covering_address_ does *not* mean this is cleared.
+    JitBool code_at_address_[memory_size];
+};
+
+#endif
diff --git a/JitBool.h b/JitBool.h
new file mode 100644
index 0000000..818008e
--- /dev/null
+++ b/JitBool.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+// JitBool is a typedef representing the type used for boolean flags in the
+// JITted code, i.e. the CPU flag values and the 'code modified at' flag for
+// each memory address. In reality this is not likely to change, but this at
+// least helps to identify code which needs to change to support a different
+// representation. FunctionBuilder.cpp also contains a number of helper
+// functions which depend on the underlying type of JitBool.
+
+#ifndef JITBOOL_H
+#define JITBOOL_H
+#include <stdint.h> // uint8_t; included here so the header is self-contained
+
+typedef uint8_t JitBool;
+const JitBool jit_bool_false = 0;
+const JitBool jit_bool_true = 1;
+#endif
diff --git a/LLVMStuff.cpp b/LLVMStuff.cpp
new file mode 100644
index 0000000..faebdf8
--- /dev/null
+++ b/LLVMStuff.cpp
@@ -0,0 +1,41 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "LLVMStuff.h"
+
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/TargetSelect.h"
+
+LLVMStuff::LLVMStuff()
+: module_(new llvm::Module("lib6502-jit", llvm::getGlobalContext())),
+  builder_(llvm::getGlobalContext())
+{
+    llvm::InitializeNativeTarget();
+    // Build the ExecutionEngine for module_; a null result becomes an exception.
+    std::string error;
+    llvm::EngineBuilder engine_builder(module_.get());
+    execution_engine_ = engine_builder.setErrorStr(&error).create();
+    if (!execution_engine_)
+    {
+        throw std::runtime_error(
+            "Could not create LLVM ExecutionEngine: " + error);
+    }
+}
+
+LLVMStuff::~LLVMStuff()
+{ // NOTE(review): execution_engine_ (a raw pointer) is not deleted here.
+}
diff --git a/LLVMStuff.h b/LLVMStuff.h
new file mode 100644
index 0000000..7ba9d31
--- /dev/null
+++ b/LLVMStuff.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef LLVMSTUFF_H
+#define LLVMSTUFF_H
+
+#include <boost/shared_ptr.hpp>
+#include <boost/utility.hpp>
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include <stdexcept>
+
+struct LLVMStuff : boost::noncopyable
+{
+    LLVMStuff();  // throws std::runtime_error if no ExecutionEngine is created
+    ~LLVMStuff();
+
+    llvm::ExecutionEngine *execution_engine_; // raw pointer, set by constructor
+    boost::shared_ptr<llvm::Module> module_;  // the "lib6502-jit" module
+    llvm::IRBuilder<> builder_;               // built on the global LLVMContext
+
+};
+
+#endif
diff --git a/M6502Internal.h b/M6502Internal.h
new file mode 100644
index 0000000..c54131c
--- /dev/null
+++ b/M6502Internal.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef M6502INTERNAL_H
+#define M6502INTERNAL_H
+
+#include "FunctionManager.h"
+#include "lib6502.h"
+#include "LLVMStuff.h"
+#include "Registers.h"
+
+struct _M6502_Internal
+{
+    // 'explicit' stops an M6502 * silently converting to _M6502_Internal.
+    explicit _M6502_Internal(M6502 *mpu)
+    : function_manager_(mpu), mode_(M6502_ModeHybrid),
+      max_instructions_(default_max_instructions_)
+    {
+    }
+
+    Registers registers_;
+    LLVMStuff llvm_stuff_;
+    FunctionManager function_manager_; // constructed from the owning M6502
+    M6502_Mode mode_;                  // starts as M6502_ModeHybrid
+    static const int default_max_instructions_ = 500;
+    int max_instructions_;             // starts at default_max_instructions_
+};
+
+#endif
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..879e06d
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,130 @@
+ACLOCAL_AMFLAGS = -I m4
+AM_CPPFLAGS = `$(LLVMCONFIG) --cppflags` $(BOOST_CPPFLAGS)
+# lib6502.c generates spurious warnings with -Wall, so we want -Wno-parentheses
+# too. It's not easy to have per-source-file build flags in automake, so we
+# just apply this to all C files.
+AM_CFLAGS = -Wall -Wno-parentheses
+AM_CXXFLAGS = `$(LLVMCONFIG) --cxxflags` -fexceptions -Wall
+AM_LDFLAGS = $(BOOST_THREAD_LDFLAGS)
+LIBS = `$(LLVMCONFIG) --ldflags --libs core jit native --system-libs` $(BOOST_THREAD_LIBS)
+
+# Some of these are included automatically, but I'd rather be explicit.
+EXTRA_DIST = \
+	examples/README \
+	COPYING \
+	CREDITS \
+	lib6502-compatibility.txt \
+	README \
+	README.lib6502 \
+	TODO \
+	man/* \
+	test/*.xa \
+	test/*.mst \
+	test/run-c-tests.sh \
+	test/run-run6502-tests.sh \
+	test/run-c-tests.py \
+	test/run-run6502-tests.py
+
+man1_MANS = man/*.1
+man3_MANS = man/*.3
+lib_LTLIBRARIES = lib6502-jit.la
+include_HEADERS = lib6502.h
+bin_PROGRAMS = run6502
+noinst_PROGRAMS = \
+	examples/lib1
+check_PROGRAMS = \
+	test/basic-callback \
+	test/call-illegal-callback-modify-code \
+	test/irq-nmi \
+	test/setjmp-trick \
+	test/stack-code-brk \
+	test/stack-code-jsr \
+	test/write-callback-modify-code
+
+lib6502_jit_la_SOURCES = \
+	AddressRange.cpp \
+	AddressRange.h \
+	AddressSet.cpp \
+	AddressSet.h \
+	const.h \
+	Function.cpp \
+	Function.h \
+	FunctionBuilder.cpp \
+	FunctionBuilder.h \
+	FunctionManager.cpp \
+	FunctionManager.h \
+	JitBool.h \
+	lib6502.c \
+	lib6502.h \
+	lib6502-jit.cpp \
+	LLVMStuff.cpp \
+	LLVMStuff.h \
+	M6502Internal.h \
+	Registers.cpp \
+	Registers.h \
+	util.cpp \
+	util.h \
+	valgrind.h
+
+run6502_SOURCES = \
+	run6502.c
+run6502_LINK = $(CXXLINK)
+run6502_LDADD = lib6502-jit.la
+
+examples_lib1_SOURCES = \
+	examples/lib1.c
+examples_lib1_LINK = $(CXXLINK)
+examples_lib1_LDADD = lib6502-jit.la
+
+test_basic_callback_SOURCES = \
+	test/basic-callback.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_basic_callback_LINK = $(CXXLINK)
+test_basic_callback_LDADD = lib6502-jit.la
+
+test_call_illegal_callback_modify_code_SOURCES = \
+	test/call-illegal-callback-modify-code.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_call_illegal_callback_modify_code_LINK = $(CXXLINK)
+test_call_illegal_callback_modify_code_LDADD = lib6502-jit.la
+
+test_irq_nmi_SOURCES = \
+	test/irq-nmi.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_irq_nmi_LINK = $(CXXLINK)
+test_irq_nmi_LDADD = lib6502-jit.la
+
+test_setjmp_trick_SOURCES = \
+	test/setjmp-trick.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_setjmp_trick_LINK = $(CXXLINK)
+test_setjmp_trick_LDADD = lib6502-jit.la
+
+test_stack_code_brk_SOURCES = \
+	test/stack-code-brk.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_stack_code_brk_LINK = $(CXXLINK)
+test_stack_code_brk_LDADD = lib6502-jit.la
+
+test_stack_code_jsr_SOURCES = \
+	test/stack-code-jsr.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_stack_code_jsr_LINK = $(CXXLINK)
+test_stack_code_jsr_LDADD = lib6502-jit.la
+
+test_write_callback_modify_code_SOURCES = \
+	test/write-callback-modify-code.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_write_callback_modify_code_LINK = $(CXXLINK)
+test_write_callback_modify_code_LDADD = lib6502-jit.la
+
+TESTS = \
+	test/run-c-tests.sh \
+	test/run-run6502-tests.sh
diff --git a/README b/README
new file mode 100644
index 0000000..0620f77
--- /dev/null
+++ b/README
@@ -0,0 +1,84 @@
+lib6502-jit is a (mostly) compatible implementation of Ian Piumarta's lib6502
+which uses LLVM to perform JIT compilation of 6502 machine code to host code.
+This will doubtless be useful to the large community of people stuck doing
+number-crunching tasks with legacy 6502 code. :-)
+
+README.lib6502 is a copy of the original lib6502 README. You should probably go
+and read that before reading any further.
+
+lib6502-compatibility.txt documents the differences between lib6502 and
+lib6502-jit.
+
+CREDITS contains acknowledgements of the various people and groups on whose
+work lib6502-jit is built.
+
+COPYING contains license details for lib6502-jit.
+
+TODO contains some notes on possible enhancements to lib6502-jit.
+
+How to build:
+
+You'll need the following installed:
+- a C/C++ compiler (I've tested with gcc 4.7.2, gcc 4.8.2 and clang 3.5)
+- LLVM development libraries (I've tested with various 3.5 pre-release snapshots)
+- boost (including boost::thread) (I've tested with 1.49, 1.54 and 1.55)
+
+I have somewhat reluctantly set up an autotools build system; compiling and
+linking against LLVM and boost::thread on different platforms was otherwise
+just that bit too fiddly. So in theory all you need to do is:
+
+    ./configure
+    make
+
+I suggest you actually do:
+    CFLAGS='-g -O3' CXXFLAGS='-g -O3' ./configure
+to increase the optimisation level. (I would have made that the default, but
+apparently that would go against user expectations for an autotools build
+system.)
+
+"make install" should work as well if you feel inclined to do so, but it's not
+necessary.
+
+I've tested on three platforms, and for what it's worth here are more detailed
+instructions for those:
+
+Ubuntu (14.04 x86):
+    apt-get install libboost-dev libboost-thread-dev llvm-3.5-dev libedit-dev
+    export CFLAGS='-g -O3' 
+    export CXXFLAGS='-g -O3' 
+    ./configure --with-llvm-config=llvm-config-3.5
+    make
+
+Debian (7.5 x86-64):
+    apt-get install libboost-dev libboost-thread-dev 
+    [I used the llvm-3.5-dev package from the wheezy repository here: http://llvm.org/apt/]
+    export CFLAGS='-g -O3' 
+    export CXXFLAGS='-g -O3' 
+    ./configure
+    make
+
+FreeBSD (10.0-RELEASE x86-64):
+    pkg install boost-all-1.55.0
+    pkg install llvm-devel-3.5.r203994
+    export CFLAGS='-g -O3' 
+    export CXXFLAGS='-g -O3' 
+    ./configure --with-llvm-config=/usr/local/llvm-devel/bin/llvm-config
+    make
+
+There are some tests which will run if you type "make check". Some will be
+skipped unless you have the "xa" assembler
+(http://www.floodgap.com/retrotech/xa/) on your PATH.
+
+The above assumes you downloaded a lib6502-jit*tar.bz2 package, which will
+contain a "configure" script. This is not (following what I understand to be
+best practice) checked into source control, so if you downloaded the source
+using something like git or svn, you need to either:
+- download the tarball - it will be much easier, especially if you're just
+  taking a quick look at lib6502-jit and don't plan to make changes to the code
+  (yet)
+- install autoconf, automake and libtool, then cross your fingers and run
+  "autoreconf -i", which will generate a "configure" script for you if you're
+  lucky.
+
+If you have any queries, comments or bug reports, please drop me (Steven
+Flintham) an e-mail at lib6502-jit@lemma.co.uk.
diff --git a/README.lib6502 b/README.lib6502
new file mode 100644
index 0000000..b79e595
--- /dev/null
+++ b/README.lib6502
@@ -0,0 +1,136 @@
+		lib6502 - 6502 Microprocessor Emulator
+
+			Version: 1.0
+
+
+WHAT IF I'M TOO LAZY TO READ 'README'S?
+
+	make
+	make install
+	more examples/README
+
+
+WHAT IS LIB6502?
+
+  lib6502 is a library that emulates the 6502 microprocessor.  It
+  comes with a small 'shell', run6502, that can execute 6502 programs
+  from the command line.
+
+  lib6502 is distributed under the MIT license: it is non-infectious
+  and will not make your projects contagious to others the instant you
+  choose to use lib6502 in them.  See the file COPYING for details.
+
+
+WHERE IS THE LATEST SOURCE CODE?
+
+  Source code for lib6502 is available from the author's home page at
+  'http://piumarta.com/software'.  You can download the most recent
+  release or use Subversion to get the very latest sources.
+
+
+WHERE IS THE DOCUMENTATION?
+
+  Manual pages for run6502 and lib6502 (and all the functions it
+  exports) should be available once it is installed.  Each includes a
+  short 'examples' section.  Use the 'man' command to read them.
+
+  Your best place to start looking for documentation on the 6502
+  itself is 'http://6502.org'.  A google search of the web will also
+  turn up vast quantities of information about (and programs for) the
+  6502.
+
+
+HOW DO I INSTALL IT?
+
+  It's not really big enough to warrant the whole 'configure' thing.
+  Any system with an ANSI compiler and C library should be able to
+  compile it out of the box.  After unpacking the archive, just type:
+
+	make
+
+  to build it.  If the compiler blows up immediately, edit the
+  Makefile and play with the '-g' and '-O' flags and then try again.
+  If you really can't make the compiler happy you've found a bug (read
+  the next section but one).  Otherwise, if you want to put it
+  somewhere more permanent then type:
+
+	make install
+
+  (as root) to install it.  It goes into /usr/local by default; if you
+  want it elsewhere then set PREFIX in the make command.  For example:
+
+	make install PREFIX=/usr
+
+  will put everything under '/usr'.
+
+  When you get bored with it, go back to the source directory and
+  type:
+
+	make uninstall
+
+  (with the same PREFIX you specified during the install, if
+  necessary.)
+
+
+WHAT CAN I DO WITH IT?
+
+  See the file EXAMPLES for some suggestions (all of them polite).
+
+  If that leaves you wanting more, read the source for run6502 -- it
+  exercises just about every feature in lib6502.
+
+
+HOW DO I REPORT PROBLEMS?^W^WCONTACT THE ORIGINAL AUTHOR?
+
+  [If you wish to get in touch with the author of lib6502, this is the
+  address to use. Since lib6502-jit is based on lib6502 but has been
+  heavily modified, please do *not* report problems to this address;
+  use the address in README instead. -- Steve]
+
+  Send e-mail to the author at: firstName (at) lastName (dot) com
+
+  (For suitable values of firstName and lastName, see the last section
+  of this file.)
+
+  If you're still confused, contact him at: http://piumarta.com
+
+
+HOW CAN I HELP?
+
+  Use it.  Find bugs.  Fix bugs.  Make it faster.  Evangelism: spread
+  it to as many other projects as possible, especially those that
+  might be using a slower emulator!  Read the manual pages to see
+  what's considered missing, then add it, then send it in.
+
+  (One thing that would be really handy, and isn't mentioned in the
+  manual pages, is a test suite.  Figure out how to test every mode in
+  every instruction with every possible combination of operand values
+  and condition codes and verify the behaviour is correct.  Then write
+  it down in the form of a program and send it in.  If it's a
+  self-contained program that runs once to completion then we can
+  probably find some real hardware to test against the test suite.)
+
+  If you know how to write software that emulates peripheral hardware
+  devices, google up some details on the popular 6502-based
+  microcomputers (Acorn, Commodore, etc.) and add some serious system
+  emulation to run6502.  Make it all pluggable (think dynamic
+  libraries over an 'agnostic' core), so we can change machines at the
+  flip of a (command-line) switch.  (The callback mechanism in lib6502
+  was designed with this kind of 'pluggable hardware emulation' in
+  mind.)
+
+
+WHO WROTE THIS STUFF, AND WHY?
+
+  lib6502 was written by Ian Piumarta.
+
+  While writing ccg (an entirely different project that creates
+  runtime assemblers for dynamic code generators) he decided to
+  include support for an 8-bit microprocessor, just for fun.  He chose
+  the 6502 because it was used in the first computer he owned and
+  programmed (an Ohio Scientific Superboard II, when he was 14) as
+  well as the second (an Acorn 'BBC Model B', about four years later).
+  lib6502 started as a 'glorified switch statement' that ran some
+  small test programs spewed into memory by ccg, but rapidly got out
+  of control over the course of a weekend.  You're looking at the
+  result.
diff --git a/Registers.cpp b/Registers.cpp
new file mode 100644
index 0000000..7070557
--- /dev/null
+++ b/Registers.cpp
@@ -0,0 +1,59 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "Registers.h"
+
+#include "const.h"
+#include "lib6502.h"
+#include "M6502Internal.h"
+
+void Registers::to_M6502_Registers(M6502 *mpu) const
+{
+    // Copy mpu's internal register set (reached through mpu->internal rather
+    // than through this) into the public M6502_Registers at mpu->registers.
+    const Registers &src = mpu->internal->registers_;
+    M6502_Registers &dst = *(mpu->registers);
+
+    dst.pc = src.pc;
+    dst.a = src.a;
+    dst.x = src.x;
+    dst.y = src.y;
+    dst.s = src.s;
+
+    // p is rebuilt from scratch: only these six flags can end up set in it.
+    dst.p = (src.flag_n ? flagN : 0) | (src.flag_v ? flagV : 0) |
+            (src.flag_d ? flagD : 0) | (src.flag_i ? flagI : 0) |
+            (src.flag_z ? flagZ : 0) | (src.flag_c ? flagC : 0);
+}
+
+void Registers::from_M6502_Registers(const M6502 *mpu)
+{
+    // Unpack mpu->registers into mpu's internal register set (not via this).
+    const M6502_Registers &src = *(mpu->registers);
+    Registers &dst = mpu->internal->registers_;
+    dst.pc = src.pc;
+    dst.a = src.a;
+    dst.x = src.x;
+    dst.y = src.y;
+    dst.s = src.s;
+    dst.flag_n = (src.p & flagN) ? jit_bool_true : jit_bool_false;
+    dst.flag_v = (src.p & flagV) ? jit_bool_true : jit_bool_false;
+    dst.flag_d = (src.p & flagD) ? jit_bool_true : jit_bool_false;
+    dst.flag_i = (src.p & flagI) ? jit_bool_true : jit_bool_false;
+    dst.flag_z = (src.p & flagZ) ? jit_bool_true : jit_bool_false;
+    dst.flag_c = (src.p & flagC) ? jit_bool_true : jit_bool_false;
+}
diff --git a/Registers.h b/Registers.h
new file mode 100644
index 0000000..467065a
--- /dev/null
+++ b/Registers.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef REGISTERS_H 
+#define REGISTERS_H
+
+#include <boost/utility.hpp>
+#include <stdint.h>
+
+#include "JitBool.h"
+
+typedef struct _M6502 M6502;
+
+struct Registers : boost::noncopyable
+{
+    uint8_t a;
+    uint8_t x;
+    uint8_t y;
+    uint8_t s;
+    JitBool flag_n; // the six flags are packed into / unpacked from
+    JitBool flag_v; // M6502_Registers::p by the two conversion functions below
+    JitBool flag_d;
+    JitBool flag_i;
+    JitBool flag_z;
+    JitBool flag_c;
+    uint16_t pc;
+
+    // Pseudo-registers used to communicate state for callbacks; see the
+    // comment describing the Result enumeration in FunctionBuilder.h.
+    uint16_t addr;
+    uint8_t data;
+
+    void to_M6502_Registers(M6502 *mpu) const;   // internal -> *mpu->registers
+    void from_M6502_Registers(const M6502 *mpu); // *mpu->registers -> internal
+};
+
+#endif
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..d57ecb6
--- /dev/null
+++ b/TODO
@@ -0,0 +1,67 @@
+It would be interesting to see if this works OK on an ARM machine.
+
+
+Running e.g. z-self-modify-1 to completion in -mc -mx 1 mode shows the memory
+for the run6502 process grows steadily, but valgrind doesn't show any leaks. A
+quick web search suggests this might be internal leaks in LLVM (which are only
+exposed by things like this which continually JIT). I am inclined to leave this
+and perhaps come back to it once LLVM 3.5 is actually released; if there's
+still a problem then it might be worth tracking it down.
+
+
+Would it be helpful to pass branch weights to CreateCondBr()? For example,
+where we have a computed address which might trigger a read/write callback, we
+could calculate the proportion of addresses in the address range which have
+callbacks on them and use that as the probability of taking the callback-exists
+branch.
+
+
+We could potentially use Function objects to deduce properties of stretches of
+code and use that information to improve the generated code. For example, if we
+observed that a Function object didn't contain any external calls or any
+stack-modification instructions except RTS then we could inline it in any
+callers (adding its code ranges to their code ranges, of course) and the RTS
+could be a no-op. (For 100% accuracy, the JSR should still push the return
+address on the stack but not modify the stack pointer. Code executed later on
+might peek at the stack and expect those values to be there.) This might in
+turn allow the callers of that Function to be inlined themselves. This is just
+an example. It may be that in practice deciding when to re-translate code would
+cause a sufficient performance impact to just not be worth it in the first
+place.
+
+
+We could add support for counting the number of cycles executed by the JITted
+code; lib6502 itself has some support for this in the form of the tick* macros,
+but they don't do anything by default.
+
+
+Would there be any performance improvement to be had by having Function objects
+(tail) call one another where possible?
+
+
+Hybrid mode currently makes no attempt to avoid re-generating Function objects
+which are continually being invalidated due to self-modifying code. It might be
+nice if some heuristic caused us to avoid this unnecessary work and just let
+the interpreter always handle that code.
+
+On a related but distinct note, currently once an element of
+FunctionManager::code_at_address_ is set, it is never cleared. This might cause
+us to avoid optimistic writes which in reality would be OK. We could use some
+heuristic to decide when to destroy Function objects which have not been
+executed in a long time, and start clearing code_at_address_ elements when all
+functions covering an address are removed. (See the note in
+FunctionManager::destroy_function(); this clearing must be done *outside* the
+loop in FunctionManager::buildFunction(), or the implementation of
+buildFunction() must be tweaked.)
+
+However, it may be that it just isn't worth being that clever. Any such code
+would need to be triggered inside the main loop between executions of Function
+objects. We could do it only every nth time, and keeping track of how many
+times we've been round probably wouldn't significantly harm performance, but be
+careful.
+
+
+Would a different default value for max_instructions be better?
+
+
+Are there any other LLVM optimisation passes which would be helpful?
diff --git a/build-aux/tap-driver.sh b/build-aux/tap-driver.sh
new file mode 100755
index 0000000..c011298
--- /dev/null
+++ b/build-aux/tap-driver.sh
@@ -0,0 +1,649 @@
+#! /bin/sh
+# Copyright (C) 2011 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# This file is maintained in Automake, please report
+# bugs to <bug-automake@gnu.org> or send patches to
+# <automake-patches@gnu.org>.
+
+scriptversion=2011-12-27.17; # UTC
+
+# Make unconditional expansion of undefined variables an error.  This
+# helps a lot in preventing typo-related bugs.
+set -u
+
+me=tap-driver.sh
+
+fatal ()
+{ # Print "$me: fatal: <args>" on stderr, then exit with status 1.
+  echo "$me: fatal: $*" >&2
+  exit 1
+}
+
+usage_error ()
+{ # Print "$me: <args>" and the usage text on stderr, then exit with status 2.
+  echo "$me: $*" >&2
+  print_usage >&2
+  exit 2
+}
+
+print_usage ()
+{ # Write the usage text to stdout; usage_error redirects it to stderr.
+  cat <<END
+Usage:
+  tap-driver.sh --test-name=NAME --log-file=PATH --trs-file=PATH
+                [--expect-failure={yes|no}] [--color-tests={yes|no}]
+                [--enable-hard-errors={yes|no}] [--ignore-exit]
+                [--diagnostic-string=STRING] [--merge|--no-merge]
+                [--comments|--no-comments] [--] TEST-COMMAND
+The \`--test-name', \`--log-file' and \`--trs-file' options are mandatory.
+END
+}
+
+# TODO: better error handling in option parsing (in particular, ensure
+# TODO: $log_file, $trs_file and $test_name are defined).
+test_name= # Used for reporting.
+log_file=  # Where to save the result and output of the test script.
+trs_file=  # Where to save the metadata of the test run.
+expect_failure=0
+color_tests=0
+merge=0
+ignore_exit=0
+comments=0
+diag_string='#'
+while test $# -gt 0; do  # consume options until "--" or no arguments remain
+  case $1 in
+  --help) print_usage; exit $?;;
+  --version) echo "$me $scriptversion"; exit $?;;
+  --test-name) test_name=$2; shift;;  # options with a value shift once more
+  --log-file) log_file=$2; shift;;
+  --trs-file) trs_file=$2; shift;;
+  --color-tests) color_tests=$2; shift;;
+  --expect-failure) expect_failure=$2; shift;;
+  --enable-hard-errors) shift;; # No-op.
+  --merge) merge=1;;
+  --no-merge) merge=0;;
+  --ignore-exit) ignore_exit=1;;
+  --comments) comments=1;;
+  --no-comments) comments=0;;
+  --diagnostic-string) diag_string=$2; shift;;
+  --) shift; break;;  # whatever is left in "$@" is the test command
+  -*) usage_error "invalid option: '$1'";;
+  esac
+  shift  # NOTE: an argument not starting with "-" matches nothing and is dropped
+done
+
+test $# -gt 0 || usage_error "missing test command"
+
+case $expect_failure in
+  yes) expect_failure=1;;
+    *) expect_failure=0;;
+esac
+
+if test $color_tests = yes; then
+  init_colors='
+    color_map["red"]="[0;31m" # Red.
+    color_map["grn"]="[0;32m" # Green.
+    color_map["lgn"]="[1;32m" # Light green.
+    color_map["blu"]="[1;34m" # Blue.
+    color_map["mgn"]="[0;35m" # Magenta.
+    color_map["std"]="[m"     # No color.
+    color_for_result["ERROR"] = "mgn"
+    color_for_result["PASS"]  = "grn"
+    color_for_result["XPASS"] = "red"
+    color_for_result["FAIL"]  = "red"
+    color_for_result["XFAIL"] = "lgn"
+    color_for_result["SKIP"]  = "blu"'
+else
+  init_colors=''
+fi
+
+{
+  (
+    # Ignore common signals (in this subshell only!), to avoid potential
+    # problems with Korn shells.  Some Korn shells are known to propagate
+    # to themselves signals that have killed a child process they were
+    # waiting for; this is done at least for SIGINT (and usually only for
+    # it, in truth).  Without the `trap' below, such a behaviour could
+    # cause a premature exit in the current subshell, e.g., in case the
+    # test command it runs gets terminated by a SIGINT.  Thus, the awk
+    # script we are piping into would never see the exit status it
+    # expects on its last input line (which is displayed below by the
+    # last `echo $?' statement), and would thus die reporting an internal
+    # error.
+    # For more information, see the Autoconf manual and the threads:
+    # <http://lists.gnu.org/archive/html/bug-autoconf/2011-09/msg00004.html>
+    # <http://mail.opensolaris.org/pipermail/ksh93-integration-discuss/2009-February/004121.html>
+    trap : 1 3 2 13 15
+    if test $merge -gt 0; then
+      exec 2>&1
+    else
+      exec 2>&3
+    fi
+    "$@"
+    echo $?
+  ) | LC_ALL=C ${AM_TAP_AWK-awk} \
+        -v me="$me" \
+        -v test_script_name="$test_name" \
+        -v log_file="$log_file" \
+        -v trs_file="$trs_file" \
+        -v expect_failure="$expect_failure" \
+        -v merge="$merge" \
+        -v ignore_exit="$ignore_exit" \
+        -v comments="$comments" \
+        -v diag_string="$diag_string" \
+'
+# FIXME: the usages of "cat >&3" below could be optimized when using
+# FIXME: GNU awk, and/or on systems that support /dev/fd/.
+
+# Implementation note: in what follows, `result_obj` will be an
+# associative array that (partly) simulates a TAP result object
+# from the `TAP::Parser` perl module.
+
+## ----------- ##
+##  FUNCTIONS  ##
+## ----------- ##
+
+function fatal(msg)
+{ # Print "<me>: <msg>" on stderr (through "cat >&2"), then exit with status 1.
+  print me ": " msg | "cat >&2"
+  exit 1
+}
+
+function abort(where)
+{ # Report an internal error via fatal(); "where" says where it was detected.
+  fatal("internal error " where)
+}
+
+# Convert a boolean to a "yes"/"no" string.
+function yn(bool)
+{ # Any value awk treats as true yields "yes"; everything else yields "no".
+  return bool ? "yes" : "no";
+}
+
+function add_test_result(result)
+{ # Append result to test_results_list and flag it in test_results_seen.
+  if (!test_results_index)  # first call: start the list index at 0
+    test_results_index = 0
+  test_results_list[test_results_index] = result
+  test_results_index += 1
+  test_results_seen[result] = 1;
+}
+
+# Whether the test script should be re-run by "make recheck".
+function must_recheck()
+{ # Returns 1 if any recorded result is something other than XFAIL/PASS/SKIP.
+  for (k in test_results_seen)
+    if (k != "XFAIL" && k != "PASS" && k != "SKIP")
+      return 1
+  return 0
+}
+
+# Whether the content of the log file associated to this test should
+# be copied into the "global" test-suite.log.
+function copy_in_global_log()
+{ # Returns 1 as soon as any recorded result is not PASS, otherwise 0.
+  for (k in test_results_seen)
+    if (k != "PASS")
+      return 1
+  return 0
+}
+
+# FIXME: this can certainly be improved ...
+function get_global_test_result()
+{ # Precedence: ERROR, then FAIL (also for XPASS), then SKIP, then PASS.
+    if ("ERROR" in test_results_seen)
+      return "ERROR"
+    if ("FAIL" in test_results_seen || "XPASS" in test_results_seen)
+      return "FAIL"
+    all_skipped = 1  # stays 1 when no results were recorded at all
+    for (k in test_results_seen)
+      if (k != "SKIP")
+        all_skipped = 0
+    if (all_skipped)
+      return "SKIP"
+    return "PASS";
+}
+
+# Turn a TAP result object into the result string to be reported: ERROR,
+# XPASS/XFAIL for TODO tests, SKIP, or the "cooked" PASS/FAIL values.
+function stringify_result_obj(result_obj)
+{
+  # Unplanned, out-of-order and after-late-plan results are all errors.
+  if (result_obj["is_unplanned"] || result_obj["number"] != testno ||
+      plan_seen == LATE_PLAN)
+    return "ERROR"
+
+  if (result_obj["directive"] == "TODO")
+    return result_obj["is_ok"] ? "XPASS" : "XFAIL"
+  else if (result_obj["directive"] == "SKIP")
+    return result_obj["is_ok"] ? "SKIP" : COOKED_FAIL
+  else if (length(result_obj["directive"]))
+    abort("in function stringify_result_obj()")
+
+  # Plain result, without any directive.
+  return result_obj["is_ok"] ? COOKED_PASS : COOKED_FAIL
+}
+
+# Wrap `result` between the `color_map` entries for its color and for
+# "std"; when no color is associated to it, return it unchanged.
+function decorate_result(result)
+{
+  color_name = color_for_result[result]
+  if (!color_name)
+    return result
+  return color_map[color_name] "" result "" color_map["std"]
+}
+
+# Print a line made of `result`, the test script name and `details` on
+# standard output, and a plain copy of it on file descriptor 3; `result`
+# is either a test result, which gets recorded too, or "#".
+function report(result, details)
+{
+  if (result == "#")
+    msg = " " test_script_name ":"
+  else if (result ~ /^(X?(PASS|FAIL)|SKIP|ERROR)/)
+    {
+      msg = ": " test_script_name
+      add_test_result(result)
+    }
+  else
+    abort("in function report()")
+
+  if (length(details) > 0)
+    msg = msg " " details
+  # The copy meant for the console might be colorized ...
+  print decorate_result(result) msg
+  # ... while the one for the log file, there to help debugging (this is
+  # especially true for a TAP error or a "Bail out!"), never is.
+  print result msg | "cat >&3"
+}
+
+# Report `error_message` as an "ERROR" result.
+function testsuite_error(error_message) {
+  report("ERROR", "- " error_message)
+}
+
+# Report the TAP result held in the global `result_obj`, with "details"
+# made of its number and description, plus a note telling about an
+# anomaly (late plan, unplanned or out-of-order test) or a directive.
+function handle_tap_result()
+{
+  # Every result is identified by its number and optional description.
+  details = result_obj["number"]
+  if (length(result_obj["description"]))
+    details = details " " result_obj["description"]
+
+  # At most one note is appended, anomalies winning over directives.
+  if (plan_seen == LATE_PLAN)
+    details = details " # AFTER LATE PLAN"
+  else if (result_obj["is_unplanned"])
+    details = details " # UNPLANNED"
+  else if (result_obj["number"] != testno)
+    details = sprintf("%s # OUT-OF-ORDER (expecting %d)",
+                      details, testno)
+  else if (result_obj["directive"])
+    {
+      details = details " # " result_obj["directive"]
+      if (length(result_obj["explanation"]))
+        details = details " " result_obj["explanation"]
+    }
+
+  # The result string proper is worked out by stringify_result_obj().
+  report(stringify_result_obj(result_obj), details)
+}
+
+# Handle a TAP plan line announcing `planned` tests; `skip_reason`, the
+# explanation of a "SKIP" plan, should be empty whenever planned > 0.
+function handle_tap_plan(planned, skip_reason)
+{
+  planned = planned + 0 # So that, say, a `planned` of "00" becomes 0.
+  if (planned > 0 && length(skip_reason))
+    abort("in function handle_tap_plan()")
+  # Only one plan per stream is acceptable: any further one is an error.
+  if (plan_seen)
+    {
+      testsuite_error("multiple test plans")
+      return
+    }
+  planned_tests = planned
+
+  # The TAP plan can come before or after *all* the TAP results; we speak
+  # respectively of an "early" or a "late" plan.  A plan line showing up
+  # after at least one TAP result is assumed to be a late plan, and any
+  # test result seen after it will be flagged as an error.
+  if (testno >= 1)
+    plan_seen = LATE_PLAN
+  else
+    plan_seen = EARLY_PLAN
+
+  # A zero-tests plan seen before any test result is a valid "plan with
+  # SKIP" specification, to be reported as a particular kind of SKIP
+  # result.  If instead testno > 0, we have a "too many tests run" error
+  # that is dealt with once the whole input has been read, so there is
+  # nothing to do about it here.
+  if (planned == 0 && testno == 0)
+    report("SKIP", (length(skip_reason) ? "- " skip_reason : ""))
+}
+
+# If `line` starts with `diag_string`, return the rest of it stripped of
+# surrounding whitespace; otherwise return the empty string.
+function extract_tap_comment(line)
+{
+  if (index(line, diag_string) != 1)
+    return ""
+  # Drop the leading `diag_string` ...
+  line = substr(line, length(diag_string) + 1)
+  # ... and any leading and trailing whitespace left.
+  sub("^[ \t]*", "", line)
+  sub("[ \t]*$", "", line)
+  # Whatever is left (maybe nothing) is the comment.
+  return line
+}
+
+# Fill the global `result_obj` from `line`, which the caller knows to be a
+# TAP result line, i.e. one matching the (perl) RE "^(not )?ok\b".
+function setup_result_obj(line)
+{
+  # Get the result, and remove it from the line.
+  result_obj["is_ok"] = (substr(line, 1, 2) == "ok" ? 1 : 0)
+  sub("^(not )?ok[ \t]*", "", line)
+
+  # If the result has an explicit number, get it and strip it; otherwise,
+  # automatically assign the next progressive number to it.
+  if (line ~ /^[0-9]+$/ || line ~ /^[0-9]+[^a-zA-Z0-9_]/)
+    {
+      match(line, "^[0-9]+")
+      # The final `+ 0` is to normalize numbers with leading zeros.
+      result_obj["number"] = substr(line, 1, RLENGTH) + 0
+      line = substr(line, RLENGTH + 1)
+    }
+  else
+    {
+      result_obj["number"] = testno
+    }
+
+  if (plan_seen == LATE_PLAN)
+    # No further test results are acceptable after a "late" TAP plan
+    # has been seen.
+    result_obj["is_unplanned"] = 1
+  else if (plan_seen && testno > planned_tests)
+    result_obj["is_unplanned"] = 1
+  else
+    result_obj["is_unplanned"] = 0
+
+  # Strip trailing and leading whitespace.
+  sub("^[ \t]*", "", line)
+  sub("[ \t]*$", "", line)
+
+  # This will have to be corrected if we have a "TODO"/"SKIP" directive.
+  result_obj["description"] = line
+  result_obj["directive"] = ""
+  result_obj["explanation"] = ""
+
+  if (index(line, "#") == 0)
+    return # No possible directive, nothing more to do.
+
+  # Directives are case-insensitive.
+  rx = "[ \t]*#[ \t]*([tT][oO][dD][oO]|[sS][kK][iI][pP])[ \t]*"
+
+  # See whether we have the directive, and if yes, where.
+  pos = match(line, rx "$")
+  if (!pos)
+    pos = match(line, rx "[^a-zA-Z0-9_]")
+
+  # If there was no TAP directive, we have nothing more to do.
+  if (!pos)
+    return
+
+  # Let`s now see if the TAP directive has been escaped.  For example:
+  #  escaped:     ok \# SKIP
+  #  not escaped: ok \\# SKIP
+  #  escaped:     ok \\\\\# SKIP
+  #  not escaped: ok \ # SKIP
+  if (substr(line, pos, 1) == "#")
+    {
+      bslash_count = 0
+      for (i = pos; i > 1 && substr(line, i - 1, 1) == "\\"; i--)
+        bslash_count += 1
+      if (bslash_count % 2)
+        return # Directive was escaped.
+    }
+
+  # Strip the directive and its explanation (if any) from the test
+  # description.
+  result_obj["description"] = substr(line, 1, pos - 1)
+  # Now remove the test description from the line, that has been dealt
+  # with already.
+  line = substr(line, pos)
+  # Strip the directive, and save its value (normalized to upper case).
+  sub("^[ \t]*#[ \t]*", "", line)
+  result_obj["directive"] = toupper(substr(line, 1, 4))
+  line = substr(line, 5)
+  # Now get the explanation for the directive (if any), with leading
+  # and trailing whitespace removed.
+  sub("^[ \t]*", "", line)
+  sub("[ \t]*$", "", line)
+  result_obj["explanation"] = line
+}
+
+# Describe a non-zero exit `status` of the test script ("" if it is zero).
+function get_test_exit_message(status)
+{
+  if (status == 0)
+    return ""
+  if (status !~ /^[1-9][0-9]*$/)
+    abort("getting exit status")
+  if (status < 127)
+    exit_details = ""
+  else if (status == 127)
+    exit_details = " (command not found?)"
+  else if (status >= 128 && status <= 255)
+    exit_details = sprintf(" (terminated by signal %d?)", status - 128)
+  else if (status > 256 && status <= 384)
+    # Some Korn shells, when a child process dies due to signal number n,
+    # can leave in $? an exit status of 256+n instead of the more standard
+    # 128+n.  Apparently, both behaviours are allowed by POSIX (2008), so
+    # be prepared to handle them both.  See also Austin Group report ID
+    # 0000051 <http://www.austingroupbugs.net/view.php?id=51>
+    exit_details = sprintf(" (terminated by signal %d?)", status - 256)
+  else
+    # Never seen in practice.
+    exit_details = " (abnormal termination)"
+  return sprintf("exited with status %d%s", status, exit_details)
+}
+
+# Write the metadata and the list of test results into `trs_file`.
+function write_test_results() {
+  printf(":global-test-result: %s\n", get_global_test_result()) > trs_file
+  printf(":recheck: %s\n", yn(must_recheck())) > trs_file
+  printf(":copy-in-global-log: %s\n", yn(copy_in_global_log())) > trs_file
+  for (i = 0; i < test_results_index; i++)
+    printf(":test-result: %s\n", test_results_list[i]) > trs_file
+  close(trs_file)
+}
+
+BEGIN {
+
+## ------- ##
+##  SETUP  ##
+## ------- ##
+
+'"$init_colors"'
+
+# Properly initialized once the TAP plan is seen.
+planned_tests = 0
+
+COOKED_PASS = expect_failure ? "XPASS": "PASS";
+COOKED_FAIL = expect_failure ? "XFAIL": "FAIL";
+
+# Enumeration-like constants to remember which kind of plan (if any)
+# has been seen.  It is important that NO_PLAN evaluates "false" as
+# a boolean.
+NO_PLAN = 0
+EARLY_PLAN = 1
+LATE_PLAN = 2
+
+testno = 0     # Number of test results seen so far.
+bailed_out = 0 # Whether a "Bail out!" directive has been seen.
+
+# Whether the TAP plan has been seen or not, and if yes, which kind
+# it is ("early" is seen before any test result, "late" otherwise).
+plan_seen = NO_PLAN
+
+## --------- ##
+##  PARSING  ##
+## --------- ##
+
+is_first_read = 1
+
+while (1)
+  {
+    # Involutions required so that we are able to read the exit status
+    # from the last input line.
+    st = getline
+    if (st < 0) # I/O error.
+      fatal("I/O error while reading from input stream")
+    else if (st == 0) # End-of-input
+      {
+        if (is_first_read)
+          abort("in input loop: only one input line")
+        break
+      }
+    if (is_first_read)
+      {
+        is_first_read = 0
+        nextline = $0
+        continue
+      }
+    else
+      {
+        curline = nextline
+        nextline = $0
+        $0 = curline
+      }
+    # Copy any input line verbatim into the log file.
+    print | "cat >&3"
+    # Parsing of TAP input should stop after a "Bail out!" directive.
+    if (bailed_out)
+      continue
+
+    # TAP test result.
+    if ($0 ~ /^(not )?ok$/ || $0 ~ /^(not )?ok[^a-zA-Z0-9_]/)
+      {
+        testno += 1
+        setup_result_obj($0)
+        handle_tap_result()
+      }
+    # TAP plan (normal or "SKIP" without explanation).
+    else if ($0 ~ /^1\.\.[0-9]+[ \t]*$/)
+      {
+        # The next two lines will put the number of planned tests in $0.
+        sub("^1\\.\\.", "")
+        sub("[^0-9]*$", "")
+        handle_tap_plan($0, "")
+        continue
+      }
+    # TAP "SKIP" plan, with an explanation.
+    else if ($0 ~ /^1\.\.0+[ \t]*#/)
+      {
+        # The next lines will put the skip explanation in $0, stripping
+        # any leading and trailing whitespace.  This is a little more
+        # tricky in truth, since we want to also strip a potential leading
+        # "SKIP" string from the message.
+        sub("^[^#]*#[ \t]*(SKIP[: \t][ \t]*)?", "")
+        sub("[ \t]*$", "");
+        handle_tap_plan(0, $0)
+      }
+    # "Bail out!" magic.
+    # Older versions of prove and TAP::Harness (e.g., 3.17) did not
+    # recognize a "Bail out!" directive when preceded by leading
+    # whitespace, but more modern versions (e.g., 3.23) do.  So we
+    # emulate the latter, "more modern" behaviour.
+    else if ($0 ~ /^[ \t]*Bail out!/)
+      {
+        bailed_out = 1
+        # Get the bailout message (if any), with leading and trailing
+        # whitespace stripped.  The message remains stored in `$0`.
+        sub("^[ \t]*Bail out![ \t]*", "");
+        sub("[ \t]*$", "");
+        # Format the error message to be reported.
+        bailout_message = "Bail out!"
+        if (length($0))
+          bailout_message = bailout_message " " $0
+        testsuite_error(bailout_message)
+      }
+    # Maybe we have to look for diagnostic comments too.
+    else if (comments != 0)
+      {
+        comment = extract_tap_comment($0);
+        if (length(comment))
+          report("#", comment);
+      }
+  }
+
+## -------- ##
+##  FINISH  ##
+## -------- ##
+
+# A "Bail out!" directive should cause us to ignore any following TAP
+# error, as well as a non-zero exit status from the TAP producer.
+if (!bailed_out)
+  {
+    if (!plan_seen)
+      {
+        testsuite_error("missing test plan")
+      }
+    else if (planned_tests != testno)
+      {
+        bad_amount = testno > planned_tests ? "many" : "few"
+        testsuite_error(sprintf("too %s tests run (expected %d, got %d)",
+                                bad_amount, planned_tests, testno))
+      }
+    if (!ignore_exit)
+      {
+        # Fetch exit status from the last line.
+        exit_message = get_test_exit_message(nextline)
+        if (exit_message)
+          testsuite_error(exit_message)
+      }
+  }
+
+write_test_results()
+
+exit 0
+
+} # End of "BEGIN" block.
+'
+
+# TODO: document that we consume the file descriptor 3 :-(
+} 3>"$log_file"
+
+test $? -eq 0 || fatal "I/O or internal error"
+
+# Local Variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-time-zone: "UTC"
+# time-stamp-end: "; # UTC"
+# End:
diff --git a/config.h.in b/config.h.in
new file mode 100644
index 0000000..5fa9546
--- /dev/null
+++ b/config.h.in
@@ -0,0 +1,89 @@
+/* config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* Defined if the requested minimum BOOST version is satisfied */
+#undef HAVE_BOOST
+
+/* Define to 1 if you have <boost/scoped_ptr.hpp> */
+#undef HAVE_BOOST_SCOPED_PTR_HPP
+
+/* Define to 1 if you have <boost/shared_ptr.hpp> */
+#undef HAVE_BOOST_SHARED_PTR_HPP
+
+/* Define to 1 if you have <boost/system/error_code.hpp> */
+#undef HAVE_BOOST_SYSTEM_ERROR_CODE_HPP
+
+/* Define to 1 if you have <boost/thread.hpp> */
+#undef HAVE_BOOST_THREAD_HPP
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#undef HAVE_DLFCN_H
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#undef HAVE_INTTYPES_H
+
+/* Set to 1 if you have the "llvm/Analysis/Verifier.h" header file */
+#undef HAVE_LLVM_ANALYSIS_VERIFIER_H
+
+/* Set to 1 if you have the llvm::DataLayoutPass class */
+#undef HAVE_LLVM_DATA_LAYOUT_PASS
+
+/* Set to 1 if you have the "llvm/IR/Verifier.h" header file */
+#undef HAVE_LLVM_IR_VERIFIER_H
+
+/* Define to 1 if you have the <memory.h> header file. */
+#undef HAVE_MEMORY_H
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#undef HAVE_STDINT_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#undef HAVE_STDLIB_H
+
+/* Define to 1 if you have the <strings.h> header file. */
+#undef HAVE_STRINGS_H
+
+/* Define to 1 if you have the <string.h> header file. */
+#undef HAVE_STRING_H
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#undef HAVE_SYS_STAT_H
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#undef HAVE_SYS_TYPES_H
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#undef HAVE_UNISTD_H
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
+#undef LT_OBJDIR
+
+/* Name of package */
+#undef PACKAGE
+
+/* Define to the address where bug reports for this package should be sent. */
+#undef PACKAGE_BUGREPORT
+
+/* Package copyright */
+#undef PACKAGE_COPYRIGHT
+
+/* Define to the full name of this package. */
+#undef PACKAGE_NAME
+
+/* Define to the full name and version of this package. */
+#undef PACKAGE_STRING
+
+/* Define to the one symbol short name of this package. */
+#undef PACKAGE_TARNAME
+
+/* Define to the home page for this package. */
+#undef PACKAGE_URL
+
+/* Define to the version of this package. */
+#undef PACKAGE_VERSION
+
+/* Define to 1 if you have the ANSI C header files. */
+#undef STDC_HEADERS
+
+/* Version number of package */
+#undef VERSION
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..46bd45b
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,94 @@
+AC_INIT([lib6502-jit], [1.0], [lib6502-jit@lemma.co.uk])
+AC_CONFIG_AUX_DIR([build-aux])
+AC_CONFIG_MACRO_DIR([m4])
+AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects no-dist-gzip dist-bzip2])
+AM_MAINTAINER_MODE([enable])
+LT_INIT([disable-shared])
+AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_FILES([Makefile])
+AC_REQUIRE_AUX_FILE([tap-driver.sh])
+
+# Copyright for configure.ac *only*
+AC_COPYRIGHT([Copyright (c) 2014 Steven Flintham])
+
+AC_DEFINE([PACKAGE_COPYRIGHT], ["(C) - see COPYING"], [Package copyright])
+
+# for tap-driver.sh
+AC_PROG_AWK
+
+AC_PROG_CC
+AC_PROG_CXX
+
+BOOST_REQUIRE
+BOOST_SMART_PTR
+BOOST_THREAD
+
+# I want to:
+# - use "llvm-config" (relying on PATH) if the user doesn't do anything
+#   special, but
+# - allow the user to say --with-llvm-config=XXX to use XXX instead of
+#   llvm-config, where XXX might need to be found on the PATH (e.g. if
+#   the program is called llvm-config-3.5) or might be an absolute/
+#   relative filename
+# In both of the above cases, I want to actually check explicitly the
+# llvm-config program can be found. This doesn't seem to be supported by
+# autoconf:
+# - AC_CHECK_PROG() and AC_PATH_PROG() both insist on the program name being a
+#   leaf name with no included path.
+# - AC_CHECK_FILE() (not unreasonably) doesn't look on PATH for the file
+#   (and wouldn't check for executability)
+# So I just hack it with "which" and hope, reporting like any other check.
+AC_ARG_WITH(
+	[llvm-config],
+	[AS_HELP_STRING(
+		[--with-llvm-config=FILE],
+		[filename of llvm-config executable (if not on PATH)])],
+	[LLVMCONFIG="$withval"],
+	[LLVMCONFIG="llvm-config"])
+AC_MSG_CHECKING([for $LLVMCONFIG])
+AS_IF(
+	[which "$LLVMCONFIG" >/dev/null 2>&1],
+	[AC_MSG_RESULT([yes])],
+	[AC_MSG_RESULT([no])
+	 AC_MSG_ERROR([llvm-config not found; try --with-llvm-config=FILE?])])
+
+AC_SUBST([LLVMCONFIG])
+
+# These variables are sacred to the user. But we need to set them in order for
+# configure's test programs to find the LLVM headers. I am probably doing this
+# completely wrong. In twenty years or so maybe I will achieve auto-enlightenment
+# and look back at this and laugh.
+SACRED_CPPFLAGS="$CPPFLAGS"
+SACRED_CXXFLAGS="$CXXFLAGS"
+
+CPPFLAGS=["`$LLVMCONFIG --cppflags` $CPPFLAGS"]
+CXXFLAGS=["`$LLVMCONFIG --cxxflags` -fexceptions $CXXFLAGS"]
+
+AC_LANG(C++)
+
+# This header moves around a bit, check for the two known possible locations
+# and fail right away if neither is there, rather than at build time.
+AC_CHECK_HEADER(
+	[llvm/IR/Verifier.h],
+	[AC_DEFINE([HAVE_LLVM_IR_VERIFIER_H], 1, [Set to 1 if you have the "llvm/IR/Verifier.h" header file])])
+AC_CHECK_HEADER(
+	[llvm/Analysis/Verifier.h],
+	[AC_DEFINE([HAVE_LLVM_ANALYSIS_VERIFIER_H], 1, [Set to 1 if you have the "llvm/Analysis/Verifier.h" header file])])
+AS_IF([test "x$ac_cv_header_llvm_IR_Verifier_h$ac_cv_header_llvm_Analysis_Verifier_h" = xnono],
+	[AC_MSG_ERROR([found neither llvm/IR/Verifier.h nor llvm/Analysis/Verifier.h])])
+
+# This header always exists, but DataLayoutPass isn't always present.
+AC_CHECK_HEADER(
+	[llvm/IR/DataLayout.h],
+	[],
+	[AC_MSG_ERROR([llvm/IR/DataLayout.h not found])])
+AC_CHECK_TYPE(
+	[llvm::DataLayoutPass],
+	[AC_DEFINE([HAVE_LLVM_DATA_LAYOUT_PASS], 1, [Set to 1 if you have the llvm::DataLayoutPass class])],
+	[],
+	[#include "llvm/IR/DataLayout.h"])
+
+CPPFLAGS="$SACRED_CPPFLAGS"
+CXXFLAGS="$SACRED_CXXFLAGS"
+
+AC_OUTPUT
diff --git a/const.h b/const.h
new file mode 100644
index 0000000..c2bbdfd
--- /dev/null
+++ b/const.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef CONST_H
+#define CONST_H
+
+#include <stdint.h>
+
+namespace
+{
+    /* Opcode bytes, in numerical order. */
+    const uint8_t opcode_brk          = 0x00;
+    const uint8_t opcode_bpl          = 0x10;
+    const uint8_t opcode_jsr          = 0x20;
+    const uint8_t opcode_bmi          = 0x30;
+    const uint8_t opcode_rti          = 0x40;
+    const uint8_t opcode_jmp_abs      = 0x4c;
+    const uint8_t opcode_bvc          = 0x50;
+    const uint8_t opcode_rts          = 0x60;
+    const uint8_t opcode_jmp_ind_abs  = 0x6c;
+    const uint8_t opcode_bvs          = 0x70;
+    const uint8_t opcode_jmp_indx_abs = 0x7c;
+    const uint8_t opcode_bra          = 0x80;
+    const uint8_t opcode_bcc          = 0x90;
+    const uint8_t opcode_bcs          = 0xb0;
+    const uint8_t opcode_bne          = 0xd0;
+    const uint8_t opcode_beq          = 0xf0;
+
+    enum {
+      flagC = 0x01,  /* carry         */
+      flagZ = 0x02,  /* zero          */
+      flagI = 0x04,  /* irq disable   */
+      flagD = 0x08,  /* decimal mode  */
+      flagB = 0x10,  /* irq from brk  */
+      flagX = 0x20,  /* unused        */
+      flagV = 0x40,  /* overflow      */
+      flagN = 0x80   /* negative      */
+    };
+    const uint32_t memory_size = 0x10000;  /* bytes (64K) */
+    const uint16_t stack = 0x100;  /* presumably the stack page base address */
+}
+
+#endif
diff --git a/examples/README b/examples/README
new file mode 100644
index 0000000..e22418c
--- /dev/null
+++ b/examples/README
@@ -0,0 +1,406 @@
+lib6502 - 6502 Microprocessor Emulator
+
+EXAMPLES
+
+  This file has three sections:
+
+    1. PROGRAMS that you can compile and run
+    2. COMMANDS that you can copy and paste into a terminal
+    3. ADVANCED stuff that requires some additional setup
+
+  A few numbered footnotes appear at the end and are referenced in the
+  text in square brackets [6].
+
+----------------------------------------------------------------
+
+1.  PROGRAMS
+
+  (We're going to start in 'serious mode'.  Bear with me.)
+
+  The file 'lib1.c' contains the example from the run6502 manual page.
+  Just compile and run it:
+
+        cc -o lib1 lib1.c
+        ./lib1
+
+  The file has been commented extensively to explain exactly what is
+  going on.
+
+----------------------------------------------------------------
+
+2.  COMMANDS
+
+  (Much more fun: this is the section that appeals to the geek in me.)
+
+  6502 machine code is pretty straightforward.  (Many 6502 programmers
+  remember a time from their misguided childhood when they could
+  compose and edit programs directly in hexadecimal using their 'front
+  panel' monitor program -- the next best thing to programming with a
+  row of switches and lamps, but I digress and will leave that story
+  until the pdp11 emulator is ready. ;-)  We can use this fact to
+  generate an entire program without needing an assembler.  The 'perl'
+  program is available on most Unixy (and several other) systems and
+  makes it easy to create binary files from a string of hex digits.
+  (There is a program called 'xxd' that's very good at this kind of
+  thing, but you might not have it.)
+
+  First the program (stolen from lib1.c):
+
+        1000    ldx #41         A241
+        1002    txa             8A
+        1003    jsr FFEE        20EEFF
+        1006    inx             E8
+        1007    cpx #5B         E05B
+        1009    bne 1002        D0F7
+        100B    lda #0A         A90A
+        100D    jsr FFEE        20EEFF
+        1010    brk             00
+
+  In C-like syntax it is equivalent to:
+
+        regX = 'A';
+        do {
+          regA = regX;
+          putchar(regA);
+        } while (++regX != 'Z' + 1);
+        putchar('\n');
+
+  (which by today's standards is a *huge* amount of stuff packed into
+  just 17 bytes of 'compiled' code -- on a 386 the same program is
+  around 65 bytes [1], and more like 88 bytes on a 32-bit RISC [2]).
+
+  The column on the right is the machine code in hexadecimal.  When
+  strung out in a line it looks like this:
+
+        A2418A20EEFFE8E05BD0F7A90A20EEFF00
+
+  We can tell perl to 'pack' this hexadecimal string into binary and
+  save the output in a file:
+
+        echo A2418A20EEFFE8E05BD0F7A90A20EEFF00 |
+        perl -e 'print pack "H*",<STDIN>' > temp.img
+
+  To check the contents of the file, we can load it into run6502 and
+  then disassemble it:
+
+        run6502 -l 1000 temp.img -d 1000 +11 -x
+
+  The '-l 1000 temp.img' loads the file into the 6502's memory at
+  address 0x1000, and the '-d 1000 +11' disassembles 17 bytes (11 in
+  hex) of code starting at 0x1000.  The final '-x' tells run6502 not
+  to try to execute the code.  The output should look just like the
+  program listing above.
+
+  This is almost all we need to run it; just a few details remain.
+
+    - The emulator doesn't know where to start execution.  We need to
+      set the 'reset' vector to 0x1000 -- the address of the first
+      instruction in the program.  The '-R 1000' option does this.
+
+    - The program calls the 'putchar' function at address 0xFFEE to
+      send a character to the terminal.  run6502 can emulate this for
+      us, with the '-P FFEE' option.
+
+    - We have to have some way to make the processor stop execution
+      (there is no 'halt' instruction on the 6502, at least not the
+      early versions).  The trick is in the last instruction 'BRK',
+      that generates a 'software interrupt' -- eventually jumping to
+      the address in the 'interrupt vector'.  If we don't set the
+      interrupt vector explicitly it remains empty (zero) and BRK will
+      try to transfer control to address 0.  The '-X 0' option tells
+      run6502 to stop executing if/when the program attempts to
+      transfer control to address 0 -- which it will, when it executes
+      the 'BRK' instruction with an empty interrupt vector.  QED :-)
+
+  Here, then, is the complete command to run our program:
+
+        run6502 -l 1000 temp.img -R 1000 -P FFEE -X 0
+
+  This program is relocatable.  You can load it at address 4321
+  (change both the -l and -R options) and it will work just fine.
+
+  Google for "6502 Reference Card" (with the quotes), grab a pencil
+  and paper, and you can start writing 6502 programs immediately!  (If
+  you really want to experience what it was like in the late 1970s,
+  but without the added fun of entering each hex digit one at a time
+  into a monitor program, simply avoid the temptation ever to look at
+  your hand-assembled code with the '-d' option. ;-)
+
+  If you really start liking this and want to write longer programs in
+  text files with the hex split over many lines, you'll need a perl
+  script that can deal with newlines in the input.  Something like
+  this should do the trick...
+
+        #!/usr/bin/perl
+
+        while (<STDIN>) {
+          chomp;
+          print pack "H*", $_
+        }
+
+  (This script is included in the 'examples' directory, in a file
+  called 'hex2bin', to save you 15 seconds of copy and paste.)
+
+  Need a fun project?  Write a 6502 assembler... in 6502 machine code,
+  of course!  Read in the assembly language text via 'getchar' (see
+  the '-G' option) and write out the assembled binary via 'putchar'
+  (the '-P' option, that we've already seen).  Soon you'll be able to:
+
+        cat prog.s |
+        run6502 -l 1000 asm.img -R 1000 -G FFE0 -P FFEE -X 0 > prog.img
+
+        run6502 -l 1000 prog.img -R 1000 -G FFE0 -P FFEE -X 0
+
+  (The first prog.s you write should probably be the assembler itself,
+  transcribed from the paper copy used to hand-assemble the assembler
+  binary.  This significant milestone can be reached with a
+  surprisingly simple assembler.  After this pivotal moment the
+  assembler, assembling itself, can very quickly become very
+  powerful.)
+
+----------------------------------------------------------------
+
+3. ADVANCED
+
+  (Official justification: let's run something big and non-trivial.
+  More likely: a flimsy excuse for a trip down memory lane.)
+
+  The remaining examples assume that you have access to two ROM images
+  from the Acorn 'BBC Model B' microcomputer: the operating system and
+  the BASIC language.  (Just crawl into the attic, fire up the old
+  Beeb, '*SAVE' the images into files, and then transfer them to your
+  Unix box over RS423.  Under no circumstances should you google for
+  'Acorn BBC B OS ROMs zip', without the quotes.  That would be
+  naughty, and probably illegal -- at least until the glorious day
+  when the revolution finally comes.)
+
+  After brushing yourself down (the attic is kind of dusty, no?) save
+  the two ROM images as 'OS12.ROM' and 'BASIC2.ROM'.
+
+  The first thing we can do is use run6502 as an editor to merge the
+  two ROMs into a single image file:
+
+        run6502                         \
+          -l C000 OS12.ROM              \
+          -l 8000 BASIC2.ROM            \
+          -s 0000 +10000 bbc.img        \
+          -x
+
+  (This is a single command, with '\' continuation characters joining
+  the lines into one.  Your shell should figure it out if you just
+  copy and paste.)  It leaves a file 'bbc.img' containing both the OS
+  and BASIC.
+
+  To run this image we need the '-B' option.  It enables some minimal,
+  totally lame, hardware emulation of the BBC computer -- just enough
+  to boot the 'virtual beeb' into BASIC [3]:
+
+        run6502 -l 0 bbc.img -B
+
+  If all goes well, you should be greeted with a 'beep' and a message
+  telling you what computer you have (BBC Computer), how much RAM is
+  available (32K), the language you've been dropped into (BASIC), and
+  a '>' prompt.  Turn on 'CAPS LOCK' (many of us remember those days,
+  and some of us even used to speak in ALL CAPS) and play:
+
+        PRINT 3+4
+
+  or maybe:
+
+        10 FOR A%=1 TO 10
+        20 PRINT A%
+        30 NEXT
+        LIST
+        RUN
+
+  or even:
+
+         10 P%=&2800
+         20 O%=P%
+         30 [
+         40    opt3
+         50    lda #10
+         60    jsr &FFEE
+         70    ldx #65
+         80 .l txa
+         90    jsr &FFEE
+        100    inx
+        110    cpx #91
+        120    bne l
+        130    lda #10
+        140    jmp &FFEE
+        150 ]
+        160 CALL &2800
+        LIST
+        RUN
+
+  (How cool is that? ;-)
+
+  One final thing: there is an option '-i' that works just like '-l'
+  except that it looks to see if the image file begins with '#!'.  If
+  so, it skips over the first line of the file, up to and including
+  the first newline.  Why?  The system call that executes programs on
+  Unixy systems makes the same check.  If the user executes a text
+  file 'foo' starting with '#!prog ...' then the OS loads and runs
+  'prog' instead, passing all the '...'s and the name of the text file
+  'foo' as arguments [4].  If you have 'temp.img' left over from
+  the second example, open it in a text editor and add a single line
+  at the beginning that reads:
+
+        #!run6502 -i 1000
+
+  (If 'run6502' is not in your current working directory then you will
+  have to use the full path to the file: '#!/usr/bin/run6502' or
+  '#!/usr/local/bin/run6502' or whatever.  No spaces before the '#'!)
+
+  Now make the image executable:
+
+        chmod +x temp.img
+
+  and then (as if you hadn't already guessed) execute it:
+
+        ./temp.img
+
+  Saves an awful lot of tedious typing. [5]
+
+  Have fun!
+
+----------------------------------------------------------------
+
+FOOTNOTES
+
+
+[1] Here is the 'alphabet' program, verbatim, compiled (with
+    optimisation) on a 386.  It's 67 bytes long, almost four times
+    longer than the 6502 version.  (If I were more generous I might
+    consider that fair: 32 bits divided by 8 bits is four.)
+
+       0:   55                      push   %ebp
+       1:   89 e5                   mov    %esp,%ebp
+       3:   53                      push   %ebx
+       4:   83 ec 14                sub    $0x14,%esp
+       7:   bb 41 00 00 00          mov    $0x41,%ebx
+       c:   a1 00 00 00 00          mov    0x0,%eax
+      11:   89 44 24 04             mov    %eax,0x4(%esp)
+      15:   89 1c 24                mov    %ebx,(%esp)
+      18:   e8 fc ff ff ff          call   19 <fputc>
+      1d:   43                      inc    %ebx
+      1e:   83 fb 5b                cmp    $0x5b,%ebx
+      21:   75 e9                   jne    c <prog+0xc>
+      23:   a1 00 00 00 00          mov    0x0,%eax
+      28:   89 44 24 04             mov    %eax,0x4(%esp)
+      2c:   c7 04 24 0a 00 00 00    movl   $0xa,(%esp)
+      33:   e8 fc ff ff ff          call   34 <fputc>
+      38:   b8 00 00 00 00          mov    $0x0,%eax
+      3d:   83 c4 14                add    $0x14,%esp
+      40:   5b                      pop    %ebx
+      41:   5d                      pop    %ebp
+      42:   c3                      ret    
+
+
+[2] Here is the 'alphabet' program, verbatim, compiled (with
+    optimisation) on a PowerPC.  It's 88 bytes long, more than five
+    times longer than the 6502 version.  (I don't care what you say:
+    Apple Macs rule and mine has oodles of RAM to spare.)
+
+    00000000        mfspr   r0,lr
+    00000004        stmw    r29,0xfff4(r1)
+    00000008        stw     r0,0x8(r1)
+    0000000c        stwu    r1,0xffb0(r1)
+    00000010        bcl     20,31,0x14
+    00000014        mfspr   r31,lr
+    00000018        li      r30,0x41
+    0000001c        addis   r2,r31,ha16(0xa4-0x14)
+    00000020        lwz     r29,lo16(0xa4-0x14)(r2)
+    00000024        or      r3,r30,r30
+    00000028        addi    r4,r29,0x58
+    0000002c        bl      0x7c    ; symbol stub for: _fputc
+    00000030        cmpwi   cr7,r30,0x5a
+    00000034        addi    r30,r30,0x1
+    00000038        bne     cr7,0x24
+    0000003c        li      r3,0xa
+    00000040        bl      0x5c    ; symbol stub for: _fputc
+    00000044        li      r3,0x0
+    00000048        lwz     r0,0x58(r1)
+    0000004c        addi    r1,r1,0x50
+    00000050        mtspr   lr,r0
+    00000054        lmw     r29,0xfff4(r1)
+    00000058        blr
+
+
+[3] Time to 'fess up with an undocumented 'feature'.  We ran our
+    'bbc.img' file like this:
+
+        run6502 -l 0 bbc.img -B
+
+    I grew tired of typing all those '-'s and made run6502 check to
+    see if it was invoked with a single, non-option argument.
+    Running:
+
+        run6502 bbc.img
+
+    is precisely equivalent to the '-l -B' form above.  I don't feel
+    too guilty about this since the manual page suggests that
+    providing a single, non-option argument is illegal usage.
+
+
+[4] Okay, that might be a little confusing.  Here it is written out in
+    full.  If you have a text file called 'foo' containing
+
+        #!/usr/bin/prog -gobble
+        blah blah blah
+        blah blah blah
+
+    that is executable, and then you execute it like a compiled
+    program
+
+        ./foo
+
+    then the OS will notice the '#!' and run the following command
+    instead:
+
+        /usr/bin/prog -gobble ./foo
+
+    The '-gobble' tells 'prog' to eat the first line, leaving just the
+    blah that follows.  (The reason for choosing '#!' is that '#' is
+    the comment character in the standard Unix shell, with the obvious
+    happy consequences for shell scripts.)
+
+
+[5] We can play the same '#!' game with our 'bbc.img' file.  Open it
+    up and add the line
+
+        #!/usr/local/bin/run6502 -B -i 0
+
+    (or whatever, according to the location of the 'run6502' program),
+    make it executable
+
+        chmod +x bbc.img
+
+    and execute it:
+
+        ./bbc.img
+
+    To save a whopping 32K of zeros at the beginning of the file,
+    create the image again with
+
+        run6502                 \
+          -l C000 OS12.ROM      \
+          -l 8000 BASIC2.ROM    \
+          -s 8000 +8000 bbc.img \
+          -x
+
+    and run it with
+
+        run6502 -l 8000 bbc.img -B
+
+    and, if you like, insert the single line
+
+        #!/usr/local/bin/run6502 -B -i 8000
+
+    at the start of the image file and make it executable:
+
+        ./bbc.img
+
+
+[6] There is no footnote 6.
diff --git a/examples/hex2bin b/examples/hex2bin
new file mode 100755
index 0000000..82c2a44
--- /dev/null
+++ b/examples/hex2bin
@@ -0,0 +1,6 @@
+#!/usr/bin/perl
+# Turn each line of hex digits read from stdin into raw bytes on stdout.
+while (my $hex = <STDIN>) {
+  chomp $hex;
+  print pack("H*", $hex);
+}
diff --git a/examples/lib1.c b/examples/lib1.c
new file mode 100644
index 0000000..6b89520
--- /dev/null
+++ b/examples/lib1.c
@@ -0,0 +1,108 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+
+/* Emulated OS functions. */
+
+#define WRCH	0xFFEE	/* Write accumulator to stdout. */
+
+/* Emulated OSWRCH: print the character held in the accumulator.
+ * main() installs this as the 'call' callback for address 0xFFEE.
+ */
+int wrch(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  M6502_Registers *regs  = mpu->registers;
+  uint8_t         *stack = mpu->memory + 0x100;
+  int              lo, hi;
+
+  /* Emit the character held in A. */
+  putchar(regs->a);
+
+  /* The JSR that brought us here left its return address on the
+   * stack; pull it, low byte first, pre-incrementing S each time.
+   */
+  lo = stack[++regs->s];
+  hi = stack[++regs->s];
+
+  /* JSR saves the address of its own last byte, not of the next
+   * instruction, so resume one byte further on.  Returning non-zero
+   * tells the emulator that we handled the 'subroutine' ourselves
+   * and that it should carry on from the returned address as though
+   * the original JSR never happened.
+   */
+  return ((hi << 8) | lo) + 1;
+}
+
+
+/* Finish the demo: show the final CPU state and exit.  main() installs
+ * this as the 'call' callback for address 0.
+ */
+int done(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  char state[64];
+
+  /* Ask the library for a printable dump of the processor state. */
+  M6502_dump(mpu, state);
+
+  /* Report it under the usual banner, then stop the whole program;
+   * control never goes back to the emulator.
+   */
+  printf("\nBRK instruction\n%s\n", state);
+  exit(0);
+}
+
+int main()
+{
+  M6502    *mpu = M6502_new(0, 0, 0);	/* Make a 6502; the 0s ask the library to allocate everything */
+  unsigned  pc  = 0x1000;		/* Assembly cursor: where the next generated byte goes */
+
+  /* Install the two callback functions defined above.
+   */
+  M6502_setCallback(mpu, call, WRCH, wrch);	/* Calling FFEE -> wrch() */
+  M6502_setCallback(mpu, call,    0, done);	/* Calling 0 -> done() */
+
+  /* Macros that store bytes into the 6502's memory at 'pc', advancing
+   * 'pc' as they go.  gen2/gen3 expand to several statements. */
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))
+# define gen2(X,Y)	gen1(X); gen1(Y)
+# define gen3(X,Y,Z)	gen1(X); gen2(Y,Z)
+
+  /* Hand-assemble the program at 0x1000; each comment gives the
+   * address the instruction lands on. */
+  gen2(0xA2, 'A'     );	// 1000: LDX #'A'
+  gen1(0x8A          );	// 1002: TXA
+  gen3(0x20,0xEE,0xFF);	// 1003: JSR FFEE
+  gen1(0xE8          );	// 1006: INX
+  gen2(0xE0, 'Z'+1   );	// 1007: CPX #'Z'+1
+  gen2(0xD0, -9      );	// 1009: BNE 0x1002 (offset counted from 0x100B)
+  gen2(0xA9, '\n'    );	// 100B: LDA #'\n'
+  gen3(0x20,0xEE,0xFF);	// 100D: JSR FFEE
+  gen2(0x00,0x00     ); // 1010: BRK, followed by one zero byte
+
+  /* Just for fun: disassemble the program, one instruction per line,
+   * from 0x1000 up to the assembly cursor. */
+  {
+    char     insn[64];
+    uint16_t ip= 0x1000;
+    while (ip < pc)
+      {
+	int isz = M6502_disassemble(mpu, ip, insn);
+	printf("%04X %s\n", ip, insn);
+	ip += isz;
+      }
+  }
+
+  /* Point the RESET vector at the first instruction in the assembled
+   * program.
+   */
+  M6502_setVector(mpu, RST, 0x1000);
+
+  /* Reset the 6502 and run the program.
+   */
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here (done() calls exit), but what the hey. */
+
+  return 0;
+}
diff --git a/lib6502-compatibility.txt b/lib6502-compatibility.txt
new file mode 100644
index 0000000..23f88d2
--- /dev/null
+++ b/lib6502-compatibility.txt
@@ -0,0 +1,54 @@
+At the time of writing the latest lib6502 release is v1.3; older versions are
+not considered here.
+
+Some things which work fine with lib6502 itself are not supported when using
+lib6502-jit in hybrid (the default) or compiled execution modes. All of the
+following will result in undefined behaviour unless interpreted mode is used:
+
+* Modifying memory which contains 6502 code (whether executed yet or not)
+  inside a read callback. (All other types of callbacks are allowed to
+  modify memory freely, including modifying code.)
+
+* Defining a callback after calling M6502_run(); for example, doing so inside
+  another callback.
+
+* Checking the B and X flags in the processor status register
+  (M6502_Registers.p) inside a callback. lib6502 tracks these flags as if they
+  have a real existence at all times. lib6502-jit's compiler only sets them
+  appropriately when pushing a copy of the processor status register onto the
+  stack. This difference is *not* visible to code executing on the emulated CPU,
+  only to callbacks. In hybrid mode, which behaviour you get will depend on
+  whether your callback is invoked from the interpreter or compiled code.
+
+The following differences exist between lib6502 and lib6502-jit in all modes,
+including interpreted mode:
+
+* lib6502 is likely to be slightly faster than lib6502-jit in interpreted mode,
+  since the latter's interpreter code contains additional tests to stop
+  executing at certain points after n instructions have been executed.
+
+* Illegal instructions are treated as no-ops by default in lib6502-jit; lib6502
+  aborts if an illegal instruction is executed.
+
+* Illegal instruction callbacks are a lib6502-jit extension and are not
+  available in lib6502.
+
+* Call callbacks in lib6502 always receive a 0 as the data argument;
+  lib6502-jit supplies the opcode triggering the callback as the data argument.
+
+* A few bugs in lib6502's emulation are resolved in lib6502-jit:
+  - BRK clears the D flag
+  - ADC/SBC exactly match the behaviour of a real 65C02 in decimal mode
+  - BIT #imm only modifies the Z flag, leaving N and V untouched
+  - TSB sets the Z flag correctly
+  - TRB sets the Z flag and updates memory correctly
+
+* lib6502's run6502 -B option skips every other (ROM name) argument;
+  lib6502-jit's doesn't.
+
+lib6502-jit's stance is that anything the code executing on the emulated CPU
+does is fair game and must be handled, but that the library's client code has a
+responsibility to cooperate and not do tricky things like those documented
+above. If you have what you think is a reasonable requirement for behaviour
+which is supported by lib6502 but doesn't work on lib6502-jit please get in
+touch.
diff --git a/lib6502-jit.cpp b/lib6502-jit.cpp
new file mode 100644
index 0000000..02da212
--- /dev/null
+++ b/lib6502-jit.cpp
@@ -0,0 +1,190 @@
+/* lib6502-jit.cpp -- MOS Technology 6502 emulator	-*- C++ -*- */
+
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "const.h"
+#include "Function.h"
+#include "FunctionBuilder.h"
+#include "FunctionManager.h"
+#include "M6502Internal.h"
+#include "Registers.h"
+#include "util.h"
+
+static void outOfMemory(void)  // shared failure path for the allocations in M6502_new()
+{
+    die("out of memory");
+}
+
+M6502 *M6502_new(M6502_Registers *registers, M6502_Memory memory, M6502_Callbacks *callbacks)
+{
+  // Null components are allocated zero-filled here and flagged so that M6502_delete() frees them.
+  M6502 *const mpu = static_cast<M6502 *>(calloc(1, sizeof(M6502)));
+  if (mpu == 0) outOfMemory();
+
+  if (registers == 0)  { registers = static_cast<M6502_Registers *>(calloc(1, sizeof(M6502_Registers)));  mpu->flags |= M6502_RegistersAllocated; }
+  if (memory    == 0)  { memory    = static_cast<uint8_t         *>(calloc(1, sizeof(M6502_Memory   )));  mpu->flags |= M6502_MemoryAllocated;    }
+  if (callbacks == 0)  { callbacks = static_cast<M6502_Callbacks *>(calloc(1, sizeof(M6502_Callbacks)));  mpu->flags |= M6502_CallbacksAllocated; }
+  if (registers == 0 || memory == 0 || callbacks == 0) outOfMemory();
+
+  mpu->callbacks = callbacks;
+  mpu->memory    = memory;
+  mpu->registers = registers;
+
+  // Report any std::exception thrown while building the internal state through die().
+  try
+  {
+    mpu->internal = new _M6502_Internal(mpu);
+  }
+  catch (const std::exception &e)
+  {
+    die(e.what());
+  }
+  return mpu;
+}
+ 
+void M6502_delete(M6502 *mpu)
+{
+  // The internal state was constructed from mpu, so destroy it before the buffers it may still reference.
+  delete mpu->internal;
+  if (mpu->flags & M6502_CallbacksAllocated) free(mpu->callbacks);
+  if (mpu->flags & M6502_MemoryAllocated   ) free(mpu->memory);
+  if (mpu->flags & M6502_RegistersAllocated) free(mpu->registers);
+  free(mpu);
+}
+
+void M6502_setMode(M6502 *mpu, M6502_Mode mode, int arg)
+{
+    // An arg of 0 selects M6502_Internal::default_max_instructions_.
+    int max_instructions = arg;
+    if (max_instructions == 0)
+        max_instructions = M6502_Internal::default_max_instructions_;
+
+    mpu->internal->mode_ = mode;
+    mpu->internal->max_instructions_ = max_instructions;
+}
+
+extern "C" void M6502_run_interpreted(M6502 *mpu, int instructions_left);
+
+// I don't know if it's "supposed" to work, but it doesn't seem completely
+// unreasonable for a lib6502 client to do a setjmp() before invoking
+// M6502_run() and have a callback function longjmp() out of the emulation. I
+// believe this will work with lib6502 itself, and I would like this emulation
+// to do the same.  (Note that currently for both lib6502 and lib6502-jit,
+// read/write callbacks don't see an up-to-date M6502_Registers object and so
+// the setjmp/longjmp trick would result in restarting execution in the wrong
+// place with the wrong registers. Call callbacks and illegal instruction
+// callbacks should work though.)
+//
+// To this end, M6502_run_compiled() and M6502_run_hybrid() both update the
+// Registers object from the M6502_Registers object on entry to pick up the
+// current state. They also both ensure they call update_memory_snapshot() as
+// appropriate in case the caller modified memory before invoking M6502_run()
+// again.
+
+static void M6502_run_compiled(M6502 *mpu)
+{
+    // Pick up memory and register state as the caller left it.
+    FunctionManager &manager = mpu->internal->function_manager_;
+    Registers &regs = mpu->internal->registers_;
+    manager.update_memory_snapshot();
+    regs.from_M6502_Registers(mpu);
+
+    for (;;)
+    {
+        Function *const fn = manager.get_function(regs.pc);
+        TRACE("Executing Function object for address 0x" << std::hex <<
+              std::setfill('0') << std::setw(4) << regs.pc);
+        fn->execute();
+    }
+}
+
+#ifdef LOG
+
+static std::string M6502_dump_str(M6502 *mpu)  // M6502_dump() output as a string, for TRACE (LOG builds only)
+{
+    char state[64];
+    M6502_dump(mpu, state);
+    return std::string(state);
+}
+
+#endif
+
+static void M6502_run_hybrid(M6502 *mpu)  // interpret in short bursts, switching to compiled code when it is available
+{
+    FunctionManager &function_manager = mpu->internal->function_manager_;
+    Registers &registers = mpu->internal->registers_;
+    registers.from_M6502_Registers(mpu);
+    TRACE("About to interpret, CPU state: " << M6502_dump_str(mpu));
+    while (true)
+    {
+        const int instructions_to_interpret = 100;
+        M6502_run_interpreted(mpu, instructions_to_interpret);
+        if (function_manager.jit_thread_idle()) // otherwise just keep interpreting
+        {
+            TRACE("JIT thread is idle");
+            registers.from_M6502_Registers(mpu); // resync from the M6502_Registers object after interpreting
+            function_manager.update_memory_snapshot();
+            Function *f;
+            while ((f = function_manager.get_function_lazy(registers.pc)) != 0) // 0 means no Function for this pc
+            {
+                TRACE("Executing Function object for address 0x" << std::hex <<
+                      std::setfill('0') << std::setw(4) << registers.pc);
+                f->execute();
+            }
+            TRACE("No Function object available for address 0x" << std::hex <<
+                  std::setfill('0') << std::setw(4) << registers.pc <<
+                  ", falling back to interpreter");
+            registers.to_M6502_Registers(mpu); // hand the current state back before interpreting again
+            TRACE("About to interpret, CPU state: " << M6502_dump_str(mpu));
+        }
+    }
+}
+
+void M6502_run(M6502 *mpu)
+{
+    // Each runner below loops forever, so getting past the dispatch is itself reported as an error.
+    try
+    {
+        if (mpu->internal->mode_ == M6502_ModeInterpreted)
+        {
+            for (;;)
+            {
+                M6502_run_interpreted(mpu, std::numeric_limits<int>::max());
+            }
+        }
+        else if (mpu->internal->mode_ == M6502_ModeCompiled)
+        {
+            M6502_run_compiled(mpu);
+        }
+        else if (mpu->internal->mode_ == M6502_ModeHybrid)
+        {
+            M6502_run_hybrid(mpu);
+        }
+        else
+        {
+            die("Unknown execution mode in M6502_run()");
+        }
+
+        die("M6502_run() returned!");
+    }
+    catch (const std::exception &e)
+    {
+        die(e.what());
+    }
+}
diff --git a/lib6502.c b/lib6502.c
new file mode 100644
index 0000000..866e1b9
--- /dev/null
+++ b/lib6502.c
@@ -0,0 +1,910 @@
+/* lib6502.c -- MOS Technology 6502 emulator	-*- C -*- */
+
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+/* BUGS:
+ *   - RTS and RTI do not check the return address for a callback
+ *   - the disassembler cannot be configured to read two bytes for BRK
+ *   - architectural variations (unimplemented/extended instructions) not implemented
+ *   - ANSI versions (free from gcc extensions) of the dispatch macros are missing
+ *   - emulator+disassembler in same object file (library is kind of pointless)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+
+typedef uint8_t  byte;
+typedef uint16_t word;
+
+enum {
+  flagN= 0x80,	/* bit 7: negative            */
+  flagV= 0x40,	/* bit 6: overflow            */
+  flagX= 0x20,	/* bit 5: unused              */
+  flagB= 0x10,	/* bit 4: irq came from brk   */
+  flagD= 0x08,	/* bit 3: decimal mode        */
+  flagI= 0x04,	/* bit 2: irq disable         */
+  flagZ= 0x02,	/* bit 1: zero                */
+  flagC= 0x01	/* bit 0: carry               */
+};
+
+#define getN()	(P & flagN)	/* the getX() macros yield the masked bit of P:  */
+#define getV()	(P & flagV)	/* zero when clear, the flag's bit value when    */
+#define getB()	(P & flagB)	/* set -- so only getC() (bit 0) is a 0/1 value  */
+#define getD()	(P & flagD)
+#define getI()	(P & flagI)
+#define getZ()	(P & flagZ)
+#define getC()	(P & flagC)
+
+#define setNVZC(N,V,Z,C)	(P= (P & ~(flagN | flagV | flagZ | flagC)) | (N) | ((V)<<6) | ((Z)<<1) | (C))	/* N is OR'ed in unshifted: pass 0 or 0x80 */
+#define setNZC(N,Z,C)		(P= (P & ~(flagN |         flagZ | flagC)) | (N) |            ((Z)<<1) | (C))	/* V, Z and C are shifted into place: pass 0 or 1 */
+#define setNZ(N,Z)		(P= (P & ~(flagN |         flagZ        )) | (N) |            ((Z)<<1)      )
+#define setZ(Z)			(P= (P & ~(                flagZ        )) |                  ((Z)<<1)      )
+#define setC(C)			(P= (P & ~(                        flagC)) |                             (C))
+
+#define NAND(P, Q)	(!((P) & (Q)))	/* 1 when P and Q share no set bits, else 0 */
+
+#define tick(n)		/* expands to nothing in this build */
+#define tickIf(p)	/* expands to nothing in this build */
+
+/* memory access (via the per-address callback if one is installed, whose return value then becomes the result) -- ARGUMENTS ARE EVALUATED MORE THAN ONCE! */
+
+#define putMemory(ADDR, BYTE)			\
+  ( writeCallback[ADDR]				\
+      ? writeCallback[ADDR](mpu, ADDR, BYTE)	\
+      : (memory[ADDR]= BYTE) )
+
+#define getMemory(ADDR)				\
+  ( readCallback[ADDR]				\
+      ?  readCallback[ADDR](mpu, ADDR, 0)	\
+      :  memory[ADDR] )
+
+/* stack access (always direct): page 1, S post-decremented by push and pre-incremented by pop */
+
+#define push(BYTE)		(memory[0x0100 + S--]= (BYTE))
+#define pop()			(memory[++S + 0x0100])
+
+/* addressing modes (memory access direct) */
+
+#define implied(ticks)	/* no operand bytes */	\
+  tick(ticks);
+
+#define immediate(ticks)	/* ea= address of the operand byte itself; PC steps over it */	\
+  tick(ticks);					\
+  ea= PC++;
+
+#define abs(ticks)	/* ea= 16-bit little-endian operand; PC += 2 */	\
+  tick(ticks);					\
+  ea= memory[PC] + (memory[PC + 1] << 8);	\
+  PC += 2;
+
+#define relative(ticks)	/* ea= operand byte, less 0x100 when its bit 7 is set; PC += 1 */	\
+  tick(ticks);					\
+  ea= memory[PC++];				\
+  if (ea & 0x80) ea -= 0x100;			\
+  tickIf((ea >> 8) != (PC >> 8));
+
+#define indirect(ticks)	/* (abs): ea= word stored at the 16-bit operand address; PC += 2 */	\
+  tick(ticks);					\
+  {						\
+    word tmp;					\
+    tmp= memory[PC]  + (memory[PC  + 1] << 8);	\
+    ea = memory[tmp] + (memory[(word)(tmp + 1)] << 8);	/* wrap 0xFFFF+1 to 0 rather than index 0x10000 */	\
+    PC += 2;					\
+  }
+
+#define absx(ticks)	/* ea= 16-bit operand + X; PC += 2 */			\
+  tick(ticks);							\
+  ea= memory[PC] + (memory[PC + 1] << 8);			\
+  PC += 2;							\
+  tickIf((ticks == 4) && ((ea >> 8) != ((ea + X) >> 8)));	\
+  ea += X;
+
+#define absy(ticks)	/* ea= 16-bit operand + Y; PC += 2 */			\
+  tick(ticks);							\
+  ea= memory[PC] + (memory[PC + 1] << 8);			\
+  PC += 2;							\
+  tickIf((ticks == 4) && ((ea >> 8) != ((ea + Y) >> 8)));	\
+  ea += Y
+
+#define zp(ticks)	/* ea= operand byte (an address in page zero); PC += 1 */	\
+  tick(ticks);					\
+  ea= memory[PC++];
+
+#define zpx(ticks)	/* ea= (operand byte + X) & 0xff; PC += 1 */	\
+  tick(ticks);					\
+  ea= memory[PC++] + X;				\
+  ea &= 0x00ff;
+
+#define zpy(ticks)	/* ea= (operand byte + Y) & 0xff; PC += 1 */	\
+  tick(ticks);					\
+  ea= memory[PC++] + Y;				\
+  ea &= 0x00ff;
+
+#define indx(ticks)	/* (zp,X): ea= word stored at page-zero address (operand + X) & 0xff */	\
+  tick(ticks);					\
+  {						\
+    byte tmp= memory[PC++] + X;			\
+    ea= memory[tmp] + (memory[(byte)(tmp + 1)] << 8);	/* high byte of the pointer wraps within page zero */	\
+  }
+
+#define indy(ticks)	/* (zp),Y: ea= word stored at the page-zero operand address, plus Y */	\
+  tick(ticks);							\
+  {								\
+    byte tmp= memory[PC++];					\
+    ea= memory[tmp] + (memory[(byte)(tmp + 1)] << 8);	/* high byte of the pointer wraps within page zero */	\
+    tickIf((ticks == 5) && ((ea >> 8) != ((ea + Y) >> 8)));	\
+    ea += Y;							\
+  }
+
+#define indabsx(ticks)	/* (abs,X): ea= word stored at (16-bit operand + X); PC += 2 */	\
+  tick(ticks);						\
+  {							\
+    word tmp= memory[PC ] + (memory[PC  + 1] << 8) + X;	\
+    PC += 2;	/* step over the operand as abs/indirect do: jmp reads its opcode back from PC-3 */	\
+    ea = memory[tmp] + (memory[(word)(tmp + 1)] << 8);	/* wrap 0xFFFF+1 to 0 rather than index 0x10000 */	\
+  }
+
+#define indzp(ticks)	/* (zp): ea= word stored at the page-zero operand address */	\
+  tick(ticks);						\
+  {							\
+    byte tmp;						\
+    tmp= memory[PC++];					\
+    ea = memory[tmp] + (memory[(byte)(tmp + 1)] << 8);	/* high byte of the pointer wraps within page zero */	\
+  }
+
+/* insns */
+
+#define adc(ticks, adrmode)	/* A= A + M + C, binary or decimal according to the D flag */	\
+  adrmode(ticks);									\
+  {											\
+    byte B= getMemory(ea);								\
+    if (!getD())									\
+      {											\
+	int c= A + B + getC();	/* unsigned sum: bit 8 is the carry out */		\
+	int v= (int8_t)A + (int8_t)B + getC();	/* signed sum, used for V */		\
+	fetch();									\
+	A= c;										\
+	setNVZC((A & 0x80), (((A & 0x80) > 0) ^ (v < 0)), (A == 0), ((c & 0x100) > 0));	\
+	next();										\
+      }											\
+    else										\
+      {											\
+	/* Algorithm taken from http://www.6502.org/tutorials/decimal_mode.html */      \
+	/* inelegant & slow, but consistent with the hw for illegal digits */		\
+	int l, s, t, v;									\
+	l= (A & 0x0F) + (B & 0x0F) + getC();						\
+	if (l >= 0x0A) { l = ((l + 0x06) & 0x0F) + 0x10; }				\
+	s= (A & 0xF0) + (B & 0xF0) + l;							\
+	t= (int8_t)(A & 0xF0) + (int8_t)(B & 0xF0) + (int8_t)l;				\
+	v= (t < -128) || (t > 127);							\
+	if (s >= 0xA0) { s += 0x60; }							\
+        fetch();									\
+	A= s;										\
+	/* only C is valid on NMOS 6502 */						\
+	setNVZC(s & 0x80, v, !A, (s >= 0x100));						\
+	tick(1);									\
+	next();										\
+      }											\
+  }
+
+#define sbc(ticks, adrmode)	/* A= A - M - (1 - C), binary or decimal according to the D flag */	\
+  adrmode(ticks);									\
+  {											\
+    byte B= getMemory(ea);								\
+    if (!getD())									\
+      {											\
+	int b= 1 - (P &0x01);	/* borrow in: the inverse of the carry flag */		\
+	int c= A - B - b;								\
+	int v= (int8_t)A - (int8_t) B - b;						\
+	fetch();									\
+	A= c;										\
+	setNVZC(A & 0x80, ((A & 0x80) > 0) ^ ((v & 0x100) != 0), A == 0, c >= 0);	\
+	next();										\
+      }											\
+    else										\
+      {											\
+	/* Algorithm taken from http://www.6502.org/tutorials/decimal_mode.html */      \
+	int b= 1 - (P &0x01);								\
+	int l= (A & 0x0F) - (B & 0x0F) - b;	 					\
+	int s= A - B + getC() - 1;							\
+	int c= !(s & 0x100);								\
+	int v= (int8_t)A - (int8_t) B - b;						\
+      	if (s < 0) { s -= 0x60; } 							\
+	if (l < 0) { s -= 0x06; }							\
+	fetch(); 									\
+	A = s;										\
+	/* only C is valid on NMOS 6502 */						\
+	setNVZC(s & 0x80, ((v & 0x80) > 0) ^ ((v & 0x100) != 0), !A, c);		\
+	tick(1);									\
+	next();										\
+      }											\
+  }
+
+#define cmpR(ticks, adrmode, R)	/* compare register R with M: N,Z from R - M; C= (R >= M) */	\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte B= getMemory(ea);			\
+    byte d= R - B;				\
+    setNZC(d & 0x80, !d, R >= B);		\
+  }						\
+  next();
+
+#define cmp(ticks, adrmode)	cmpR(ticks, adrmode, A)
+#define cpx(ticks, adrmode)	cmpR(ticks, adrmode, X)
+#define cpy(ticks, adrmode)	cmpR(ticks, adrmode, Y)
+
+#define dec(ticks, adrmode)	/* M= M - 1, setting N and Z */	\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte B= getMemory(ea);			\
+    --B;					\
+    putMemory(ea, B);				\
+    setNZ(B & 0x80, !B);			\
+  }						\
+  next();
+
+#define decR(ticks, adrmode, R)	/* R= R - 1, setting N and Z; adrmode is not used */	\
+  fetch();					\
+  tick(ticks);					\
+  --R;						\
+  setNZ(R & 0x80, !R);				\
+  next();
+
+#define dea(ticks, adrmode)	decR(ticks, adrmode, A)
+#define dex(ticks, adrmode)	decR(ticks, adrmode, X)
+#define dey(ticks, adrmode)	decR(ticks, adrmode, Y)
+
+#define inc(ticks, adrmode)	/* M= M + 1, setting N and Z */	\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte B= getMemory(ea);			\
+    ++B;					\
+    putMemory(ea, B);				\
+    setNZ(B & 0x80, !B);			\
+  }						\
+  next();
+
+#define incR(ticks, adrmode, R)	/* R= R + 1, setting N and Z; adrmode is not used */	\
+  fetch();					\
+  tick(ticks);					\
+  ++R;						\
+  setNZ(R & 0x80, !R);				\
+  next();
+
+#define ina(ticks, adrmode)	incR(ticks, adrmode, A)
+#define inx(ticks, adrmode)	incR(ticks, adrmode, X)
+#define iny(ticks, adrmode)	incR(ticks, adrmode, Y)
+
+#define bit(ticks, adrmode)	/* N,V= bits 7,6 of M; Z set when (A & M) is zero */	\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte B= getMemory(ea);			\
+    P= (P & ~(flagN | flagV | flagZ))		\
+      | (B & (0xC0)) | (((A & B) == 0) << 1);	\
+  }						\
+  next();
+
+/* BIT is unique in varying its behaviour based on addressing mode;
+ * BIT immediate only modifies the Z flag.
+ * http://6502.org/tutorials/65c02opcodes.html
+ */
+#define bim(ticks, adrmode)			\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte B= getMemory(ea);			\
+    setZ((A & B) == 0);                  	\
+  }						\
+  next();
+
+#define tsb(ticks, adrmode)	/* Z set when (A & M) is zero, then M= M | A */	\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte b= getMemory(ea);			\
+    setZ(!(b & A));				\
+    b |= A;					\
+    putMemory(ea, b);				\
+  }						\
+  next();
+
+#define trb(ticks, adrmode)	/* Z set when (A & M) is zero, then M= M & ~A */	\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte b= getMemory(ea);			\
+    setZ(!(b & A));				\
+    b &= (A ^ 0xFF);				\
+    putMemory(ea, b);				\
+  }						\
+  next();
+
+#define bitwise(ticks, adrmode, op)	/* A= A op M, setting N and Z */	\
+  adrmode(ticks);				\
+  fetch();					\
+  A op##= getMemory(ea);			\
+  setNZ(A & 0x80, !A);				\
+  next();
+
+#define and(ticks, adrmode)	bitwise(ticks, adrmode, &)
+#define eor(ticks, adrmode)	bitwise(ticks, adrmode, ^)
+#define ora(ticks, adrmode)	bitwise(ticks, adrmode, |)
+
+#define asl(ticks, adrmode)	/* M= M << 1, bit 7 -> C; Z tests the stored byte, not the 9-bit i */	\
+  adrmode(ticks);				\
+  {						\
+    unsigned int i= getMemory(ea) << 1;		\
+    putMemory(ea, i);				\
+    fetch();					\
+    setNZC(i & 0x80, !(i & 0xFF), i >> 8);	\
+  }						\
+  next();
+
+#define asla(ticks, adrmode)	/* A= A << 1, old bit 7 -> C; adrmode is not used */	\
+  tick(ticks);					\
+  fetch();					\
+  {						\
+    int c= A >> 7;				\
+    A <<= 1;					\
+    setNZC(A & 0x80, !A, c);			\
+  }						\
+  next();
+
+#define lsr(ticks, adrmode)	/* M= M >> 1, old bit 0 -> C, N cleared */	\
+  adrmode(ticks);				\
+  {						\
+    byte b= getMemory(ea);			\
+    int  c= b & 1;				\
+    fetch();					\
+    b >>= 1;					\
+    putMemory(ea, b);				\
+    setNZC(0, !b, c);				\
+  }						\
+  next();
+
+#define lsra(ticks, adrmode)	/* A= A >> 1, old bit 0 -> C, N cleared; adrmode is not used */	\
+  tick(ticks);					\
+  fetch();					\
+  {						\
+    int c= A & 1;				\
+    A >>= 1;					\
+    setNZC(0, !A, c);				\
+  }						\
+  next();
+
+#define rol(ticks, adrmode)	/* rotate M left through carry: C -> bit 0, old bit 7 -> C */	\
+  adrmode(ticks);				\
+  {						\
+    word b= (getMemory(ea) << 1) | getC();	\
+    fetch();					\
+    putMemory(ea, b);				\
+    setNZC(b & 0x80, !(b & 0xFF), b >> 8);	\
+  }						\
+  next();
+
+#define rola(ticks, adrmode)	/* rotate A left through carry; adrmode is not used */	\
+  tick(ticks);					\
+  fetch();					\
+  {						\
+    word b= (A << 1) | getC();			\
+    A= b;					\
+    setNZC(A & 0x80, !A, b >> 8);		\
+  }						\
+  next();
+
+#define ror(ticks, adrmode)	/* rotate M right through carry: C -> bit 7, old bit 0 -> C */	\
+  adrmode(ticks);				\
+  {						\
+    int  c= getC();				\
+    byte m= getMemory(ea);			\
+    byte b= (c << 7) | (m >> 1);		\
+    fetch();					\
+    putMemory(ea, b);				\
+    setNZC(b & 0x80, !b, m & 1);		\
+  }						\
+  next();
+
+#define rora(ticks, adrmode)	/* rotate A right through carry; NOTE(review): unlike asla/lsra/rola this expands adrmode(ticks) rather than tick(ticks) */	\
+  adrmode(ticks);				\
+  {						\
+    int ci= getC();				\
+    int co= A & 1;				\
+    fetch();					\
+    A= (ci << 7) | (A >> 1);			\
+    setNZC(A & 0x80, !A, co);			\
+  }						\
+  next();
+
+#define tRS(ticks, adrmode, R, S)	/* copy register R into register S (the macro parameter), setting N and Z */	\
+  fetch();					\
+  tick(ticks);					\
+  S= R;						\
+  setNZ(S & 0x80, !S);				\
+  next();
+
+#define tax(ticks, adrmode)	tRS(ticks, adrmode, A, X)
+#define txa(ticks, adrmode)	tRS(ticks, adrmode, X, A)
+#define tay(ticks, adrmode)	tRS(ticks, adrmode, A, Y)
+#define tya(ticks, adrmode)	tRS(ticks, adrmode, Y, A)
+#define tsx(ticks, adrmode)	tRS(ticks, adrmode, S, X)
+
+#define txs(ticks, adrmode)	/* S= X; unlike the other transfers, no flags are changed */	\
+  fetch();					\
+  tick(ticks);					\
+  S= X;						\
+  next();
+
+#define ldR(ticks, adrmode, R)	/* R= M, setting N and Z */	\
+  adrmode(ticks);				\
+  fetch();					\
+  R= getMemory(ea);				\
+  setNZ(R & 0x80, !R);				\
+  next();
+
+#define lda(ticks, adrmode)	ldR(ticks, adrmode, A)
+#define ldx(ticks, adrmode)	ldR(ticks, adrmode, X)
+#define ldy(ticks, adrmode)	ldR(ticks, adrmode, Y)
+
+#define stR(ticks, adrmode, R)	/* M= R; no flags are changed (stz passes the constant 0 as R) */	\
+  adrmode(ticks);				\
+  fetch();					\
+  putMemory(ea, R);				\
+  next();
+
+#define sta(ticks, adrmode)	stR(ticks, adrmode, A)
+#define stx(ticks, adrmode)	stR(ticks, adrmode, X)
+#define sty(ticks, adrmode)	stR(ticks, adrmode, Y)
+#define stz(ticks, adrmode)	stR(ticks, adrmode, 0)
+
+/* We only set keep_running to false if we branch; this is just
+ * an attempt to pick points to JIT at which we have a chance of
+ * hitting a second time. 
+ */
+#define branch(ticks, adrmode, cond)	/* taken: PC += offset from adrmode; not taken: step over the operand byte */	\
+  if (cond)					\
+    {						\
+      adrmode(ticks);				\
+      PC += ea;					\
+      tick(1);					\
+      keep_running= (instructions_left > 0);    \
+    }						\
+  else						\
+    {						\
+      tick(ticks);				\
+      PC++;					\
+    }						\
+  fetch();					\
+  next();
+
+#define bcc(ticks, adrmode)	branch(ticks, adrmode, !getC())
+#define bcs(ticks, adrmode)	branch(ticks, adrmode,  getC())
+#define bne(ticks, adrmode)	branch(ticks, adrmode, !getZ())
+#define beq(ticks, adrmode)	branch(ticks, adrmode,  getZ())
+#define bpl(ticks, adrmode)	branch(ticks, adrmode, !getN())
+#define bmi(ticks, adrmode)	branch(ticks, adrmode,  getN())
+#define bvc(ticks, adrmode)	branch(ticks, adrmode, !getV())
+#define bvs(ticks, adrmode)	branch(ticks, adrmode,  getV())
+
+#define bra(ticks, adrmode)	/* unconditional branch: PC += offset from adrmode */	\
+  adrmode(ticks);				\
+  PC += ea;					\
+  keep_running= (instructions_left > 0);        \
+  fetch();					\
+  tick(1);					\
+  next();
+
+#define jmp(ticks, adrmode)	/* PC= ea; a 'call' callback on ea may redirect by returning non-zero */	\
+  {								\
+      adrmode(ticks);						\
+      byte opcode= mpu->memory[PC-3];	/* relies on adrmode having stepped PC over a two-byte operand */	\
+      PC= ea;							\
+      if (mpu->callbacks->call[ea])				\
+	{							\
+	  word addr;						\
+	  externalise();					\
+	  if ((addr= mpu->callbacks->call[ea](mpu, ea, opcode)))\
+	    {							\
+	      internalise();					\
+	      PC= addr;						\
+	    }							\
+	}							\
+      keep_running= (instructions_left > 0);        		\
+      fetch();							\
+      next();							\
+  }
+
+#define jsr(ticks, adrmode)	/* push the address of the JSR's last byte, then go to ea or to a 'call' callback's result */	\
+  PC++;							\
+  push(PC >> 8);					\
+  push(PC & 0xff);					\
+  PC--;							\
+  adrmode(ticks);					\
+  if (mpu->callbacks->call[ea])				\
+    {							\
+      word addr;					\
+      externalise();					\
+      if ((addr= mpu->callbacks->call[ea](mpu, ea, 0x20))) /* 0x20 is passed as the data argument */ \
+	{						\
+	  internalise();				\
+	  PC= addr;					\
+  	  keep_running= (instructions_left > 0);       	\
+	  fetch();					\
+	  next();					\
+	}						\
+    }							\
+  PC=ea;						\
+  keep_running= (instructions_left > 0);        	\
+  fetch();						\
+  next();
+
+#define rts(ticks, adrmode)	/* RTS: return from subroutine */ \
+  tick(ticks);					\
+  PC  =  pop();		/* low byte first */	\
+  PC |= (pop() << 8);	/* then high byte */	\
+  PC++;			/* resume one past the popped address */ \
+  keep_running= (instructions_left > 0);       	\
+  fetch();					\
+  next();
+
+#define brk(ticks, adrmode)	/* BRK: software interrupt through the vector at 0xfffe/0xffff */ \
+  tick(ticks);							\
+  PC++;								\
+  push(PC >> 8);						\
+  push(PC & 0xff);						\
+  P |= flagB;							\
+  /* http://www.6502.org/tutorials/65c02opcodes.html - unlike
+   * the 6502, the 65C02 clears D on BRK.
+   */								\
+  P &= ~flagD;                                                  \
+  push(P | flagX);	/* pushed copy always has flagX set */	\
+  P |= flagI;							\
+  {								\
+    word hdlr= getMemory(0xfffe) + (getMemory(0xffff) << 8);	\
+    if (mpu->callbacks->call[hdlr])				\
+      {								\
+	word addr;						\
+	externalise();						\
+	if ((addr= mpu->callbacks->call[hdlr](mpu, PC - 2, 0)))	/* PC - 2: the BRK opcode's address */ \
+	  {							\
+	    internalise();					\
+	    hdlr= addr;		/* callback chose the handler */ \
+	  }							\
+      }								\
+    PC= hdlr;							\
+  }								\
+  keep_running= (instructions_left > 0);       			\
+  fetch();							\
+  next();
+
+#define rti(ticks, adrmode)	/* RTI: return from interrupt */ \
+  tick(ticks);					\
+  P=     pop();		/* status first ... */	\
+  PC=    pop();		/* ... then PC low ... */ \
+  PC |= (pop() << 8);	/* ... and PC high; no increment, unlike rts */ \
+  keep_running= (instructions_left > 0);       	\
+  fetch();					\
+  next();
+
+#define nop(ticks, adrmode)	/* NOP: only accounts for time */ \
+  fetch();					\
+  tick(ticks);					\
+  next();
+
+#define ill(ticks, adrmode)	/* illegal opcode: skip its operand, optionally via a callback */		\
+  {											\
+    word addr= PC-1;		/* address of the opcode just fetched */		\
+    byte instruction= memory[addr];							\
+    tick(ticks);									\
+    if (mpu->callbacks->illegal_instruction[instruction])				\
+      {											\
+	adrmode(ticks);									\
+	externalise();		/* callback sees PC already past the operand */	\
+        if ((addr= mpu->callbacks->illegal_instruction[instruction](mpu, addr,         \
+								     instruction)))     \
+          {										\
+	    mpu->registers->pc= addr;	/* non-zero result is the new PC */		\
+          }										\
+	internalise();		/* always reload: picks up any register changes */	\
+        fetch();									\
+	next();										\
+      }											\
+    else										\
+      {											\
+        adrmode(ticks);                                                                 \
+        fetch();                                                                        \
+        next();                                                                         \
+      }											\
+  }
+
+#define phR(ticks, adrmode, R)	/* push the value R */ \
+  fetch();					\
+  tick(ticks);					\
+  push(R);					\
+  next();
+
+#define pha(ticks, adrmode)	phR(ticks, adrmode, A)
+#define phx(ticks, adrmode)	phR(ticks, adrmode, X)
+#define phy(ticks, adrmode)	phR(ticks, adrmode, Y)
+#define php(ticks, adrmode)	phR(ticks, adrmode, P | flagX | flagB)	/* pushed copy has flagX and flagB forced on */
+
+#define plR(ticks, adrmode, R)	/* pop into register R and update N/Z */ \
+  fetch();					\
+  tick(ticks);					\
+  R= pop();					\
+  setNZ(R & 0x80, !R);	/* N from bit 7, Z when the value is 0 */ \
+  next();
+
+#define pla(ticks, adrmode)	plR(ticks, adrmode, A)
+#define plx(ticks, adrmode)	plR(ticks, adrmode, X)
+#define ply(ticks, adrmode)	plR(ticks, adrmode, Y)
+
+#define plp(ticks, adrmode)	/* PLP: pop the whole status byte, unmodified, into P */ \
+  fetch();					\
+  tick(ticks);					\
+  P= pop();					\
+  next();
+
+#define clF(ticks, adrmode, F)	/* clear the status bit(s) F in P */ \
+  fetch();					\
+  tick(ticks);					\
+  P &= ~F;					\
+  next();
+
+#define clc(ticks, adrmode)	clF(ticks, adrmode, flagC)
+#define cld(ticks, adrmode)	clF(ticks, adrmode, flagD)
+#define cli(ticks, adrmode)	clF(ticks, adrmode, flagI)
+#define clv(ticks, adrmode)	clF(ticks, adrmode, flagV)
+
+#define seF(ticks, adrmode, F)	/* set the status bit(s) F in P */ \
+  fetch();					\
+  tick(ticks);					\
+  P |= F;					\
+  next();
+
+#define sec(ticks, adrmode)	seF(ticks, adrmode, flagC)
+#define sed(ticks, adrmode)	seF(ticks, adrmode, flagD)
+#define sei(ticks, adrmode)	seF(ticks, adrmode, flagI)
+
+#define do_insns(_)	/* X-macro: applies _(opcode-hex, instruction-macro, addressing-mode, ticks) to all 256 opcodes */	\
+  _(00, brk, implied,   7);  _(01, ora, indx,      6);  _(02, ill, zp,        2);  _(03, ill, implied, 2);      \
+  _(04, tsb, zp,        3);  _(05, ora, zp,        3);  _(06, asl, zp,        5);  _(07, ill, implied, 2);      \
+  _(08, php, implied,   3);  _(09, ora, immediate, 3);  _(0a, asla,implied,   2);  _(0b, ill, implied, 2);      \
+  _(0c, tsb, abs,       4);  _(0d, ora, abs,       4);  _(0e, asl, abs,       6);  _(0f, ill, implied, 2);      \
+  _(10, bpl, relative,  2);  _(11, ora, indy,      5);  _(12, ora, indzp,     3);  _(13, ill, implied, 2);      \
+  _(14, trb, zp,        3);  _(15, ora, zpx,       4);  _(16, asl, zpx,       6);  _(17, ill, implied, 2);      \
+  _(18, clc, implied,   2);  _(19, ora, absy,      4);  _(1a, ina, implied,   2);  _(1b, ill, implied, 2);      \
+  _(1c, trb, abs,       4);  _(1d, ora, absx,      4);  _(1e, asl, absx,      7);  _(1f, ill, implied, 2);      \
+  _(20, jsr, abs,       6);  _(21, and, indx,      6);  _(22, ill, zp,        2);  _(23, ill, implied, 2);      \
+  _(24, bit, zp,        3);  _(25, and, zp,        3);  _(26, rol, zp,        5);  _(27, ill, implied, 2);      \
+  _(28, plp, implied,   4);  _(29, and, immediate, 3);  _(2a, rola,implied,   2);  _(2b, ill, implied, 2);      \
+  _(2c, bit, abs,       4);  _(2d, and, abs,       4);  _(2e, rol, abs,       6);  _(2f, ill, implied, 2);      \
+  _(30, bmi, relative,  2);  _(31, and, indy,      5);  _(32, and, indzp,     3);  _(33, ill, implied, 2);      \
+  _(34, bit, zpx,       4);  _(35, and, zpx,       4);  _(36, rol, zpx,       6);  _(37, ill, implied, 2);      \
+  _(38, sec, implied,   2);  _(39, and, absy,      4);  _(3a, dea, implied,   2);  _(3b, ill, implied, 2);      \
+  _(3c, bit, absx,      4);  _(3d, and, absx,      4);  _(3e, rol, absx,      7);  _(3f, ill, implied, 2);      \
+  _(40, rti, implied,   6);  _(41, eor, indx,      6);  _(42, ill, zp,        2);  _(43, ill, implied, 2);      \
+  _(44, ill, zp,        3);  _(45, eor, zp,        3);  _(46, lsr, zp,        5);  _(47, ill, implied, 2);      \
+  _(48, pha, implied,   3);  _(49, eor, immediate, 3);  _(4a, lsra,implied,   2);  _(4b, ill, implied, 2);      \
+  _(4c, jmp, abs,       3);  _(4d, eor, abs,       4);  _(4e, lsr, abs,       6);  _(4f, ill, implied, 2);      \
+  _(50, bvc, relative,  2);  _(51, eor, indy,      5);  _(52, eor, indzp,     3);  _(53, ill, implied, 2);      \
+  _(54, ill, zp,        4);  _(55, eor, zpx,       4);  _(56, lsr, zpx,       6);  _(57, ill, implied, 2);      \
+  _(58, cli, implied,   2);  _(59, eor, absy,      4);  _(5a, phy, implied,   3);  _(5b, ill, implied, 2);      \
+  _(5c, ill, abs,       8);  _(5d, eor, absx,      4);  _(5e, lsr, absx,      7);  _(5f, ill, implied, 2);      \
+  _(60, rts, implied,   6);  _(61, adc, indx,      6);  _(62, ill, zp,        2);  _(63, ill, implied, 2);      \
+  _(64, stz, zp,        3);  _(65, adc, zp,        3);  _(66, ror, zp,        5);  _(67, ill, implied, 2);      \
+  _(68, pla, implied,   4);  _(69, adc, immediate, 3);  _(6a, rora,implied,   2);  _(6b, ill, implied, 2);      \
+  _(6c, jmp, indirect,  5);  _(6d, adc, abs,       4);  _(6e, ror, abs,       6);  _(6f, ill, implied, 2);      \
+  _(70, bvs, relative,  2);  _(71, adc, indy,      5);  _(72, adc, indzp,     3);  _(73, ill, implied, 2);      \
+  _(74, stz, zpx,       4);  _(75, adc, zpx,       4);  _(76, ror, zpx,       6);  _(77, ill, implied, 2);      \
+  _(78, sei, implied,   2);  _(79, adc, absy,      4);  _(7a, ply, implied,   4);  _(7b, ill, implied, 2);      \
+  _(7c, jmp, indabsx,   6);  _(7d, adc, absx,      4);  _(7e, ror, absx,      7);  _(7f, ill, implied, 2);      \
+  _(80, bra, relative,  2);  _(81, sta, indx,      6);  _(82, ill, zp,        2);  _(83, ill, implied, 2);      \
+  _(84, sty, zp,        2);  _(85, sta, zp,        2);  _(86, stx, zp,        2);  _(87, ill, implied, 2);      \
+  _(88, dey, implied,   2);  _(89, bim, immediate, 2);  _(8a, txa, implied,   2);  _(8b, ill, implied, 2);      \
+  _(8c, sty, abs,       4);  _(8d, sta, abs,       4);  _(8e, stx, abs,       4);  _(8f, ill, implied, 2);      \
+  _(90, bcc, relative,  2);  _(91, sta, indy,      6);  _(92, sta, indzp,     3);  _(93, ill, implied, 2);      \
+  _(94, sty, zpx,       4);  _(95, sta, zpx,       4);  _(96, stx, zpy,       4);  _(97, ill, implied, 2);      \
+  _(98, tya, implied,   2);  _(99, sta, absy,      5);  _(9a, txs, implied,   2);  _(9b, ill, implied, 2);      \
+  _(9c, stz, abs,       4);  _(9d, sta, absx,      5);  _(9e, stz, absx,      5);  _(9f, ill, implied, 2);      \
+  _(a0, ldy, immediate, 3);  _(a1, lda, indx,      6);  _(a2, ldx, immediate, 3);  _(a3, ill, implied, 2);      \
+  _(a4, ldy, zp,        3);  _(a5, lda, zp,        3);  _(a6, ldx, zp,        3);  _(a7, ill, implied, 2);      \
+  _(a8, tay, implied,   2);  _(a9, lda, immediate, 3);  _(aa, tax, implied,   2);  _(ab, ill, implied, 2);      \
+  _(ac, ldy, abs,       4);  _(ad, lda, abs,       4);  _(ae, ldx, abs,       4);  _(af, ill, implied, 2);      \
+  _(b0, bcs, relative,  2);  _(b1, lda, indy,      5);  _(b2, lda, indzp,     3);  _(b3, ill, implied, 2);      \
+  _(b4, ldy, zpx,       4);  _(b5, lda, zpx,       4);  _(b6, ldx, zpy,       4);  _(b7, ill, implied, 2);      \
+  _(b8, clv, implied,   2);  _(b9, lda, absy,      4);  _(ba, tsx, implied,   2);  _(bb, ill, implied, 2);      \
+  _(bc, ldy, absx,      4);  _(bd, lda, absx,      4);  _(be, ldx, absy,      4);  _(bf, ill, implied, 2);      \
+  _(c0, cpy, immediate, 3);  _(c1, cmp, indx,      6);  _(c2, ill, zp,        2);  _(c3, ill, implied, 2);      \
+  _(c4, cpy, zp,        3);  _(c5, cmp, zp,        3);  _(c6, dec, zp,        5);  _(c7, ill, implied, 2);      \
+  _(c8, iny, implied,   2);  _(c9, cmp, immediate, 3);  _(ca, dex, implied,   2);  _(cb, ill, implied, 2);      \
+  _(cc, cpy, abs,       4);  _(cd, cmp, abs,       4);  _(ce, dec, abs,       6);  _(cf, ill, implied, 2);      \
+  _(d0, bne, relative,  2);  _(d1, cmp, indy,      5);  _(d2, cmp, indzp,     3);  _(d3, ill, implied, 2);      \
+  _(d4, ill, zp,        4);  _(d5, cmp, zpx,       4);  _(d6, dec, zpx,       6);  _(d7, ill, implied, 2);      \
+  _(d8, cld, implied,   2);  _(d9, cmp, absy,      4);  _(da, phx, implied,   3);  _(db, ill, implied, 2);      \
+  _(dc, ill, abs,       4);  _(dd, cmp, absx,      4);  _(de, dec, absx,      7);  _(df, ill, implied, 2);      \
+  _(e0, cpx, immediate, 3);  _(e1, sbc, indx,      6);  _(e2, ill, zp,        2);  _(e3, ill, implied, 2);      \
+  _(e4, cpx, zp,        3);  _(e5, sbc, zp,        3);  _(e6, inc, zp,        5);  _(e7, ill, implied, 2);      \
+  _(e8, inx, implied,   2);  _(e9, sbc, immediate, 3);  _(ea, nop, implied,   2);  _(eb, ill, implied, 2);      \
+  _(ec, cpx, abs,       4);  _(ed, sbc, abs,       4);  _(ee, inc, abs,       6);  _(ef, ill, implied, 2);      \
+  _(f0, beq, relative,  2);  _(f1, sbc, indy,      5);  _(f2, sbc, indzp,     3);  _(f3, ill, implied, 2);      \
+  _(f4, ill, zp,        4);  _(f5, sbc, zpx,       4);  _(f6, inc, zpx,       6);  _(f7, ill, implied, 2);      \
+  _(f8, sed, implied,   2);  _(f9, sbc, absy,      4);  _(fa, plx, implied,   4);  _(fb, ill, implied, 2);      \
+  _(fc, ill, abs,       4);  _(fd, sbc, absx,      4);  _(fe, inc, absx,      7);  _(ff, ill, implied, 2);
+
+/* Maskable interrupt: push PC and P, then continue at the IRQ vector.  Ignored while flagI is set. */
+void M6502_irq(M6502 *mpu)
+{
+  M6502_Registers *regs=  mpu->registers;
+  byte            *stack= mpu->memory + 0x0100;
+
+  if (regs->p & flagI)	/* interrupts disabled: drop the request */
+    return;
+  stack[regs->s--]= (byte)(regs->pc >> 8);
+  stack[regs->s--]= (byte)(regs->pc & 0xff);
+  stack[regs->s--]= regs->p;	/* P is pushed before B is cleared below */
+  regs->p=  (regs->p & ~flagB) | flagI;
+  regs->pc= M6502_getVector(mpu, IRQ);
+}
+
+/* Non-maskable interrupt: push PC and P unconditionally, then continue at the NMI vector. */
+void M6502_nmi(M6502 *mpu)
+{
+  M6502_Registers *regs= mpu->registers;
+  mpu->memory[0x0100 + regs->s--]= (byte)(regs->pc >> 8);
+  mpu->memory[0x0100 + regs->s--]= (byte)(regs->pc & 0xff);
+  mpu->memory[0x0100 + regs->s--]= regs->p;	/* P is pushed before B is cleared below */
+  regs->p=  (regs->p & ~flagB) | flagI;
+  regs->pc= M6502_getVector(mpu, NMI);
+}
+
+/* Reset: clear flagD, set flagI and load PC from the RST vector; other registers are left alone. */
+void M6502_reset(M6502 *mpu)
+{
+  M6502_Registers *regs= mpu->registers;
+  regs->p=  (regs->p & ~flagD) | flagI;
+  regs->pc= M6502_getVector(mpu, RST);
+}
+
+
+/* the compiler should eliminate all calls to this function (it follows each instruction's final next()) */
+
+static void oops(void)
+{
+  fputs("\noops -- instruction dispatch missing\n", stderr);
+}
+
+/* Interpret from mpu->registers->pc until a control-transfer macro clears keep_running; instructions_left drops by one per instruction. */
+void M6502_run_interpreted(M6502 *mpu, int instructions_left)
+{
+  int keep_running= 1;
+
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+  /* GNU C: dispatch with computed goto through a table holding one label address per opcode. */
+  static void *itab[256]= { &&_00, &&_01, &&_02, &&_03, &&_04, &&_05, &&_06, &&_07, &&_08, &&_09, &&_0a, &&_0b, &&_0c, &&_0d, &&_0e, &&_0f,
+			    &&_10, &&_11, &&_12, &&_13, &&_14, &&_15, &&_16, &&_17, &&_18, &&_19, &&_1a, &&_1b, &&_1c, &&_1d, &&_1e, &&_1f,
+			    &&_20, &&_21, &&_22, &&_23, &&_24, &&_25, &&_26, &&_27, &&_28, &&_29, &&_2a, &&_2b, &&_2c, &&_2d, &&_2e, &&_2f,
+			    &&_30, &&_31, &&_32, &&_33, &&_34, &&_35, &&_36, &&_37, &&_38, &&_39, &&_3a, &&_3b, &&_3c, &&_3d, &&_3e, &&_3f,
+			    &&_40, &&_41, &&_42, &&_43, &&_44, &&_45, &&_46, &&_47, &&_48, &&_49, &&_4a, &&_4b, &&_4c, &&_4d, &&_4e, &&_4f,
+			    &&_50, &&_51, &&_52, &&_53, &&_54, &&_55, &&_56, &&_57, &&_58, &&_59, &&_5a, &&_5b, &&_5c, &&_5d, &&_5e, &&_5f,
+			    &&_60, &&_61, &&_62, &&_63, &&_64, &&_65, &&_66, &&_67, &&_68, &&_69, &&_6a, &&_6b, &&_6c, &&_6d, &&_6e, &&_6f,
+			    &&_70, &&_71, &&_72, &&_73, &&_74, &&_75, &&_76, &&_77, &&_78, &&_79, &&_7a, &&_7b, &&_7c, &&_7d, &&_7e, &&_7f,
+			    &&_80, &&_81, &&_82, &&_83, &&_84, &&_85, &&_86, &&_87, &&_88, &&_89, &&_8a, &&_8b, &&_8c, &&_8d, &&_8e, &&_8f,
+			    &&_90, &&_91, &&_92, &&_93, &&_94, &&_95, &&_96, &&_97, &&_98, &&_99, &&_9a, &&_9b, &&_9c, &&_9d, &&_9e, &&_9f,
+			    &&_a0, &&_a1, &&_a2, &&_a3, &&_a4, &&_a5, &&_a6, &&_a7, &&_a8, &&_a9, &&_aa, &&_ab, &&_ac, &&_ad, &&_ae, &&_af,
+			    &&_b0, &&_b1, &&_b2, &&_b3, &&_b4, &&_b5, &&_b6, &&_b7, &&_b8, &&_b9, &&_ba, &&_bb, &&_bc, &&_bd, &&_be, &&_bf,
+			    &&_c0, &&_c1, &&_c2, &&_c3, &&_c4, &&_c5, &&_c6, &&_c7, &&_c8, &&_c9, &&_ca, &&_cb, &&_cc, &&_cd, &&_ce, &&_cf,
+			    &&_d0, &&_d1, &&_d2, &&_d3, &&_d4, &&_d5, &&_d6, &&_d7, &&_d8, &&_d9, &&_da, &&_db, &&_dc, &&_dd, &&_de, &&_df,
+			    &&_e0, &&_e1, &&_e2, &&_e3, &&_e4, &&_e5, &&_e6, &&_e7, &&_e8, &&_e9, &&_ea, &&_eb, &&_ec, &&_ed, &&_ee, &&_ef,
+			    &&_f0, &&_f1, &&_f2, &&_f3, &&_f4, &&_f5, &&_f6, &&_f7, &&_f8, &&_f9, &&_fa, &&_fb, &&_fc, &&_fd, &&_fe, &&_ff };
+
+  register void **itabp= &itab[0];
+  register void  *tpc;	/* label of the next instruction, set by fetch() */
+
+# define begin()				++instructions_left;  fetch();  next()
+# define fetch()				tpc= itabp[memory[PC++]]
+# define next()					--instructions_left;  if (keep_running) goto *tpc; else goto done
+# define dispatch(num, name, mode, cycles)	_##num: name(cycles, mode) oops();  next()
+# define end()					done: --PC	/* undo the PC++ of the last, unexecuted fetch() */
+
+#else /* (!__GNUC__) || (__STRICT_ANSI__) */
+  /* Portable fallback: a for loop around one switch; fetch() is empty and next() is a plain break. */
+# define begin()				for (;keep_running;--instructions_left) switch (memory[PC++]) {
+# define fetch()
+# define next()					break
+# define dispatch(num, name, mode, cycles)	case 0x##num: name(cycles, mode);  next()
+# define end()					}
+
+#endif
+
+  register byte  *memory= mpu->memory;
+  register word   PC;
+  word		  ea;	/* written by the addressing-mode macros */
+  byte		  A, X, Y, P, S;
+  M6502_Callback *readCallback=  mpu->callbacks->read;
+  M6502_Callback *writeCallback= mpu->callbacks->write;
+  /* internalise() loads the local register copies from mpu->registers; externalise() stores them back. */
+# define internalise()	A= mpu->registers->a;  X= mpu->registers->x;  Y= mpu->registers->y;  P= mpu->registers->p;  S= mpu->registers->s;  PC= mpu->registers->pc
+# define externalise()	mpu->registers->a= A;  mpu->registers->x= X;  mpu->registers->y= Y;  mpu->registers->p= P;  mpu->registers->s= S;  mpu->registers->pc= PC
+
+  internalise();
+
+  begin();
+  do_insns(dispatch);	/* expands to the 256 instruction bodies */
+  end();
+
+  externalise();
+
+# undef begin
+# undef internalise
+# undef externalise
+# undef fetch
+# undef next
+# undef dispatch
+# undef end
+}
+
+/* Write "<mnemonic> <operand>" for the instruction at ip into buffer; return its length in bytes. */
+int M6502_disassemble(M6502 *mpu, word ip, char buffer[64])
+{
+  char *s= buffer;
+  byte  b[3];	/* opcode plus up to two operand bytes, fetched modulo 64K so ip near 0xffff never reads past memory */
+  b[0]= mpu->memory[ip];  b[1]= mpu->memory[(word)(ip + 1)];  b[2]= mpu->memory[(word)(ip + 2)];
+
+  switch (b[0])
+    {
+#    define _implied							    return 1;
+#    define _immediate	sprintf(s, "#%02X",	   b[1]);		    return 2;
+#    define _zp		sprintf(s, "%02X",	   b[1]);		    return 2;
+#    define _zpx	sprintf(s, "%02X,X",	   b[1]);		    return 2;
+#    define _zpy	sprintf(s, "%02X,Y",	   b[1]);		    return 2;
+#    define _abs	sprintf(s, "%02X%02X",	   b[2], b[1]);		    return 3;
+#    define _absx	sprintf(s, "%02X%02X,X",   b[2], b[1]);		    return 3;
+#    define _absy	sprintf(s, "%02X%02X,Y",   b[2], b[1]);		    return 3;
+#    define _relative	sprintf(s, "%04X",	   (ip + 2 + (int8_t)b[1]) & 0xffff);  return 2;	/* target wraps to 16 bits */
+#    define _indirect	sprintf(s, "(%02X%02X)",   b[2], b[1]);		    return 3;
+#    define _indzp	sprintf(s, "(%02X)",	   b[1]);		    return 2;
+#    define _indx	sprintf(s, "(%02X,X)",	   b[1]);		    return 2;
+#    define _indy	sprintf(s, "(%02X),Y",	   b[1]);		    return 2;
+#    define _indabsx	sprintf(s, "(%02X%02X,X)", b[2], b[1]);		    return 3;
+
+#    define disassemble(num, name, mode, cycles) case 0x##num: s += sprintf(s, "%s ", #name); _##mode
+      do_insns(disassemble);
+#    undef disassemble
+    }
+  return 0;	/* not reached: do_insns supplies a case for every opcode */
+}
+
+/* Format PC, SP (as 0x0100 + S), A, X, Y, P and the eight flag letters ('-' when clear) into buffer. */
+void M6502_dump(M6502 *mpu, char buffer[64])
+{
+  M6502_Registers *regs= mpu->registers;
+  uint8_t status= regs->p;
+# define DUMP_FLAG(BIT, CH) (((status >> (BIT)) & 1) ? (CH) : '-')
+  sprintf(buffer, "PC=%04X SP=%04X A=%02X X=%02X Y=%02X P=%02X %c%c%c%c%c%c%c%c",
+	  regs->pc, 0x0100 + regs->s,
+	  regs->a, regs->x, regs->y, status,
+	  DUMP_FLAG(7,'N'), DUMP_FLAG(6,'V'), DUMP_FLAG(5,'?'), DUMP_FLAG(4,'B'), DUMP_FLAG(3,'D'), DUMP_FLAG(2,'I'), DUMP_FLAG(1,'Z'), DUMP_FLAG(0,'C'));
+# undef DUMP_FLAG
+}
diff --git a/lib6502.h b/lib6502.h
new file mode 100644
index 0000000..41fc9f2
--- /dev/null
+++ b/lib6502.h
@@ -0,0 +1,120 @@
+/* lib6502.h -- MOS Technology 6502 emulator	-*- C -*- */
+
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef __m6502_h
+#define __m6502_h
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+  extern "C"
+{
+#endif
+
+typedef struct _M6502		M6502;
+typedef struct _M6502_Registers	M6502_Registers;
+typedef struct _M6502_Callbacks	M6502_Callbacks;
+typedef struct _M6502_Internal  M6502_Internal;	/* opaque to clients */
+
+typedef int   (*M6502_Callback)(M6502 *mpu, uint16_t address, uint8_t data);	/* signature shared by every callback table */
+
+typedef M6502_Callback	M6502_CallbackTable[0x10000];	/* one slot per 16-bit address */
+typedef M6502_Callback	M6502_IllegalInstructionCallbackTable[0x100];	/* one slot per opcode byte */
+typedef uint8_t		M6502_Memory[0x10000];	/* the full 64K address space */
+/* Addresses of the three vectors; each is stored LSB first (see M6502_getVector). */
+enum {
+  M6502_NMIVector= 0xfffa,  M6502_NMIVectorLSB= 0xfffa,  M6502_NMIVectorMSB= 0xfffb,
+  M6502_RSTVector= 0xfffc,  M6502_RSTVectorLSB= 0xfffc,  M6502_RSTVectorMSB= 0xfffd,
+  M6502_IRQVector= 0xfffe,  M6502_IRQVectorLSB= 0xfffe,  M6502_IRQVectorMSB= 0xffff
+};
+/* The programmer-visible register set. */
+struct _M6502_Registers
+{
+  uint8_t   a;	/* accumulator */
+  uint8_t   x;	/* X index register */
+  uint8_t   y;	/* Y index register */
+  uint8_t   p;	/* processor status register */
+  uint8_t   s;	/* stack pointer */
+  uint16_t pc;	/* program counter */
+};
+/* Callback tables; access them through M6502_getCallback / M6502_setCallback. */
+struct _M6502_Callbacks
+{
+  M6502_CallbackTable read;	/* indexed by address */
+  M6502_CallbackTable write;	/* indexed by address */
+  M6502_CallbackTable call;	/* indexed by address */
+  M6502_IllegalInstructionCallbackTable illegal_instruction;	/* indexed by opcode byte */
+};
+
+struct _M6502_Internal;
+/* One emulated processor: its registers, memory and callbacks. */
+struct _M6502
+{
+  M6502_Registers *registers;
+  uint8_t	  *memory;
+  M6502_Callbacks *callbacks;
+  unsigned int	   flags;	/* presumably the M6502_*Allocated bits -- TODO confirm */
+
+  /* The following is implementation-specific; client code should only use the
+   * above members.
+   */
+  M6502_Internal  *internal;
+};
+/* Single-bit flags; presumably stored in M6502.flags to record what was allocated on the client's behalf -- TODO confirm. */
+enum {
+  M6502_RegistersAllocated = 1 << 0,
+  M6502_MemoryAllocated    = 1 << 1,
+  M6502_CallbacksAllocated = 1 << 2
+};
+/* Execution mode passed to M6502_setMode(); what each mode does is decided by the implementation, not this header. */
+typedef enum {
+  M6502_ModeInterpreted,
+  M6502_ModeCompiled,
+  M6502_ModeHybrid
+} M6502_Mode;
+/* Public entry points. */
+extern M6502 *M6502_new(M6502_Registers *registers, M6502_Memory memory, M6502_Callbacks *callbacks);
+extern void   M6502_reset(M6502 *mpu);	/* clears D, sets I, loads PC from the RST vector */
+extern void   M6502_nmi(M6502 *mpu);	/* pushes PC and P, then loads PC from the NMI vector */
+extern void   M6502_irq(M6502 *mpu);	/* like M6502_nmi via the IRQ vector, but ignored while I is set */
+extern void   M6502_run(M6502 *mpu);
+extern int    M6502_disassemble(M6502 *mpu, uint16_t addr, char buffer[64]);	/* returns the instruction length in bytes */
+extern void   M6502_dump(M6502 *mpu, char buffer[64]);	/* formats registers and flags as text */
+extern void   M6502_delete(M6502 *mpu);
+extern void   M6502_setMode(M6502 *mpu, M6502_Mode mode, int arg);
+/* Read / write the 16-bit vector VEC (NMI, RST or IRQ) directly in MPU->memory, LSB at the lower address. */
+#define M6502_getVector(MPU, VEC)			\
+  ( ( ((MPU)->memory[M6502_##VEC##VectorLSB]) )		\
+    | ((MPU)->memory[M6502_##VEC##VectorMSB] << 8) )
+
+#define M6502_setVector(MPU, VEC, ADDR)						\
+  ( ( ((MPU)->memory[M6502_##VEC##VectorLSB]= ((uint8_t)(ADDR)) & 0xff) )	\
+    , ((MPU)->memory[M6502_##VEC##VectorMSB]= (uint8_t)((ADDR) >> 8)) )
+/* TYPE is a member of M6502_Callbacks (read, write, call or illegal_instruction); ADDR indexes that table. */
+#define M6502_getCallback(MPU, TYPE, ADDR)	((MPU)->callbacks->TYPE[ADDR])
+#define M6502_setCallback(MPU, TYPE, ADDR, FN)	((MPU)->callbacks->TYPE[ADDR]= (FN))
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __m6502_h */
diff --git a/m4/boost.m4 b/m4/boost.m4
new file mode 100644
index 0000000..0a46b0e
--- /dev/null
+++ b/m4/boost.m4
@@ -0,0 +1,1338 @@
+# boost.m4: Locate Boost headers and libraries for autoconf-based projects.
+# Copyright (C) 2007-2011, 2014  Benoit Sigoure <tsuna@lrde.epita.fr>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Additional permission under section 7 of the GNU General Public
+# License, version 3 ("GPLv3"):
+#
+# If you convey this file as part of a work that contains a
+# configuration script generated by Autoconf, you may do so under
+# terms of your choice.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+m4_define([_BOOST_SERIAL], [m4_translit([
+# serial 22
+], [#
+], [])])
+
+# Original sources can be found at http://github.com/tsuna/boost.m4
+# You can fetch the latest version of the script by doing:
+#   wget http://github.com/tsuna/boost.m4/raw/master/build-aux/boost.m4
+
+# ------ #
+# README #
+# ------ #
+
+# This file provides several macros to use the various Boost libraries.
+# The first macro is BOOST_REQUIRE.  It will simply check if it's possible to
+# find the Boost headers of a given (optional) minimum version and it will
+# define BOOST_CPPFLAGS accordingly.  It will add an option --with-boost to
+# your configure so that users can specify non standard locations.
+# If the user's environment contains BOOST_ROOT and --with-boost was not
+# specified, --with-boost=$BOOST_ROOT is implicitly used.
+# For more README and documentation, go to http://github.com/tsuna/boost.m4
+# Note: THESE MACROS ASSUME THAT YOU USE LIBTOOL.  If you don't, don't worry,
+# simply read the README, it will show you what to do step by step.
+
+m4_pattern_forbid([^_?(BOOST|Boost)_])
+
+
+# _BOOST_SED_CPP(SED-PROGRAM, PROGRAM,
+#                [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+# --------------------------------------------------------
+# Same as AC_EGREP_CPP, but leave the result in conftest.i.
+#
+# SED-PROGRAM is *not* overquoted, as in AC_EGREP_CPP.  It is expanded
+# in double-quotes, so escape your double quotes.
+#
+# It could be useful to turn this into a macro which extracts the
+# value of any macro.
+m4_define([_BOOST_SED_CPP],
+[AC_LANG_PUSH([C++])dnl
+AC_LANG_PREPROC_REQUIRE()dnl
+AC_REQUIRE([AC_PROG_SED])dnl
+AC_LANG_CONFTEST([AC_LANG_SOURCE([[$2]])])
+AS_IF([dnl eval is necessary to expand ac_cpp.
+dnl Ultrix and Pyramid sh refuse to redirect output of eval, so use subshell.
+dnl Beware of Windows end-of-lines, for instance if we are running
+dnl some Windows programs under Wine.  In that case, boost/version.hpp
+dnl is certainly using "\r\n", but the regular Unix shell will only
+dnl strip `\n' with backquotes, not the `\r'.  This results in
+dnl boost_cv_lib_version='1_37\r' for instance, which breaks
+dnl everything else.
+dnl Cannot use 'dnl' after [$4] because a trailing dnl may break AC_CACHE_CHECK
+(eval "$ac_cpp conftest.$ac_ext") 2>&AS_MESSAGE_LOG_FD |
+  tr -d '\r' |
+  $SED -n -e "$1" >conftest.i 2>&1],
+  [$3],
+  [$4])
+rm -rf conftest*
+AC_LANG_POP([C++])dnl
+])# _BOOST_SED_CPP
+
+
+
+# BOOST_REQUIRE([VERSION], [ACTION-IF-NOT-FOUND])
+# -----------------------------------------------
+# Look for Boost.  If version is given, it must either be a literal of the form
+# "X.Y.Z" where X, Y and Z are integers (the ".Z" part being optional) or a
+# variable "$var".
+# Defines the value BOOST_CPPFLAGS.  This macro only checks for headers with
+# the required version, it does not check for any of the Boost libraries.
+# On success, defines HAVE_BOOST.  On failure, calls the optional
+# ACTION-IF-NOT-FOUND action if one was supplied.
+# Otherwise aborts with an error message.
+AC_DEFUN([BOOST_REQUIRE],
+[AC_REQUIRE([AC_PROG_CXX])dnl
+AC_REQUIRE([AC_PROG_GREP])dnl
+echo "$as_me: this is boost.m4[]_BOOST_SERIAL" >&AS_MESSAGE_LOG_FD
+boost_save_IFS=$IFS
+boost_version_req=$1
+IFS=.
+set x $boost_version_req 0 0 0
+IFS=$boost_save_IFS
+shift
+boost_version_req=`expr "$[1]" '*' 100000 + "$[2]" '*' 100 + "$[3]"`
+boost_version_req_string=$[1].$[2].$[3]
+AC_ARG_WITH([boost],
+   [AS_HELP_STRING([--with-boost=DIR],
+                   [prefix of Boost $1 @<:@guess@:>@])])dnl
+AC_ARG_VAR([BOOST_ROOT],[Location of Boost installation])dnl
+# If BOOST_ROOT is set and the user has not provided a value to
+# --with-boost, then treat BOOST_ROOT as if the user supplied it.
+if test x"$BOOST_ROOT" != x; then
+  if test x"$with_boost" = x; then
+    AC_MSG_NOTICE([Detected BOOST_ROOT; continuing with --with-boost=$BOOST_ROOT])
+    with_boost=$BOOST_ROOT
+  else
+    AC_MSG_NOTICE([Detected BOOST_ROOT=$BOOST_ROOT, but overridden by --with-boost=$with_boost])
+  fi
+fi
+AC_SUBST([DISTCHECK_CONFIGURE_FLAGS],
+         ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"])dnl
+boost_save_CPPFLAGS=$CPPFLAGS
+  AC_CACHE_CHECK([for Boost headers version >= $boost_version_req_string],
+    [boost_cv_inc_path],
+    [boost_cv_inc_path=no
+AC_LANG_PUSH([C++])dnl
+m4_pattern_allow([^BOOST_VERSION$])dnl
+    AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <boost/version.hpp>
+#if !defined BOOST_VERSION
+# error BOOST_VERSION is not defined
+#elif BOOST_VERSION < $boost_version_req
+# error Boost headers version < $boost_version_req
+#endif
+]])])
+    # If the user provided a value to --with-boost, use it and only it.
+    case $with_boost in #(
+      ''|yes) set x '' /opt/local/include /usr/local/include /opt/include \
+                 /usr/include C:/Boost/include;; #(
+      *)      set x "$with_boost/include" "$with_boost";;
+    esac
+    shift
+    for boost_dir
+    do
+    # Without --layout=system, Boost (or at least some versions) installs
+    # itself in <prefix>/include/boost-<version>.  This inner loop helps to
+    # find headers in such directories.
+    #
+    # Any ${boost_dir}/boost-x_xx directories are searched in reverse version
+    # order followed by ${boost_dir}.  The final '.' is a sentinel for
+    # searching $boost_dir itself.  Entries are whitespace separated.
+    #
+    # I didn't indent this loop on purpose (to avoid over-indented code)
+    boost_layout_system_search_list=`cd "$boost_dir" 2>/dev/null \
+        && ls -1 | "${GREP}" '^boost-' | sort -rn -t- -k2 \
+        && echo .`
+    for boost_inc in $boost_layout_system_search_list
+    do
+      if test x"$boost_inc" != x.; then
+        boost_inc="$boost_dir/$boost_inc"
+      else
+        boost_inc="$boost_dir" # Uses sentinel in boost_layout_system_search_list
+      fi
+      if test x"$boost_inc" != x; then
+        # We are going to check whether the version of Boost installed
+        # in $boost_inc is usable by running a compilation that
+        # #includes it.  But if we pass a -I/some/path in which Boost
+        # is not installed, the compiler will just skip this -I and
+        # use other locations (either from CPPFLAGS, or from its list
+        # of system include directories).  As a result we would use
+        # header installed on the machine instead of the /some/path
+        # specified by the user.  So in that precise case (trying
+        # $boost_inc), make sure the version.hpp exists.
+        #
+        # Use test -e as there can be symlinks.
+        test -e "$boost_inc/boost/version.hpp" || continue
+        CPPFLAGS="$CPPFLAGS -I$boost_inc"
+      fi
+      AC_COMPILE_IFELSE([], [boost_cv_inc_path=yes], [boost_cv_version=no])
+      if test x"$boost_cv_inc_path" = xyes; then
+        if test x"$boost_inc" != x; then
+          boost_cv_inc_path=$boost_inc
+        fi
+        break 2
+      fi
+    done
+    done
+AC_LANG_POP([C++])dnl
+    ])
+    case $boost_cv_inc_path in #(
+      no)
+        boost_errmsg="cannot find Boost headers version >= $boost_version_req_string"
+        m4_if([$2], [],  [AC_MSG_ERROR([$boost_errmsg])],
+                        [AC_MSG_NOTICE([$boost_errmsg])])
+        $2
+        ;;#(
+      yes)
+        BOOST_CPPFLAGS=
+        ;;#(
+      *)
+        AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"])dnl
+        ;;
+    esac
+  if test x"$boost_cv_inc_path" != xno; then
+  AC_DEFINE([HAVE_BOOST], [1],
+            [Defined if the requested minimum BOOST version is satisfied])
+  AC_CACHE_CHECK([for Boost's header version],
+    [boost_cv_lib_version],
+    [m4_pattern_allow([^BOOST_LIB_VERSION$])dnl
+     _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;q;}],
+                    [#include <boost/version.hpp>
+boost-lib-version = BOOST_LIB_VERSION],
+    [boost_cv_lib_version=`cat conftest.i`])])
+    # e.g. "134" for 1_34_1 or "135" for 1_35
+    boost_major_version=`echo "$boost_cv_lib_version" | sed 's/_//;s/_.*//'`
+    case $boost_major_version in #(
+      '' | *[[!0-9]]*)
+        AC_MSG_ERROR([invalid value: boost_major_version=$boost_major_version])
+        ;;
+    esac
+fi
+CPPFLAGS=$boost_save_CPPFLAGS
+])# BOOST_REQUIRE
+
+
+# BOOST_STATIC()
+# --------------
+# Add the "--enable-static-boost" configure argument. If this argument is given
+# on the command line, static versions of the libraries will be looked up.
+AC_DEFUN([BOOST_STATIC],
+  [AC_ARG_ENABLE([static-boost],
+     [AS_HELP_STRING([--enable-static-boost],
+               [Prefer the static boost libraries over the shared ones [no]])],
+     [enable_static_boost=yes],
+     [enable_static_boost=no])])# BOOST_STATIC
+
+
+# BOOST_FIND_HEADER([HEADER-NAME], [ACTION-IF-NOT-FOUND], [ACTION-IF-FOUND])
+# --------------------------------------------------------------------------
+# Wrapper around AC_CHECK_HEADER for Boost headers.  Useful to check for
+# some parts of the Boost library which are only made of headers and don't
+# require linking (such as Boost.Foreach).
+#
+# Default ACTION-IF-NOT-FOUND: Fail with a fatal error unless Boost couldn't be
+# found in the first place, in which case by default a notice is issued to the
+# user.  Presumably if we haven't died already it's because it's OK to not have
+# Boost, which is why only a notice is issued instead of a hard error.
+#
+# Default ACTION-IF-FOUND: define the preprocessor symbol HAVE_<HEADER-NAME> in
+# case of success (where HEADER-NAME is written LIKE_THIS, e.g.,
+# HAVE_BOOST_FOREACH_HPP).
+AC_DEFUN([BOOST_FIND_HEADER],
+[AC_REQUIRE([BOOST_REQUIRE])dnl
+if test x"$boost_cv_inc_path" = xno; then
+  m4_default([$2], [AC_MSG_NOTICE([Boost not available, not searching for $1])])
+else
+AC_LANG_PUSH([C++])dnl
+boost_save_CPPFLAGS=$CPPFLAGS
+CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
+AC_CHECK_HEADER([$1],
+  [m4_default([$3], [AC_DEFINE(AS_TR_CPP([HAVE_$1]), [1],
+                               [Define to 1 if you have <$1>])])],
+  [m4_default([$2], [AC_MSG_ERROR([cannot find $1])])])
+CPPFLAGS=$boost_save_CPPFLAGS
+AC_LANG_POP([C++])dnl
+fi
+])# BOOST_FIND_HEADER
+
+
+# BOOST_FIND_LIBS([COMPONENT-NAME], [CANDIDATE-LIB-NAMES],
+#                 [PREFERRED-RT-OPT], [HEADER-NAME], [CXX-TEST],
+#                 [CXX-PROLOGUE])
+# --------------------------------------------------------------
+# Look for the Boost library COMPONENT-NAME (e.g., `thread', for
+# libboost_thread) under the possible CANDIDATE-LIB-NAMES (e.g.,
+# "thread_win32 thread").  Check that HEADER-NAME works and check that
+# libboost_LIB-NAME can link with the code CXX-TEST.  The optional
+# argument CXX-PROLOGUE can be used to include some C++ code before
+# the `main' function.
+#
+# Invokes BOOST_FIND_HEADER([HEADER-NAME]) (see above).
+#
+# Boost libraries typically come compiled with several flavors (with different
+# runtime options) so PREFERRED-RT-OPT is the preferred suffix.  A suffix is one
+# or more of the following letters: sgdpn (in that order).  s = static
+# runtime, d = debug build, g = debug/diagnostic runtime, p = STLPort build,
+# n = (unsure) STLPort build without iostreams from STLPort (it looks like `n'
+# must always be used along with `p').  Additionally, PREFERRED-RT-OPT can
+# start with `mt-' to indicate that there is a preference for multi-thread
+# builds.  Some sample values for PREFERRED-RT-OPT: (nothing), mt, d, mt-d, gdp
+# ...  If you want to make sure you have a specific version of Boost
+# (eg, >= 1.33) you *must* invoke BOOST_REQUIRE before this macro.
+AC_DEFUN([BOOST_FIND_LIBS],
+[AC_REQUIRE([BOOST_REQUIRE])dnl
+AC_REQUIRE([_BOOST_FIND_COMPILER_TAG])dnl
+AC_REQUIRE([BOOST_STATIC])dnl
+AC_REQUIRE([_BOOST_GUESS_WHETHER_TO_USE_MT])dnl
+if test x"$boost_cv_inc_path" = xno; then
+  AC_MSG_NOTICE([Boost not available, not searching for the Boost $1 library])
+else
+dnl The else branch is huge and wasn't indented on purpose.
+AC_LANG_PUSH([C++])dnl
+AS_VAR_PUSHDEF([Boost_lib], [boost_cv_lib_$1])dnl
+AS_VAR_PUSHDEF([Boost_lib_LDFLAGS], [boost_cv_lib_$1_LDFLAGS])dnl
+AS_VAR_PUSHDEF([Boost_lib_LDPATH], [boost_cv_lib_$1_LDPATH])dnl
+AS_VAR_PUSHDEF([Boost_lib_LIBS], [boost_cv_lib_$1_LIBS])dnl
+BOOST_FIND_HEADER([$4])
+boost_save_CPPFLAGS=$CPPFLAGS
+CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
+AC_CACHE_CHECK([for the Boost $1 library], [Boost_lib],
+               [_BOOST_FIND_LIBS($@)])
+case $Boost_lib in #(
+  (no) _AC_MSG_LOG_CONFTEST
+    AC_MSG_ERROR([cannot find the flags to link with Boost $1])
+    ;;
+esac
+AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS])dnl
+AC_SUBST(AS_TR_CPP([BOOST_$1_LDPATH]), [$Boost_lib_LDPATH])dnl
+AC_SUBST([BOOST_LDPATH], [$Boost_lib_LDPATH])dnl
+AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS])dnl
+CPPFLAGS=$boost_save_CPPFLAGS
+AS_VAR_POPDEF([Boost_lib])dnl
+AS_VAR_POPDEF([Boost_lib_LDFLAGS])dnl
+AS_VAR_POPDEF([Boost_lib_LDPATH])dnl
+AS_VAR_POPDEF([Boost_lib_LIBS])dnl
+AC_LANG_POP([C++])dnl
+fi
+])
+
+
+# BOOST_FIND_LIB([LIB-NAME],
+#                [PREFERRED-RT-OPT], [HEADER-NAME], [CXX-TEST],
+#                [CXX-PROLOGUE])
+# --------------------------------------------------------------
+# Backward compatibility wrapper for BOOST_FIND_LIBS.
+AC_DEFUN([BOOST_FIND_LIB],
+[BOOST_FIND_LIBS([$1], $@)])
+
+
+# _BOOST_FIND_LIBS([LIB-NAME], [CANDIDATE-LIB-NAMES],
+#                 [PREFERRED-RT-OPT], [HEADER-NAME], [CXX-TEST],
+#                 [CXX-PROLOGUE])
+# --------------------------------------------------------------
+# Real implementation of BOOST_FIND_LIBS: rely on these local macros:
+# Boost_lib, Boost_lib_LDFLAGS, Boost_lib_LDPATH, Boost_lib_LIBS
+#
+# The algorithm is as follows: first look for a given library name
+# according to the user's PREFERRED-RT-OPT.  For each library name, we
+# prefer to use the ones that carry the tag (toolset name).  Each
+# library is searched through the various standard paths where Boost is
+# usually installed.  If we can't find the standard variants, we try
+# to enforce -mt (for instance on MacOSX, libboost_thread.dylib
+# doesn't exist but there's -obviously- libboost_thread-mt.dylib).
+AC_DEFUN([_BOOST_FIND_LIBS],
+[Boost_lib=no
+  case "$3" in #(
+    (mt | mt-) boost_mt=-mt; boost_rtopt=;; #(
+    (mt* | mt-*) boost_mt=-mt; boost_rtopt=`expr "X$3" : 'Xmt-*\(.*\)'`;; #(
+    (*) boost_mt=; boost_rtopt=$3;;
+  esac
+  if test $enable_static_boost = yes; then
+    boost_rtopt="s$boost_rtopt"
+  fi
+  # Find the proper debug variant depending on what we've been asked to find.
+  case $boost_rtopt in #(
+    (*d*) boost_rt_d=$boost_rtopt;; #(
+    (*[[sgpn]]*) # Insert the `d' at the right place (in between `sg' and `pn')
+      boost_rt_d=`echo "$boost_rtopt" | sed 's/\(s*g*\)\(p*n*\)/\1d\2/'`;; #(
+    (*) boost_rt_d='-d';;
+  esac
+  # If the PREFERRED-RT-OPT are not empty, prepend a `-'.
+  test -n "$boost_rtopt" && boost_rtopt="-$boost_rtopt"
+  $boost_guess_use_mt && boost_mt=-mt
+  # Look for the abs path of the static archive.
+  # $libext is computed by Libtool but let's make sure it's non empty.
+  test -z "$libext" &&
+    AC_MSG_ERROR([the libext variable is empty, did you invoke Libtool?])
+  boost_save_ac_objext=$ac_objext
+  # Generate the test file.
+  AC_LANG_CONFTEST([AC_LANG_PROGRAM([#include <$4>
+$6], [$5])])
+dnl Optimization hacks: compiling C++ is slow, especially with Boost.  What
+dnl we're trying to do here is guess the right combination of link flags
+dnl (LIBS / LDFLAGS) to use a given library.  This can take several
+dnl iterations before it succeeds and is thus *very* slow.  So what we do
+dnl instead is that we compile the code first (and thus get an object file,
+dnl typically conftest.o).  Then we try various combinations of link flags
+dnl until we succeed to link conftest.o in an executable.  The problem is
+dnl that the various TRY_LINK / COMPILE_IFELSE macros of Autoconf always
+dnl remove all the temporary files including conftest.o.  So the trick here
+dnl is to temporarily change the value of ac_objext so that conftest.o is
+dnl preserved across tests.  This is obviously fragile and I will burn in
+dnl hell for not respecting Autoconf's documented interfaces, but in the
+dnl mean time, it optimizes the macro by a factor of 5 to 30.
+dnl Another small optimization: the first argument of AC_COMPILE_IFELSE left
+dnl empty because the test file is generated only once above (before we
+dnl start the for loops).
+  AC_COMPILE_IFELSE([],
+    [ac_objext=do_not_rm_me_plz],
+    [AC_MSG_ERROR([cannot compile a test that uses Boost $1])])
+  ac_objext=$boost_save_ac_objext
+  boost_failed_libs=
+# Don't bother to indent the following nested for loops, only the 2
+# innermost ones matter.
+for boost_lib_ in $2; do
+for boost_tag_ in -$boost_cv_lib_tag ''; do
+for boost_ver_ in -$boost_cv_lib_version ''; do
+for boost_mt_ in $boost_mt -mt ''; do
+for boost_rtopt_ in $boost_rtopt '' -d; do
+  for boost_lib in \
+    boost_$boost_lib_$boost_tag_$boost_mt_$boost_rtopt_$boost_ver_ \
+    boost_$boost_lib_$boost_tag_$boost_rtopt_$boost_ver_ \
+    boost_$boost_lib_$boost_tag_$boost_mt_$boost_ver_ \
+    boost_$boost_lib_$boost_tag_$boost_ver_
+  do
+    # Avoid testing twice the same lib
+    case $boost_failed_libs in #(
+      (*@$boost_lib@*) continue;;
+    esac
+    # If with_boost is empty, we'll search in /lib first, which is not quite
+    # right so instead we'll try a location based on where the headers are.
+    boost_tmp_lib=$with_boost
+    test x"$with_boost" = x && boost_tmp_lib=${boost_cv_inc_path%/include}
+    for boost_ldpath in "$boost_tmp_lib/lib" '' \
+             /opt/local/lib* /usr/local/lib* /opt/lib* /usr/lib* \
+             "$with_boost" C:/Boost/lib /lib*
+    do
+      # Don't waste time with directories that don't exist.
+      if test x"$boost_ldpath" != x && test ! -e "$boost_ldpath"; then
+        continue
+      fi
+      boost_save_LDFLAGS=$LDFLAGS
+      # Are we looking for a static library?
+      case $boost_ldpath:$boost_rtopt_ in #(
+        (*?*:*s*) # Yes (Non empty boost_ldpath + s in rt opt)
+          Boost_lib_LIBS="$boost_ldpath/lib$boost_lib.$libext"
+          test -e "$Boost_lib_LIBS" || continue;; #(
+        (*) # No: use -lboost_foo to find the shared library.
+          Boost_lib_LIBS="-l$boost_lib";;
+      esac
+      boost_save_LIBS=$LIBS
+      LIBS="$Boost_lib_LIBS $LIBS"
+      test x"$boost_ldpath" != x && LDFLAGS="$LDFLAGS -L$boost_ldpath"
+dnl First argument of AC_LINK_IFELSE left empty because the test file is
+dnl generated only once above (before we start the for loops).
+      _BOOST_AC_LINK_IFELSE([],
+                            [Boost_lib=yes], [Boost_lib=no])
+      ac_objext=$boost_save_ac_objext
+      LDFLAGS=$boost_save_LDFLAGS
+      LIBS=$boost_save_LIBS
+      if test x"$Boost_lib" = xyes; then
+        # Check or use cached result of whether or not using -R or
+        # -rpath makes sense.  Some implementations of ld, such as for
+        # Mac OSX, require -rpath but -R is the flag known to work on
+        # other systems.  https://github.com/tsuna/boost.m4/issues/19
+        AC_CACHE_VAL([boost_cv_rpath_link_ldflag],
+          [case $boost_ldpath in
+           '') # Nothing to do.
+             boost_cv_rpath_link_ldflag=
+             boost_rpath_link_ldflag_found=yes;;
+           *)
+            for boost_cv_rpath_link_ldflag in -Wl,-R, -Wl,-rpath,; do
+              LDFLAGS="$boost_save_LDFLAGS -L$boost_ldpath $boost_cv_rpath_link_ldflag$boost_ldpath"
+              LIBS="$boost_save_LIBS $Boost_lib_LIBS"
+              _BOOST_AC_LINK_IFELSE([],
+                [boost_rpath_link_ldflag_found=yes
+                break],
+                [boost_rpath_link_ldflag_found=no])
+            done
+            ;;
+          esac
+          AS_IF([test "x$boost_rpath_link_ldflag_found" != "xyes"],
+            [AC_MSG_ERROR([Unable to determine whether to use -R or -rpath])])
+          LDFLAGS=$boost_save_LDFLAGS
+          LIBS=$boost_save_LIBS
+          ])
+        test x"$boost_ldpath" != x &&
+          Boost_lib_LDFLAGS="-L$boost_ldpath $boost_cv_rpath_link_ldflag$boost_ldpath"
+        Boost_lib_LDPATH="$boost_ldpath"
+        break 7
+      else
+        boost_failed_libs="$boost_failed_libs@$boost_lib@"
+      fi
+    done
+  done
+done
+done
+done
+done
+done # boost_lib_
+rm -f conftest.$ac_objext
+])
+
+
+
+# --------------------------------------- #
+# Checks for the various Boost libraries. #
+# --------------------------------------- #
+
+# List of boost libraries: http://www.boost.org/libs/libraries.htm
+# The page http://beta.boost.org/doc/libs is useful: it gives the first release
+# version of each library (among other things).
+
+# BOOST_DEFUN(LIBRARY, CODE)
+# --------------------------
+# Define BOOST_<LIBRARY-UPPERCASE> as a macro that runs CODE.
+#
+# Use indir to avoid the warning on underquoted macro name given to AC_DEFUN.
+m4_define([BOOST_DEFUN],
+[m4_indir([AC_DEFUN],
+          m4_toupper([BOOST_$1]),
+[m4_pushdef([BOOST_Library], [$1])dnl
+$2
+m4_popdef([BOOST_Library])dnl
+])
+])
+
+# BOOST_ARRAY()
+# -------------
+# Look for Boost.Array
+BOOST_DEFUN([Array],
+[BOOST_FIND_HEADER([boost/array.hpp])])
+
+
+# BOOST_ASIO()
+# ------------
+# Look for Boost.Asio (new in Boost 1.35).
+BOOST_DEFUN([Asio],
+[AC_REQUIRE([BOOST_SYSTEM])dnl
+BOOST_FIND_HEADER([boost/asio.hpp])])
+
+
+# BOOST_BIND()
+# ------------
+# Look for Boost.Bind.
+BOOST_DEFUN([Bind],
+[BOOST_FIND_HEADER([boost/bind.hpp])])
+
+
+# BOOST_CHRONO([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.Chrono.  For PREFERRED-RT-OPT, see BOOST_FIND_LIB above.
+BOOST_DEFUN([Chrono],
+[# Do we have to check for Boost.System?  This link-time dependency was
+# added as of 1.35.0.  If we have a version <1.35, we must not attempt to
+# find Boost.System as it didn't exist by then.
+if test $boost_major_version -ge 135; then
+  BOOST_SYSTEM([$1])
+fi # end of the Boost.System check.
+boost_chrono_save_LIBS=$LIBS
+boost_chrono_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+BOOST_FIND_LIB([chrono], [$1],
+                [boost/chrono.hpp],
+                [boost::chrono::thread_clock d;])
+if test $enable_static_boost = yes && test $boost_major_version -ge 135; then
+  BOOST_CHRONO_LIBS="$BOOST_CHRONO_LIBS $BOOST_SYSTEM_LIBS"
+fi
+LIBS=$boost_chrono_save_LIBS
+LDFLAGS=$boost_chrono_save_LDFLAGS
+])# BOOST_CHRONO
+
+
+# BOOST_CONVERSION()
+# ------------------
+# Look for Boost.Conversion (cast / lexical_cast)
+BOOST_DEFUN([Conversion],
+[BOOST_FIND_HEADER([boost/cast.hpp])
+BOOST_FIND_HEADER([boost/lexical_cast.hpp])
+])# BOOST_CONVERSION
+
+
+# BOOST_CRC()
+# -----------
+# Look for Boost.CRC
+BOOST_DEFUN([CRC],
+[BOOST_FIND_HEADER([boost/crc.hpp])
+])# BOOST_CRC
+
+
+# BOOST_DATE_TIME([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for Boost.Date_Time.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Date_Time],
+[BOOST_FIND_LIB([date_time], [$1],
+                [boost/date_time/posix_time/posix_time.hpp],
+                [boost::posix_time::ptime t;])
+])# BOOST_DATE_TIME
+
+
+# BOOST_FILESYSTEM([PREFERRED-RT-OPT])
+# ------------------------------------
+# Look for Boost.Filesystem.  For the documentation of PREFERRED-RT-OPT, see
+# the documentation of BOOST_FIND_LIB above.
+# Do not check for boost/filesystem.hpp because this file was introduced in
+# 1.34.
+BOOST_DEFUN([Filesystem],
+[# Do we have to check for Boost.System?  This link-time dependency was
+# added as of 1.35.0.  If we have a version <1.35, we must not attempt to
+# find Boost.System as it didn't exist by then.
+if test $boost_major_version -ge 135; then
+  BOOST_SYSTEM([$1])
+fi # end of the Boost.System check.
+boost_filesystem_save_LIBS=$LIBS
+boost_filesystem_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+BOOST_FIND_LIB([filesystem], [$1],
+                [boost/filesystem/path.hpp], [boost::filesystem::path p;])
+if test $enable_static_boost = yes && test $boost_major_version -ge 135; then
+  BOOST_FILESYSTEM_LIBS="$BOOST_FILESYSTEM_LIBS $BOOST_SYSTEM_LIBS"
+fi
+LIBS=$boost_filesystem_save_LIBS
+LDFLAGS=$boost_filesystem_save_LDFLAGS
+])# BOOST_FILESYSTEM
+
+
+# BOOST_FLYWEIGHT()
+# -----------------
+# Look for Boost.Flyweight.
+BOOST_DEFUN([Flyweight],
+[dnl There's a hidden dependency on pthreads.
+AC_REQUIRE([_BOOST_PTHREAD_FLAG])dnl
+BOOST_FIND_HEADER([boost/flyweight.hpp])
+AC_SUBST([BOOST_FLYWEIGHT_LIBS], [$boost_cv_pthread_flag])
+])
+
+
+# BOOST_FOREACH()
+# ---------------
+# Look for Boost.Foreach.
+BOOST_DEFUN([Foreach],
+[BOOST_FIND_HEADER([boost/foreach.hpp])])
+
+
+# BOOST_FORMAT()
+# --------------
+# Look for Boost.Format.
+# Note: we can't check for boost/format/format_fwd.hpp because the header isn't
+# standalone.  It can't be compiled because it triggers the following error:
+# boost/format/detail/config_macros.hpp:88: error: 'locale' in namespace 'std'
+#                                                  does not name a type
+BOOST_DEFUN([Format],
+[BOOST_FIND_HEADER([boost/format.hpp])])
+
+
+# BOOST_FUNCTION()
+# ----------------
+# Look for Boost.Function
+BOOST_DEFUN([Function],
+[BOOST_FIND_HEADER([boost/function.hpp])])
+
+
+# BOOST_GEOMETRY()
+# ----------------
+# Look for Boost.Geometry (new since 1.47.0).
+BOOST_DEFUN([Geometry],
+[BOOST_FIND_HEADER([boost/geometry.hpp])
+])# BOOST_GEOMETRY
+
+
+# BOOST_GRAPH([PREFERRED-RT-OPT])
+# -------------------------------
+# Look for Boost.Graph.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Graph],
+[BOOST_FIND_LIB([graph], [$1],
+                [boost/graph/adjacency_list.hpp], [boost::adjacency_list<> g;])
+])# BOOST_GRAPH
+
+
+# BOOST_IOSTREAMS([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for Boost.IOStreams.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([IOStreams],
+[BOOST_FIND_LIB([iostreams], [$1],
+                [boost/iostreams/device/file_descriptor.hpp],
+                [boost::iostreams::file_descriptor fd; fd.close();])
+])# BOOST_IOSTREAMS
+
+
+# BOOST_HASH()
+# ------------
+# Look for Boost.Functional/Hash
+BOOST_DEFUN([Hash],
+[BOOST_FIND_HEADER([boost/functional/hash.hpp])])
+
+
+# BOOST_LAMBDA()
+# --------------
+# Look for Boost.Lambda
+BOOST_DEFUN([Lambda],
+[BOOST_FIND_HEADER([boost/lambda/lambda.hpp])])
+
+
+# BOOST_LOG([PREFERRED-RT-OPT])
+# -----------------------------
+# Look for Boost.Log.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Log],
+[BOOST_FIND_LIB([log], [$1],
+    [boost/log/core/core.hpp],
+    [boost::log::attribute a; a.get_value();])
+])# BOOST_LOG
+
+
+# BOOST_LOG_SETUP([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for the Boost.Log setup library (log_setup).  For the documentation of
+# PREFERRED-RT-OPT, see the documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Log_Setup],
+[AC_REQUIRE([BOOST_LOG])dnl
+BOOST_FIND_LIB([log_setup], [$1],
+    [boost/log/utility/setup/from_settings.hpp],
+    [boost::log::basic_settings<char> bs; bs.empty();])
+])# BOOST_LOG_SETUP
+
+
+# BOOST_MATH()
+# ------------
+# Look for Boost.Math
+# TODO: This library isn't header-only but it comes in multiple different
+# flavors that don't play well with BOOST_FIND_LIB (e.g, libboost_math_c99,
+# libboost_math_c99f, libboost_math_c99l, libboost_math_tr1,
+# libboost_math_tr1f, libboost_math_tr1l).  This macro must be fixed to do the
+# right thing anyway.
+BOOST_DEFUN([Math],
+[BOOST_FIND_HEADER([boost/math/special_functions.hpp])])
+
+
+# BOOST_MPI([PREFERRED-RT-OPT])
+# -------------------------------
+# Look for Boost MPI.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.  Uses MPICXX variable if it is
+# set, otherwise tries CXX
+#
+BOOST_DEFUN([MPI],
+[boost_save_CXX=${CXX}
+boost_save_CXXCPP=${CXXCPP}
+if test x"${MPICXX}" != x; then
+  CXX=${MPICXX}
+  CXXCPP="${MPICXX} -E"
+fi
+BOOST_FIND_LIB([mpi], [$1],
+               [boost/mpi.hpp],
+               [int argc = 0;
+                char **argv = 0;
+                boost::mpi::environment env(argc,argv);])
+CXX=${boost_save_CXX}
+CXXCPP=${boost_save_CXXCPP}
+])# BOOST_MPI
+
+
+# BOOST_MULTIARRAY()
+# ------------------
+# Look for Boost.MultiArray
+BOOST_DEFUN([MultiArray],
+[BOOST_FIND_HEADER([boost/multi_array.hpp])])
+
+
+# BOOST_NUMERIC_UBLAS()
+# --------------------------
+# Look for Boost.NumericUblas (Basic Linear Algebra)
+BOOST_DEFUN([Numeric_Ublas],
+[BOOST_FIND_HEADER([boost/numeric/ublas/vector.hpp])
+])# BOOST_NUMERIC_UBLAS
+
+
+# BOOST_NUMERIC_CONVERSION()
+# --------------------------
+# Look for Boost.NumericConversion (policy-based numeric conversion)
+BOOST_DEFUN([Numeric_Conversion],
+[BOOST_FIND_HEADER([boost/numeric/conversion/converter.hpp])
+])# BOOST_NUMERIC_CONVERSION
+
+
+# BOOST_OPTIONAL()
+# ----------------
+# Look for Boost.Optional
+BOOST_DEFUN([Optional],
+[BOOST_FIND_HEADER([boost/optional.hpp])])
+
+
+# BOOST_PREPROCESSOR()
+# --------------------
+# Look for Boost.Preprocessor
+BOOST_DEFUN([Preprocessor],
+[BOOST_FIND_HEADER([boost/preprocessor/repeat.hpp])])
+
+
+# BOOST_UNORDERED()
+# -----------------
+# Look for Boost.Unordered
+BOOST_DEFUN([Unordered],
+[BOOST_FIND_HEADER([boost/unordered_map.hpp])])
+
+
+# BOOST_UUID()
+# ------------
+# Look for Boost.Uuid
+BOOST_DEFUN([Uuid],
+[BOOST_FIND_HEADER([boost/uuid/uuid.hpp])])
+
+
+# BOOST_PROGRAM_OPTIONS([PREFERRED-RT-OPT])
+# -----------------------------------------
+# Look for Boost.Program_options.  For the documentation of PREFERRED-RT-OPT,
+# see the documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Program_Options],
+[BOOST_FIND_LIB([program_options], [$1],
+                [boost/program_options.hpp],
+                [boost::program_options::options_description d("test");])
+])# BOOST_PROGRAM_OPTIONS
+
+
+
+# _BOOST_PYTHON_CONFIG(VARIABLE, FLAG)
+# ------------------------------------
+# Save VARIABLE, and define it via `python-config --FLAG`.
+# Substitute BOOST_PYTHON_VARIABLE.
+m4_define([_BOOST_PYTHON_CONFIG],
+[AC_SUBST([BOOST_PYTHON_$1],
+          [`python-config --$2 2>/dev/null`])dnl
+boost_python_save_$1=$$1
+$1="$$1 $BOOST_PYTHON_$1"])
+
+
+# BOOST_PYTHON([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.Python.  For the documentation of PREFERRED-RT-OPT,
+# see the documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Python],
+[_BOOST_PYTHON_CONFIG([CPPFLAGS], [includes])
+_BOOST_PYTHON_CONFIG([LDFLAGS],   [ldflags])
+_BOOST_PYTHON_CONFIG([LIBS],      [libs])
+m4_pattern_allow([^BOOST_PYTHON_MODULE$])dnl
+BOOST_FIND_LIBS([python], [python python3], [$1],
+                [boost/python.hpp],
+                [], [BOOST_PYTHON_MODULE(empty) {}])
+CPPFLAGS=$boost_python_save_CPPFLAGS
+LDFLAGS=$boost_python_save_LDFLAGS
+LIBS=$boost_python_save_LIBS
+])# BOOST_PYTHON
+
+
+# BOOST_REF()
+# -----------
+# Look for Boost.Ref
+BOOST_DEFUN([Ref],
+[BOOST_FIND_HEADER([boost/ref.hpp])])
+
+
+# BOOST_REGEX([PREFERRED-RT-OPT])
+# -------------------------------
+# Look for Boost.Regex.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Regex],
+[BOOST_FIND_LIB([regex], [$1],
+                [boost/regex.hpp],
+                [boost::regex exp("*"); boost::regex_match("foo", exp);])
+])# BOOST_REGEX
+
+
+# BOOST_SERIALIZATION([PREFERRED-RT-OPT])
+# ---------------------------------------
+# Look for Boost.Serialization.  For the documentation of PREFERRED-RT-OPT, see
+# the documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Serialization],
+[BOOST_FIND_LIB([serialization], [$1],
+                [boost/archive/text_oarchive.hpp],
+                [std::ostream* o = 0; // Cheap way to get an ostream...
+                boost::archive::text_oarchive t(*o);])
+])# BOOST_SERIALIZATION
+
+
+# BOOST_SIGNALS([PREFERRED-RT-OPT])
+# ---------------------------------
+# Look for Boost.Signals.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Signals],
+[BOOST_FIND_LIB([signals], [$1],
+                [boost/signal.hpp],
+                [boost::signal<void ()> s;])
+])# BOOST_SIGNALS
+
+
+# BOOST_SIGNALS2()
+# ----------------
+# Look for Boost.Signals2 (new since 1.39.0).
+BOOST_DEFUN([Signals2],
+[BOOST_FIND_HEADER([boost/signals2.hpp])
+])# BOOST_SIGNALS2
+
+
+# BOOST_SMART_PTR()
+# -----------------
+# Look for Boost.SmartPtr
+BOOST_DEFUN([Smart_Ptr],
+[BOOST_FIND_HEADER([boost/scoped_ptr.hpp])
+BOOST_FIND_HEADER([boost/shared_ptr.hpp])
+])
+
+
+# BOOST_STATICASSERT()
+# --------------------
+# Look for Boost.StaticAssert
+BOOST_DEFUN([StaticAssert],
+[BOOST_FIND_HEADER([boost/static_assert.hpp])])
+
+
+# BOOST_STRING_ALGO()
+# -------------------
+# Look for Boost.StringAlgo
+BOOST_DEFUN([String_Algo],
+[BOOST_FIND_HEADER([boost/algorithm/string.hpp])
+])
+
+
+# BOOST_SYSTEM([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.System.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.  This library was introduced in Boost
+# 1.35.0.
+BOOST_DEFUN([System],
+[BOOST_FIND_LIB([system], [$1],
+                [boost/system/error_code.hpp],
+                [boost::system::error_code e; e.clear();])
+])# BOOST_SYSTEM
+
+
+# BOOST_TEST([PREFERRED-RT-OPT])
+# ------------------------------
+# Look for Boost.Test.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Test],
+[m4_pattern_allow([^BOOST_CHECK$])dnl
+BOOST_FIND_LIB([unit_test_framework], [$1],
+               [boost/test/unit_test.hpp], [BOOST_CHECK(2 == 2);],
+               [using boost::unit_test::test_suite;
+               test_suite* init_unit_test_suite(int argc, char ** argv)
+               { return NULL; }])
+])# BOOST_TEST
+
+
+# BOOST_THREAD([PREFERRED-RT-OPT])
+# ---------------------------------
+# Look for Boost.Thread.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Thread],
+[dnl Having the pthread flag is required at least on GCC3 where
+dnl boost/thread.hpp would complain if we try to compile without
+dnl -pthread on GNU/Linux.
+AC_REQUIRE([_BOOST_PTHREAD_FLAG])dnl
+boost_thread_save_LIBS=$LIBS
+boost_thread_save_LDFLAGS=$LDFLAGS
+boost_thread_save_CPPFLAGS=$CPPFLAGS
+# Link-time dependency from thread to system was added as of 1.49.0.
+if test $boost_major_version -ge 149; then
+BOOST_SYSTEM([$1])
+fi # end of the Boost.System check.
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+CPPFLAGS="$CPPFLAGS $boost_cv_pthread_flag"
+
+# When compiling for the Windows platform, the threads library is named
+# differently.
+case $host_os in
+  (*mingw*) boost_thread_lib_ext=_win32;;
+esac
+BOOST_FIND_LIBS([thread], [thread$boost_thread_lib_ext],
+                [$1],
+                [boost/thread.hpp], [boost::thread t; boost::mutex m;])
+
+BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag"
+BOOST_THREAD_LDFLAGS="$BOOST_THREAD_LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+BOOST_CPPFLAGS="$BOOST_CPPFLAGS $boost_cv_pthread_flag"
+LIBS=$boost_thread_save_LIBS
+LDFLAGS=$boost_thread_save_LDFLAGS
+CPPFLAGS=$boost_thread_save_CPPFLAGS
+])# BOOST_THREAD
+
+AU_ALIAS([BOOST_THREADS], [BOOST_THREAD])
+
+
+# BOOST_TOKENIZER()
+# -----------------
+# Look for Boost.Tokenizer
+BOOST_DEFUN([Tokenizer],
+[BOOST_FIND_HEADER([boost/tokenizer.hpp])])
+
+
+# BOOST_TRIBOOL()
+# ---------------
+# Look for Boost.Tribool
+BOOST_DEFUN([Tribool],
+[BOOST_FIND_HEADER([boost/logic/tribool_fwd.hpp])
+BOOST_FIND_HEADER([boost/logic/tribool.hpp])
+])
+
+
+# BOOST_TUPLE()
+# -------------
+# Look for Boost.Tuple
+BOOST_DEFUN([Tuple],
+[BOOST_FIND_HEADER([boost/tuple/tuple.hpp])])
+
+
+# BOOST_TYPETRAITS()
+# --------------------
+# Look for Boost.TypeTraits
+BOOST_DEFUN([TypeTraits],
+[BOOST_FIND_HEADER([boost/type_traits.hpp])])
+
+
+# BOOST_UTILITY()
+# ---------------
+# Look for Boost.Utility (noncopyable, result_of, base-from-member idiom,
+# etc.)
+BOOST_DEFUN([Utility],
+[BOOST_FIND_HEADER([boost/utility.hpp])])
+
+
+# BOOST_VARIANT()
+# ---------------
+# Look for Boost.Variant.
+BOOST_DEFUN([Variant],
+[BOOST_FIND_HEADER([boost/variant/variant_fwd.hpp])
+BOOST_FIND_HEADER([boost/variant.hpp])])
+
+
+# BOOST_POINTER_CONTAINER()
+# ------------------------
+# Look for Boost.PointerContainer
+BOOST_DEFUN([Pointer_Container],
+[BOOST_FIND_HEADER([boost/ptr_container/ptr_deque.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_list.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_vector.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_array.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_set.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_map.hpp])
+])# BOOST_POINTER_CONTAINER
+
+
+# BOOST_WAVE([PREFERRED-RT-OPT])
+# ------------------------------
+# NOTE: If you intend to use Wave/Spirit with thread support, make sure you
+# call BOOST_THREAD first.
+# Look for Boost.Wave.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Wave],
+[AC_REQUIRE([BOOST_FILESYSTEM])dnl
+AC_REQUIRE([BOOST_DATE_TIME])dnl
+boost_wave_save_LIBS=$LIBS
+boost_wave_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_((FILE)?SYSTEM|DATE_TIME|THREAD)_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS \
+$BOOST_THREAD_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS \
+$BOOST_DATE_TIME_LDFLAGS $BOOST_THREAD_LDFLAGS"
+BOOST_FIND_LIB([wave], [$1],
+                [boost/wave.hpp],
+                [boost::wave::token_id id; get_token_name(id);])
+LIBS=$boost_wave_save_LIBS
+LDFLAGS=$boost_wave_save_LDFLAGS
+])# BOOST_WAVE
+
+
+# BOOST_XPRESSIVE()
+# -----------------
+# Look for Boost.Xpressive (new since 1.36.0).
+BOOST_DEFUN([Xpressive],
+[BOOST_FIND_HEADER([boost/xpressive/xpressive.hpp])])
+
+
+# ----------------- #
+# Internal helpers. #
+# ----------------- #
+
+
+# _BOOST_PTHREAD_FLAG()
+# ---------------------
+# Internal helper for BOOST_THREAD.  Computes boost_cv_pthread_flag
+# which must be used in CPPFLAGS and LIBS.
+#
+# Yes, we *need* to put the -pthread thing in CPPFLAGS because with GCC3,
+# boost/thread.hpp will trigger a #error if -pthread isn't used:
+#   boost/config/requires_threads.hpp:47:5: #error "Compiler threading support
+#   is not turned on. Please set the correct command line options for
+#   threading: -pthread (Linux), -pthreads (Solaris) or -mthreads (Mingw32)"
+#
+# Based on ACX_PTHREAD: http://autoconf-archive.cryp.to/acx_pthread.html
+AC_DEFUN([_BOOST_PTHREAD_FLAG],
+[AC_REQUIRE([AC_PROG_CXX])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_LANG_PUSH([C++])dnl
+AC_CACHE_CHECK([for the flags needed to use pthreads], [boost_cv_pthread_flag],
+[ boost_cv_pthread_flag=
+  # The ordering *is* (sometimes) important.  Some notes on the
+  # individual items follow:
+  # (none): in case threads are in libc; should be tried before -Kthread and
+  #       other compiler flags to prevent continual compiler warnings
+  # -lpthreads: AIX (must check this before -lpthread)
+  # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+  # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+  # -llthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+  # -pthread: GNU Linux/GCC (kernel threads), BSD/GCC (userland threads)
+  # -pthreads: Solaris/GCC
+  # -mthreads: MinGW32/GCC, Lynx/GCC
+  # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+  #      doesn't hurt to check since this sometimes defines pthreads too;
+  #      also defines -D_REENTRANT)
+  #      ... -mt is also the pthreads flag for HP/aCC
+  # -lpthread: GNU Linux, etc.
+  # --thread-safe: KAI C++
+  case $host_os in #(
+    *solaris*)
+      # On Solaris (at least, for some versions), libc contains stubbed
+      # (non-functional) versions of the pthreads routines, so link-based
+      # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
+      # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
+      # a function called by this macro, so we could check for that, but
+      # who knows whether they'll stub that too in a future libc.)  So,
+      # we'll just look for -pthreads and -lpthread first:
+      boost_pthread_flags="-pthreads -lpthread -mt -pthread";; #(
+    *)
+      boost_pthread_flags="-lpthreads -Kthread -kthread -llthread -pthread \
+                           -pthreads -mthreads -lpthread --thread-safe -mt";;
+  esac
+  # Generate the test file once; every flag candidate below links this same source.
+  AC_LANG_CONFTEST([AC_LANG_PROGRAM([#include <pthread.h>],
+    [pthread_t th; pthread_join(th, 0);
+    pthread_attr_init(0); pthread_cleanup_push(0, 0);
+    pthread_create(0,0,0,0); pthread_cleanup_pop(0);])])
+  for boost_pthread_flag in '' $boost_pthread_flags; do
+    boost_pthread_ok=false
+dnl The empty first argument below re-uses the test file already generated.
+    boost_pthreads__save_LIBS=$LIBS
+    LIBS="$LIBS $boost_pthread_flag"
+    AC_LINK_IFELSE([],
+      [if grep ".*$boost_pthread_flag" conftest.err; then
+         echo "This flag seems to have triggered warnings" >&AS_MESSAGE_LOG_FD
+       else
+         boost_pthread_ok=:; boost_cv_pthread_flag=$boost_pthread_flag
+       fi])
+    LIBS=$boost_pthreads__save_LIBS
+    $boost_pthread_ok && break
+  done
+])
+AC_LANG_POP([C++])dnl
+])# _BOOST_PTHREAD_FLAG
+
+
+# _BOOST_gcc_test(MAJOR, MINOR)
+# -----------------------------
+# Internal helper for _BOOST_FIND_COMPILER_TAG; expands to a "CONDITION @ gccMAJORMINOR" word.
+m4_define([_BOOST_gcc_test],
+["defined __GNUC__ && __GNUC__ == $1 && __GNUC_MINOR__ == $2 && !defined __ICC @ gcc$1$2"])dnl
+
+# _BOOST_mingw_test(MAJOR, MINOR)
+# -------------------------------
+# Internal helper for _BOOST_FIND_COMPILER_TAG; expands to a "CONDITION @ mgwMAJORMINOR" word.
+m4_define([_BOOST_mingw_test],
+["defined __GNUC__ && __GNUC__ == $1 && __GNUC_MINOR__ == $2 && !defined __ICC && \
+  (defined WIN32 || defined WINNT || defined _WIN32 || defined __WIN32 \
+         || defined __WIN32__ || defined __WINNT || defined __WINNT__) @ mgw$1$2"])dnl
+
+
+# _BOOST_FIND_COMPILER_TAG()
+# --------------------------
+# Internal.  When Boost is installed without --layout=system, each library
+# filename will hold a suffix that encodes the compiler used during the
+# build.  The Boost build system seems to call this a `tag'.
+AC_DEFUN([_BOOST_FIND_COMPILER_TAG],
+[AC_REQUIRE([AC_PROG_CXX])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag],
+[boost_cv_lib_tag=unknown
+if test x$boost_cv_inc_path != xno; then
+  AC_LANG_PUSH([C++])dnl
+  # The following tests are mostly inspired by boost/config/auto_link.hpp
+  # The list is sorted from most recent/common to oldest compiler (in order
+  # to increase the likelihood of finding the right compiler with the
+  # least number of compilation attempts).
+  # Beware that some tests are sensitive to the order (for instance, we must
+  # look for MinGW before looking for GCC3).
+  # I used one compilation test per compiler with a #error to recognize
+  # each compiler so that it works even when cross-compiling (let me know
+  # if you know a better approach).
+  # Known missing tags (known from Boost's tools/build/v2/tools/common.jam):
+  #   como, edg, kcc, bck, mp, sw, tru, xlc
+  # I'm not sure about my test for `il' (be careful: Intel's ICC pre-defines
+  # the same defines as GCC's).
+  for i in \
+    _BOOST_mingw_test(4,8) \
+    _BOOST_gcc_test(4, 8) \
+    _BOOST_mingw_test(4,7) \
+    _BOOST_gcc_test(4, 7) \
+    _BOOST_mingw_test(4,6) \
+    _BOOST_gcc_test(4, 6) \
+    _BOOST_mingw_test(4,5) \
+    _BOOST_gcc_test(4, 5) \
+    _BOOST_mingw_test(4,4) \
+    _BOOST_gcc_test(4, 4) \
+    _BOOST_mingw_test(4,3) \
+    _BOOST_gcc_test(4, 3) \
+    _BOOST_mingw_test(4,2) \
+    _BOOST_gcc_test(4, 2) \
+    _BOOST_mingw_test(4,1) \
+    _BOOST_gcc_test(4, 1) \
+    _BOOST_mingw_test(4,0) \
+    _BOOST_gcc_test(4, 0) \
+    "defined __GNUC__ && __GNUC__ == 3 && !defined __ICC \
+     && (defined WIN32 || defined WINNT || defined _WIN32 || defined __WIN32 \
+         || defined __WIN32__ || defined __WINNT || defined __WINNT__) @ mgw" \
+    _BOOST_gcc_test(3, 4) \
+    _BOOST_gcc_test(3, 3) \
+    "defined _MSC_VER && _MSC_VER >= 1500 @ vc90" \
+    "defined _MSC_VER && _MSC_VER == 1400 @ vc80" \
+    _BOOST_gcc_test(3, 2) \
+    "defined _MSC_VER && _MSC_VER == 1310 @ vc71" \
+    _BOOST_gcc_test(3, 1) \
+    _BOOST_gcc_test(3, 0) \
+    "defined __BORLANDC__ @ bcb" \
+    "defined __ICC && (defined __unix || defined __unix__) @ il" \
+    "defined __ICL @ iw" \
+    "defined _MSC_VER && _MSC_VER == 1300 @ vc7" \
+    _BOOST_gcc_test(2, 95) \
+    "defined __MWERKS__ && __MWERKS__ <= 0x32FF @ cw9" \
+    "defined _MSC_VER && _MSC_VER < 1300 && !defined UNDER_CE @ vc6" \
+    "defined _MSC_VER && _MSC_VER < 1300 && defined UNDER_CE @ evc4" \
+    "defined __MWERKS__ && __MWERKS__ <= 0x31FF @ cw8"
+  do
+    boost_tag_test=`expr "X$i" : 'X\([[^@]]*\) @ '`
+    boost_tag=`expr "X$i" : 'X[[^@]]* @ \(.*\)'`
+    AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#if $boost_tag_test
+/* OK */
+#else
+# error $boost_tag_test
+#endif
+]])], [boost_cv_lib_tag=$boost_tag; break], [])
+  done
+AC_LANG_POP([C++])dnl
+  case $boost_cv_lib_tag in #(
+    # Some newer (>= 1.35?) versions of Boost seem to only use "gcc" as opposed
+    # to "gcc41" for instance.
+    *-gcc | *'-gcc ') :;; #(  Don't re-add -gcc: it's already in there.
+    gcc*)
+      boost_tag_x=
+      case $host_os in #(
+        darwin*)
+          if test $boost_major_version -ge 136; then
+            # The `x' added in r46793 of Boost.
+            boost_tag_x=x
+          fi;;
+      esac
+      # We can specify multiple tags in this variable because it's used by
+      # BOOST_FIND_LIB that does a `for tag in -$boost_cv_lib_tag' ...
+      boost_cv_lib_tag="$boost_tag_x$boost_cv_lib_tag -${boost_tag_x}gcc"
+      ;; #(
+    unknown)
+      AC_MSG_WARN([[could not figure out which toolset name to use for $CXX]])
+      boost_cv_lib_tag=
+      ;;
+  esac
+fi])dnl end of AC_CACHE_CHECK
+])# _BOOST_FIND_COMPILER_TAG
+
+
+# _BOOST_GUESS_WHETHER_TO_USE_MT()
+# --------------------------------
+# Compile a small test to try to guess whether we should favor MT (Multi
+# Thread) flavors of Boost.  Sets boost_guess_use_mt accordingly.
+AC_DEFUN([_BOOST_GUESS_WHETHER_TO_USE_MT],
+[# Check whether we had better use `mt' even though we weren't asked to.
+AC_LANG_PUSH([C++])dnl
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#if defined _REENTRANT || defined _MT || defined __MT__
+/* use -mt */
+#else
+# error MT not needed
+#endif
+]])], [boost_guess_use_mt=:], [boost_guess_use_mt=false])
+AC_LANG_POP([C++])dnl
+])
+
+# _BOOST_AC_LINK_IFELSE(PROGRAM, [ACTION-IF-TRUE], [ACTION-IF-FALSE])
+# -------------------------------------------------------------------
+# Fork of _AC_LINK_IFELSE that preserves conftest.o across calls.  Fragile,
+# will break when Autoconf changes its internals.  Requires that you manually
+# rm -f conftest.$ac_objext in between two really different tests, otherwise
+# you will try to link a conftest.o left behind by a previous test.
+# Used to aggressively optimize BOOST_FIND_LIB (see the big comment in this
+# macro).
+# NOTE(review): ac_objext is restored from boost_save_ac_objext, which is not set here -- confirm the caller sets it.
+# Don't use "break" in the actions, as it would short-circuit some code
+# this macro runs after the actions.
+m4_define([_BOOST_AC_LINK_IFELSE],
+[m4_ifvaln([$1], [AC_LANG_CONFTEST([$1])])dnl
+rm -f conftest$ac_exeext
+boost_save_ac_ext=$ac_ext
+boost_use_source=:
+# If we already have a .o, re-use it.  We change $ac_ext so that $ac_link
+# tries to link the existing object file instead of compiling from source.
+test -f conftest.$ac_objext && ac_ext=$ac_objext && boost_use_source=false &&
+  _AS_ECHO_LOG([re-using the existing conftest.$ac_objext])
+AS_IF([_AC_DO_STDERR($ac_link) && {
+         test -z "$ac_[]_AC_LANG_ABBREV[]_werror_flag" ||
+         test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+         test "$cross_compiling" = yes ||
+         $as_executable_p conftest$ac_exeext
+dnl FIXME: use AS_TEST_X instead when 2.61 is widespread enough.
+       }],
+      [$2],
+      [if $boost_use_source; then
+         _AC_MSG_LOG_CONFTEST
+       fi
+       $3])
+ac_objext=$boost_save_ac_objext
+ac_ext=$boost_save_ac_ext
+dnl Delete also the IPA/IPO (Inter Procedural Analysis/Optimization)
+dnl information created by the PGI compiler (conftest_ipa8_conftest.oo),
+dnl as it would interfere with the next link command.
+rm -f core conftest.err conftest_ipa8_conftest.oo \
+      conftest$ac_exeext m4_ifval([$1], [conftest.$ac_ext])[]dnl
+])# _BOOST_AC_LINK_IFELSE
+
+# Local Variables:
+# mode: autoconf
+# End:
diff --git a/man/M6502_delete.3 b/man/M6502_delete.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_delete.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_disassemble.3 b/man/M6502_disassemble.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_disassemble.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_dump.3 b/man/M6502_dump.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_dump.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_getCallback.3 b/man/M6502_getCallback.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_getCallback.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_getVector.3 b/man/M6502_getVector.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_getVector.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_irq.3 b/man/M6502_irq.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_irq.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_new.3 b/man/M6502_new.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_new.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_nmi.3 b/man/M6502_nmi.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_nmi.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_reset.3 b/man/M6502_reset.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_reset.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_run.3 b/man/M6502_run.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_run.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_setCallback.3 b/man/M6502_setCallback.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_setCallback.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_setMode.3 b/man/M6502_setMode.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_setMode.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_setVector.3 b/man/M6502_setVector.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_setVector.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/lib6502.3 b/man/lib6502.3
new file mode 100644
index 0000000..4551a3d
--- /dev/null
+++ b/man/lib6502.3
@@ -0,0 +1,555 @@
+.\" Copyright (c) 2005 Ian Piumarta
+.\" Copyright (c) 2014 Steven Flintham
+.\" 
+.\" Permission is hereby granted, free of charge, to any person
+.\" obtaining a copy of this software and associated documentation
+.\" files (the 'Software'), to deal in the Software without
+.\" restriction, including without limitation the rights to use, copy,
+.\" modify, merge, publish, distribute, and/or sell copies of the
+.\" Software, and to permit persons to whom the Software is furnished
+.\" to do so, provided that the above copyright notice(s) and this
+.\" permission notice appear in all copies of the Software and that
+.\" both the above copyright notice(s) and this permission notice
+.\" appear in supporting documentation.
+.\" 
+.\" THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+.\"
+.Dd June 7, 2014
+.Dt LIB6502 3 LOCAL
+.Os ""
+.\" ----------------------------------------------------------------
+.Sh NAME
+.\"
+.Nm lib6502
+.Nd 6502 microprocessor emulator
+.\" ----------------------------------------------------------------
+.Sh SYNOPSIS
+.\"
+.In stdint.h
+.In lib6502.h
+.Ft M6502 *
+.Fn M6502_new "M6502_Registers *registers" "M6502_Memory memory" "M6502_Callbacks *callbacks"
+.Ft void
+.Fn M6502_reset "M6502 *mpu"
+.Ft void
+.Fn M6502_nmi "M6502 *mpu"
+.Ft void
+.Fn M6502_irq "M6502 *mpu"
+.Ft uint16_t
+.Fn M6502_getVector "M6502 *mpu" "vector"
+.Ft uint16_t
+.Fn M6502_setVector "M6502 *mpu" "vector" "uint16_t address"
+.Ft M6502_Callback
+.Fn M6502_getCallback "M6502 *mpu" "type" "uint16_t address"
+.Ft M6502_Callback
+.Fn M6502_setCallback "M6502 *mpu" "type" "uint16_t address" "M6502_Callback callback"
+.Ft void
+.Fn M6502_run "M6502 *mpu"
+.Ft int
+.Fn M6502_disassemble "M6502 *mpu" "uint16_t address" "char buffer[64]"
+.Ft void
+.Fn M6502_dump "M6502 *mpu" "char buffer[64]"
+.Ft void
+.Fn M6502_delete "M6502 *mpu"
+.Ft void
+.Fn M6502_setMode "M6502 *mpu" "M6502_Mode mode" "int arg"
+.\" ----------------------------------------------------------------
+.Sh DESCRIPTION
+.\"
+.Fn M6502_new
+creates an instance of a 6502 microprocessor.
+.Fn M6502_reset ,
+.Fn M6502_nmi
+and 
+.Fn M6502_irq
+place it into the states associated with the hardware signals for
+reset, non-maskable interrupt and interrupt request, respectively.
+The macros
+.Fn M6502_getVector
+and
+.Fn M6502_setVector
+read and write the vectors through which the processor jumps in
+response to the above signals.  The macros
+.Fn M6502_getCallback
+and
+.Fn M6502_setCallback
+read and write client-supplied functions that intercept accesses to
+memory.
+.Fn M6502_run
+begins emulated execution.
+.Fn M6502_dump
+and
+.Fn M6502_disassemble
+create human-readable representations of processor or memory state.
+.Fn M6502_delete
+frees all resources associated with a processor instance.  
+.Fn M6502_setMode
+specifies the emulation mode to use for a processor instance.  Each of
+these functions and macros is described in more detail below.
+.Pp
+.Fn M6502_new
+returns a pointer to a
+.Fa M6502
+structure containing at least the following members:
+.Bd -literal
+struct _M6502
+{
+    M6502_Registers  *registers;   /* processor state */
+    uint8_t          *memory;      /* memory image */
+    M6502_Callbacks  *callbacks;   /* r/w/x/illegal callbacks */
+};
+.Ed
+.Pp
+These members are initialised according to the supplied
+.Fa registers ,
+.Fa memory
+and
+.Fa callbacks
+arguments.  If a given argument is NULL, the corresponding member is
+initialised automatically with a suitable (non-NULL) value.
+.Pp
+The members of
+.Fa M6502
+are as follows:
+.Bl -tag -width ".Fa callbacks"
+.It Fa registers
+the processor state, containing all registers and condition codes.
+.It Fa memory
+a block of at least 64 kilobytes of storage containing the processor's
+memory.  (An array type
+.Vt M6502_Memory ,
+suitable for defining values to pass as the
+.Fa memory
+argument, is defined in the
+.In lib6502.h
+include file.)
+.It Fa callbacks
+a structure mapping processor memory accesses to client callback
+functions.
+.El
+.Pp
+Access to the contents of the
+.Fa registers
+and
+.Fa memory
+members can be made directly.
+The
+.Fa registers
+member is a
+.Vt M6502_Registers
+containing the following members:
+.Bd -literal
+struct _M6502_Registers
+{
+    uint8_t   a;  /* accumulator */
+    uint8_t   x;  /* X index register */
+    uint8_t   y;  /* Y index register */
+    uint8_t   p;  /* processor status register */
+    uint8_t   s;  /* stack pointer */
+    uint16_t pc;  /* program counter */
+};
+.Ed
+.Pp
+The
+.Fa memory
+member is an array of
+.Vt unsigned char
+and can be indexed directly.  In addition, two convenience macros
+.Fn M6502_getVector
+and
+.Fn M6502_setVector
+provide access to the reset and interrupt vectors within
+.Fa memory .
+.Fn M6502_getVector
+returns the address stored in the named
+.Fa vector
+which must be precisely one of the following:
+.Bl  -tag -width ".Dv RST" -offset indent
+.It Dv RST
+the reset vector.
+.It Dv NMI
+the non-maskable interrupt vector.
+.It Dv IRQ
+the interrupt request vector.
+.El
+.Pp
+.Fn M6502_setVector
+stores its
+.Fa address
+argument in the named
+.Fa vector
+and returns the new value.
+.Pp
+The
+.Fa callbacks
+member contains an opaque structure mapping processor memory accesses
+to client callback functions.  Whenever the processor performs an
+access for which a corresponding entry exists in the
+.Fa callbacks
+structure, the emulator suspends execution and invokes the callback to
+complete the operation.  Each callback function should have a
+signature equivalent to:
+.Bd -ragged -offset indent
+int
+.Va callback
+(M6502 *mpu, uint16_t address, uint8_t data);
+.Ed
+.Pp
+The macros
+.Fn M6502_getCallback
+and
+.Fn M6502_setCallback
+read and write entries in the
+.Fa callbacks
+structure.  These macros identify a unique memory access operation
+from the specified
+.Fa address
+on which it operates and
+.Fa type
+of access involved.  The
+.Fa type
+argument must be one of the following:
+.Bl -tag -width ".Dv write"
+.It Dv read
+the
+.Fa callback
+is invoked when the processor attempts to read from the
+given address.  The emulator passes the effective address of the
+operation to the callback in its
+.Fa address
+argument.  (The
+.Fa data
+argument is undefined.)  The value returned by the callback will be
+used by the emulator as the result of the read operation.
+.It Dv write
+the
+.Fa callback
+is invoked when the processor attempts to write to the
+given address.  The emulator passes the effective address of the
+operation to the callback in its
+.Fa address
+argument and the byte being written in the
+.Fa data
+argument.  The emulator will not perform the write operation before
+invoking the callback; if the write should complete, the callback must
+modify the processor's
+.Fa memory
+explicitly.  The value returned from the callback is ignored.
+.It Dv call
+the
+.Fa callback
+is invoked when the processor attempts to transfer control to the
+given address by any instruction other than a relative branch.  The
+emulator passes the destination address to the callback in its
+.Fa address
+argument and the instruction that initiated the control transfer in
+its
+.Fa data
+argument (one of JMP, JSR, BRK, RTS or RTI).  If the callback returns
+zero (the callback refuses to handle the operation) the emulator will
+allow the operation to complete as normal.  If the callback returns a
+non-zero address (indicating that the callback has handled the
+operation internally) the emulator will transfer control to that
+address.
+.It Dv illegal_instruction
+the
+.Fa callback
+is invoked when the processor attempts to execute the illegal instruction
+whose opcode is the given "address".  The emulator passes the address of the
+instruction to the callback in its
+.Fa address
+argument and the instruction itself in the
+.Fa data
+argument.  If the callback returns a non-zero address the 
+emulator will transfer control to that address, otherwise execution will 
+continue at the next instruction.
+.El
+.Pp
+.Fn M6502_getCallback
+returns zero if there is no callback associated with the given
+.Fa type
+and
+.Fa address .
+Passing zero as the
+.Fa callback
+argument of
+.Fn M6502_setCallback
+removes any callback that might have been associated with
+.Fa type
+and
+.Fa address .
+.Pp
+.Fn M6502_run
+emulates processor execution in the given
+.Fa mpu
+by repeatedly fetching the instruction addressed by
+.Fa pc
+and dispatching to it.  This function normally never returns.
+.Pp
+.Fn M6502_dump
+writes a (NUL-terminated) symbolic representation of the processor's
+internal state into the supplied
+.Fa buffer .
+Typical output resembles:
+.Bd -literal -offset indent
+PC=1010 SP=01FE A=0A X=5B Y=00 P=D1 NV-B---C
+.Ed
+.Pp
+.Fn M6502_disassemble
+writes a (NUL-terminated) symbolic representation of the instruction
+in the processor's memory at the given
+.Fa address
+into the supplied
+.Fa buffer .
+It returns the size (in bytes) of the instruction.  (In other words,
+the amount by which
+.Fa address
+should be incremented to arrive at the next instruction.)
+Typical output resembles:
+.Bd -literal -offset indent
+1009 cpx #5B
+.Ed
+.Pp
+(The
+.Fa buffer
+arguments are oversized to allow for future expansion.)
+.Pp
+.Fn M6502_delete
+frees the resources associated with the given
+.Fa mpu .
+Any members that were allocated implicitly (passed as NULL to
+.Fn M6502_new )
+are deallocated.  Members that were initialised from non-NULL
+arguments are not deallocated.
+.Pp
+.Fn M6502_setMode
+is a lib6502-jit extension which sets the emulation mode to use for the
+instance to
+.Fa mode ,
+which must be precisely one of the following:
+.Bl  -tag -width ".Dv RST" -offset indent
+.It Dv M6502_ModeInterpreted
+6502 code will be interpreted, much as in lib6502 itself.
+.It Dv M6502_ModeCompiled
+6502 code will always be compiled to host code before executing.  This can result
+in jerky execution as emulation halts during compilation.  Self-modifying code
+will work correctly, but if this happens a lot the repeated re-compilations
+will result in very slow execution.
+.It Dv M6502_ModeHybrid
+6502 code will be compiled to host code but the interpreter will be used to
+continue execution during compilation.  Execution will be smooth and relatively
+fast but performance of repeatedly executed code will vary (in theory, improve)
+over time.  Repeated self-modification by code will cause re-compilations but
+performance will still be reasonable as the interpreter will continue execution;
+the main downside is that CPU will be taken up by the compilation.  (On a
+machine with two or more idle cores, this is wasteful but should not
+significantly harm performance, as one core will run the interpreter while the
+other handles the compilation.)  This is the default mode.
+.El
+.Pp
+.Fa arg
+is the maximum number of 6502 instructions to be compiled into a single unit
+of code when hybrid or compiled mode is selected; it is ignored in interpreted
+mode.  Specifying 0 will give a reasonable default value.
+.Pp
+.\" ----------------------------------------------------------------
+.Sh IMPLEMENTATION NOTES
+.\" 
+You can share the
+.Fa memory
+and
+.Fa callbacks
+members of
+.Vt M6502
+between multiple instances to simulate multiprocessor hardware.
+.\" ----------------------------------------------------------------
+.Sh RETURN VALUES
+.\" 
+.Fn M6502_new
+returns a pointer to a
+.Vt M6502
+structure.
+.Fn M6502_getVector
+and
+.Fn M6502_setVector
+return the contents of the given
+.Fa vector .
+.Fn M6502_getCallback
+and
+.Fn M6502_setCallback
+return the
+.Vt M6502_Callback
+function associated with the given
+.Fa address
+and access
+.Fa type .
+.Fn M6502_disassemble
+returns the size (in bytes) of the instruction at the given
+.Fa address .
+.Fn M6502_reset ,
+.Fn M6502_nmi ,
+.Fn M6502_irq ,
+.Fn M6502_run ,
+.Fn M6502_dump ,
+.Fn M6502_delete
+and
+.Fn M6502_setMode
+don't return anything (unless you forgot to include
+.In lib6502.h ) .
+.\" ----------------------------------------------------------------
+.Sh EXAMPLES
+.\" 
+The following program creates a 6502 processor, sets up callbacks for
+printing characters and halting after a BRK instruction, stores a
+program into memory that prints the alphabet, disassembles the program
+on stdout, and then executes the program.
+.Bd -literal -offset indent -compact
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "lib6502.h"
+
+#define WRCH    0xFFEE
+
+int wrch(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  int pc;
+  putchar(mpu->registers->a);
+  pc  = mpu->memory[++mpu->registers->s + 0x100];
+  pc |= mpu->memory[++mpu->registers->s + 0x100] << 8;
+  return pc + 1;  /* JSR pushes next insn addr - 1 */
+}
+
+int done(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  char buffer[64];
+  M6502_dump(mpu, buffer);
+  printf("\\nBRK instruction\\n%s\\n", buffer);
+  exit(0);
+}
+
+int main(int argc, char **argv)
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  unsigned  pc  = 0x1000;
+
+  mpu->callbacks->call[WRCH] = wrch;     /* write character */
+  mpu->callbacks->call[0000] = done;     /* reached after BRK */
+
+# define gen1(X)        (mpu->memory[pc++] = (uint8_t)(X))
+# define gen2(X,Y)      gen1(X); gen1(Y)
+# define gen3(X,Y,Z)    gen1(X); gen2(Y,Z)
+
+  gen2(0xA2, 'A'     );  /* LDX #'A'   */
+  gen1(0x8A          );  /* TXA        */
+  gen3(0x20,0xEE,0xFF);  /* JSR FFEE   */
+  gen1(0xE8          );  /* INX        */
+  gen2(0xE0, 'Z'+1   );  /* CPX #'Z'+1 */
+  gen2(0xD0, -9      );  /* BNE 1002   */
+  gen2(0xA9, '\\n'    );  /* LDA #'\\n'  */
+  gen3(0x20,0xEE,0xFF);  /* JSR FFEE   */
+  gen2(0x00,0x00     );  /* BRK        */
+
+  {
+    uint16_t ip = 0x1000;
+    while (ip < pc)
+      {
+        char insn[64];
+        ip += M6502_disassemble(mpu, ip, insn);
+        printf("%04X %s\\n", ip, insn);
+      }
+  }
+
+  M6502_setVector(mpu, RST, 0x1000);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);
+
+  return 0;
+}
+.Ed
+.\" ----------------------------------------------------------------
+.Sh DIAGNOSTICS
+.\" 
+If
+.Fn M6502_new
+cannot allocate sufficient memory it prints "out of memory" to stderr
+and exits with a non-zero status.
+.Pp
+If
+.Fn M6502_run
+encounters an illegal or undefined instruction, it prints "undefined
+instruction" and the processor's state to stderr, then exits with a
+non-zero status.
+.\" ----------------------------------------------------------------
+.Sh COMPATIBILITY
+.\" 
+M6502 is a generic name. The initial letter is mandated by C naming
+conventions and chosen in deference to MOS Technology, the original
+designers of the processor.  To the best of my knowledge the 'M'
+prefix was never stamped on a physical 6502.
+.Pp
+The emulator implements the CMOS version of the processor (NMOS bugs
+in effective address calculations involving page boundaries are
+corrected).  lib6502 does not tolerate the execution of undefined
+instructions (which were all no-ops in the first-generation CMOS
+hardware); lib6502-jit treats them as no-ops.  It would be nice to
+support the several alternative instruction sets (model-specific
+undocumented instructions in NMOS models, and various documented
+extensions in the later CMOS models) but there are currently no plans
+to do so.
+.Pp
+The emulated 6502 will run much faster than real hardware on any
+modern computer.  The fastest 6502 hardware available at the time of
+writing has a clock speed of 14 MHz.  On a 2 GHz PowerPC, the emulated
+6502 runs at almost 300 MHz (in interpreted mode).
+.\" ----------------------------------------------------------------
+.Sh SEE ALSO
+.\" 
+.Xr run6502 1
+.Pp
+For development tools, documentation and source code:
+.Pa http://6502.org
+.\" ----------------------------------------------------------------
+.Sh AUTHORS
+.\" 
+The original lib6502 software and manual pages were written by Ian Piumarta.
+Additional changes to create lib6502-jit were made by Steven Flintham.
+.Pp
+The software is provided as-is, with absolutely no warranty, in the
+hope that you will enjoy and benefit from it.  You may use (entirely
+at your own risk) and redistribute it under the terms of a very
+liberal license that does not seek to restrict your rights in any way
+(unlike certain so-called 'open source' licenses that significantly
+limit your freedom in the name of 'free' software that is, ultimately,
+anything but free).  See the file COPYING for details.
+.\" ----------------------------------------------------------------
+.Sh BUGS
+.\" 
+.Fn M6502_getVector
+and
+.Fn M6502_setVector
+evaluate their arguments more than once.
+.Pp
+The out-of-memory condition and attempted execution of
+illegal/undefined instructions should not be fatal errors.
+.Pp
+There is no way to limit the duration of execution within
+.Fn M6502_run
+to a certain number of instructions or cycles.
+.Pp
+The emulator should support some means of implicit interrupt
+generation, either by polling or in response to (Unix) signals.
+.Pp
+The
+.Sx COMPATIBILITY
+section in this manual page has been diverted from its legitimate
+purpose.
+.Pp
+The plural of 'callback' really ought to be 'callsback'.
+.Pp
+Please send bug reports (and feature requests) to:
+lib6502-jit@lemma.co.uk.
diff --git a/man/run6502.1 b/man/run6502.1
new file mode 100644
index 0000000..98f761f
--- /dev/null
+++ b/man/run6502.1
@@ -0,0 +1,396 @@
+.\" Copyright (c) 2005 Ian Piumarta
+.\" Copyright (c) 2014 Steven Flintham
+.\" 
+.\" Permission is hereby granted, free of charge, to any person
+.\" obtaining a copy of this software and associated documentation
+.\" files (the 'Software'), to deal in the Software without
+.\" restriction, including without limitation the rights to use, copy,
+.\" modify, merge, publish, distribute, and/or sell copies of the
+.\" Software, and to permit persons to whom the Software is furnished
+.\" to do so, provided that the above copyright notice(s) and this
+.\" permission notice appear in all copies of the Software and that
+.\" both the above copyright notice(s) and this permission notice
+.\" appear in supporting documentation.
+.\" 
+.\" THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+.\"
+.Dd October 31, 2005
+.Dt RUN6502 1 LOCAL
+.Os ""
+.\" ----------------------------------------------------------------
+.Sh NAME
+.\" 
+.Nm run6502
+.Nd execute a 6502 microprocessor program
+.\" ----------------------------------------------------------------
+.Sh SYNOPSIS
+.\" 
+.Nm run6502
+.Op Ar option ...
+.Nm run6502
+.Op Ar option ...
+.Fl B
+.Op Ar
+.\" ----------------------------------------------------------------
+.Sh DESCRIPTION
+The
+.Nm run6502
+command emulates the execution of a 6502 microprocessor.  It creates a
+memory image from the contents of one or more files on the command
+line and then simulates a power-on hardware reset to begin execution.
+.Pp
+In its first form,
+.Nm run6502
+emulates an embedded 6502 processor with 64 kilobytes of RAM, no
+memory-mapped hardware, and no input-output capabilities.  Limited
+interaction with the machine is possible only through the
+.Fl G , M
+and
+.Fl P
+options.
+.Pp
+In its second form (with the
+.Fl B
+option)
+.Nm run6502
+provides minimal emulation of Acorn 'BBC Model B' hardware with 32
+kilobytes of RAM, 16 kilobytes of paged language ROMs, and 16
+kilobytes of operating system ROM.  A few MOS calls are intercepted to
+provide keyboard input and screen output via stdin and stdout.
+Switching between the sixteen paged read-only memory banks is also
+supported by the usual memory-mapped control register.  Any
+.Ar file
+arguments after the
+.Fl B
+are loaded into successive paged ROM banks (starting at 15 and working
+down towards 0) before execution begins.
+.\" ----------------------------------------------------------------
+.Ss Options
+.\" 
+.Bl -tag -width indent
+.It Fl B
+enable minimal Acorn 'BBC Model B' hardware emulation:
+.Bl -bullet
+.It
+the contents of memory between addresses 0x8000 and 0xBFFF are copied
+into paged ROM number 0;
+.It
+memory between 0x8000 and 0xBFFF becomes bank-switchable between
+sixteen different ROM images;
+.It
+the memory-mapped pages ('FRED', 'JIM' and 'SHEILA') between 0xFC00
+and 0xFEFF are initialised to harmless values;
+.It
+the upper half of the address space is write-protected; and
+.It
+callbacks are installed on several OS entry points to provide
+input-output via stdin and stdout.
+.El
+.Pp
+Any remaining non-option arguments on the command line will name files
+to be loaded successively into paged ROMs, starting at 15 and working
+downwards towards 0.
+.It Fl d Ar addr Ar end
+dump memory from the address
+.Ar addr
+(given in hexadecimal) up to (but not including)
+.Ar end .
+The
+.Ar end
+argument is either an absolute address or a relative address specified
+as a '+' character followed by the number (in hexadecimal) of bytes to
+dump.  In other words, the following two options dump the same region
+of memory:
+.Bd -ragged -offset indent
+.Fl d
+8000  C000
+.Ed
+.Bd -ragged -offset indent -compact
+.Fl d
+8000 +4000
+.Ed
+.Pp
+The format of the dump cannot currently be modified and consists of
+the current address followed by one, two or three hexadecimal bytes,
+and a symbolic representation of the instruction at that address.
+.It Fl G Ar addr
+arrange that subroutine calls to
+.Ar addr
+will behave as if there were an implementation of
+.Xr getchar 3
+at that address, reading a character from stdin and returning it in
+the accumulator.
+.It Fl h
+print a summary of the available options and then exit.
+.It Fl I Ar addr
+set the IRQ (interrupt request) vector (the address to which the
+processor will transfer control upon execution of a BRK instruction).
+Setting this address to zero will cause execution to halt (and the
+emulator to exit) when a BRK instruction is encountered.
+.It Fl i Ar addr Ar file
+Load
+.Ar file
+into the memory image at the address
+.Ar addr
+(in hexadecimal), skipping over any initial '#!' interpreter line.
+.It Fl l Ar addr Ar file
+Load
+.Ar file
+into the memory image at the address
+.Ar addr
+(in hexadecimal).
+.It Fl M Ar addrio
+arrange that memory reads from address
+.Ar addrio
+will return the next character on stdin (blocking if necessary), and
+memory writes to
+.Ar addrio
+will send the value written to stdout.
+.It Fl mc
+use compiled emulation mode.  All code is compiled into host machine
+code.  This can make the emulation very jerky as execution halts
+while compiling.
+.It Fl mh
+use hybrid emulation mode.  Code is compiled into
+host machine code, but while this is happening an interpreter allows
+execution to continue.  This is the default mode.
+.It Fl mi
+use interpreted emulation mode. All code is interpreted.
+.It Fl mx Ar count
+in compiled and hybrid emulation modes, set the maximum number of
+6502 instructions which are translated as a unit to
+.Ar count .
+This has no effect in interpreted mode. A reasonable default is
+chosen if this is not specified.
+.It Fl N Ar addr
+set the NMI (non-maskable interrupt) vector to
+.Ar addr .
+.It Fl P Ar addr
+arrange that subroutine calls to
+.Ar addr
+will behave as if there were an implementation of
+.Xr putchar 3
+at that address, writing the contents of the accumulator to stdout.
+.It Fl R Ar addr
+set the RST (hardware reset) vector.  The processor will transfer
+control to this address when emulated execution begins.
+.It Fl s Ar addr Ar end Ar file
+save the contents of memory from the address
+.Ar addr
+up to
+.Ar end
+(exclusive) to the given
+.Ar file .
+As with the
+.Fl d
+option,
+.Ar end
+can be absolute or '+' followed by a byte count.
+.It Fl v
+print version information and then exit.
+.It Fl X Ar addr
+arrange that any transfer of control to the address
+.Ar addr
+will cause an immediate exit with zero exit status.
+.It Fl x
+exit immediately.  (Useful after
+.Fl d
+or when
+.Nm run6502
+is being used as a trivial 'image editor', with several
+.Fl l
+options followed by
+.Fl s
+and
+.Fl x . )
+.It Ar
+following a
+.Fl B
+option, load one or more ROM image
+files
+into successive paged ROM slots.  Other than the paging aspect, this
+is equivalent to:
+.Bd -ragged -offset indent
+.Fl l Ar 8000 Ar image
+.Ed
+.El
+.\" ----------------------------------------------------------------
+.Sh EXAMPLES
+.\" 
+.Ss A Very Simple Program
+The
+.Xr perl 1
+command can be used to create a binary file from hexadecimal input:
+.Bd -literal
+    echo a2418a20eeffe8e05bd0f7a90a20eeff00 |
+    perl -e 'print pack "H*",<STDIN>' > temp.img
+.Ed
+.Pp
+The file can be loaded and executed with:
+.Bd -literal
+    run6502 -l 1000 temp.img -R 1000 -P FFEE -X 0
+.Ed
+.Pp
+The contents of the file can be inspected symbolically with:
+.Bd -literal
+    run6502 -l 1000 temp.img -d 1000 +12
+.Ed
+.Pp
+The options passed to
+.Nm run6502
+in the above examples have the following effects:
+.Bl -tag -width offset
+.It \-l 1000 temp.img
+loads the file
+.Pa temp.img
+into memory at address 0x1000.
+.It \-R 1000
+sets the reset vector (the address of the first instruction to be executed
+after 'power on') to 0x1000.
+.It \-P FFEE
+arranges for calls to address 0xFFEE to behave as if there were an
+implementation of
+.Xr putchar 3
+at that address.
+.It \-X 0
+arranges for transfers of control to address 0 to exit from the
+emulator.  This works in the above example because the final 'BRK'
+instruction causes an implicit subroutine call through an
+uninitialised interrupt vector to location 0.  To see this
+instruction...
+.It \-d 1000 +12
+disassembles 18 bytes of memory at address 0x1000.
+.El
+.Ss Standalone Images
+The
+.Fl i
+option is designed for use in the 'interpreter command' appearing on
+the first line of an executable script.  Adding the line
+.Bd -literal
+    #!run6502 -R 1000 -P FFEE -X 0 -i 1000
+.Ed
+.Pp
+(with no leading spaces and a single trailing newline character)
+to the
+.Pa temp.img
+file from the first example turns it into a script.  If the file is
+made executable with
+.Bd -literal
+    chmod +x temp.img
+.Ed
+.Pp
+it can be run like a standalone program:
+.Bd -literal
+    ./temp.img
+.Ed
+.Ss A Very Complex Program
+Consider a pair of files named
+.Pa os1.2
+and
+.Pa basic2
+containing (legally-acquired, of course) ROM images of Acorn MOS 1.2
+and BBC Basic 2.  The following command loads each of the images into
+memory at the appropriate address, cleans up the regions of memory
+containing memory-mapped i/o on the BBC computer, saves a snapshot of
+the entire memory to the file
+.Pa image 
+and then exits:
+.Bd -literal
+    run6502 -l C000 os1.2 -l 8000 basic2 -B -s 0 +10000 image -x
+.Ed
+.Pp
+Running the generated image with
+.Bd -literal
+    run6502 image
+.Ed
+.Pp
+will cold-start the emulated hardware, run the OS for a while, and
+then drop into the language ROM.  Basic programs can then be entered,
+edited and run from the terminal.
+.Pp
+More details are given in the
+.Pa README
+file available in the
+.Pa examples
+directory of the distribution.
+.Ss Exercises
+Create a standalone image (one that can be run as a program, with
+a '#!' interpreter line at the beginning) that contains Basic2 and
+OS1.2 (as described above).  This image should be no larger than 32K
+(memory below 0x8000, which would be full of zeroes, should not appear
+in the image file).
+.\" ----------------------------------------------------------------
+.Sh DIAGNOSTICS
+.\" 
+If nothing goes wrong, none.  Otherwise lots.  They should be
+self-explanatory.  I'm too lazy to enumerate them.
+.\" ----------------------------------------------------------------
+.Sh COMPATIBILITY
+.\" 
+See
+.Xr lib6502 3
+for a discussion of the emulated instruction set.
+.\" ----------------------------------------------------------------
+.Sh SEE ALSO
+.\" 
+.Xr lib6502 3
+.Pp
+The file
+.Pa examples/README
+in the lib6502 distribution.  (Depending on your system this may be
+installed in
+.Pa /usr/doc/lib6502 ,
+.Pa /usr/local/doc/lib6502 ,
+.Pa /usr/share/doc/lib6502 ,
+or similar.)
+.Pp
+.Pa http://piumarta.com/software/lib6502
+for updates and documentation to lib6502.
+.Pp
+.Pa https://github.com/ZornsLemma/lib6502-jit
+for updates and documentation to lib6502-jit.
+.Pp
+.Pa http://6502.org
+for lots of 6502-related resources.
+.\" ----------------------------------------------------------------
+.Sh AUTHORS
+.\" 
+The original lib6502 software and manual pages were written by Ian Piumarta.
+Additional changes to create lib6502-jit were made by Steven Flintham.
+.Pp
+The software is provided as-is, with absolutely no warranty, in the
+hope that you will enjoy and benefit from it.  You may use (entirely
+at your own risk) and redistribute it under the terms of a very
+liberal license that does not seek to restrict your rights in any way
+(unlike certain so-called 'open source' licenses that significantly
+limit your freedom in the name of 'free' software that is, ultimately,
+anything but free).  See the file COPYING for details.
+.\" ----------------------------------------------------------------
+.Sh BUGS
+.\" 
+.Bl -bullet
+.It
+Options must appear one at a time.
+.It
+Any attempt (in a load or save operation) to transfer data beyond
+0xFFFF is silently truncated at the end of memory.
+.It
+There is no way to specify the slot into which a ROM image should be
+loaded, other than implicitly according to the order of arguments on
+the command line.
+.It
+Execution can only be started via the emulated power-up reset.  There
+is no support for 'warm-starting' execution in an image at an
+arbitrary address.
+.It
+Even though the emulator fully supports them, there is no way to
+artificially generate a hardware interrupt request, non-maskable
+interrupt, or reset condition.  If you need these, read
+.Xr lib6502 3
+and write your own shell.
+.It
+The Acorn 'BBC Model B' hardware emulation is totally lame.
+.El
+.Pp
+Please send bug reports (and feature requests) to:
+lib6502-jit@lemma.co.uk.
diff --git a/run6502.c b/run6502.c
new file mode 100644
index 0000000..2e3731a
--- /dev/null
+++ b/run6502.c
@@ -0,0 +1,599 @@
+/* run6502.c -- 6502 emulator shell			-*- C -*- */
+
+/* Copyright (c) 2005 Ian Piumarta
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+/* Last edited: 2005-11-02 01:18:58 by piumarta on margaux.local
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+#include <sys/wait.h>
+#include <time.h>
+
+#include "config.h"
+#include "lib6502.h"
+
+#undef VERSION
+#define VERSION	PACKAGE_NAME " " PACKAGE_VERSION " " PACKAGE_COPYRIGHT
+
+typedef uint8_t  byte;
+typedef uint16_t word;
+
+static char *program= 0;
+
+static M6502_Mode mode= M6502_ModeHybrid;
+static int max_insns= 0; /* default */
+
+static byte bank[0x10][0x4000];
+
+static uint64_t system_time_base;
+
+
+void fail(const char *fmt, ...)	/* print a printf-style message and a newline on stderr, then exit(1) */
+{
+  va_list args;
+  va_start(args, fmt);
+  fflush(stdout);			/* flush stdout before anything is written to stderr */
+  vfprintf(stderr, fmt, args);
+  fputc('\n', stderr);
+  va_end(args);
+  exit(1);
+}
+
+
+void pfail(const char *msg)	/* flush stdout, report msg through perror(3), then exit(1) */
+{
+  fflush(stdout);
+  perror(msg);
+  exit(1);
+}
+
+
+#define rts							\
+  {								\
+    word pc;							\
+    pc  = mpu->memory[++mpu->registers->s + 0x100];		\
+    pc |= mpu->memory[++mpu->registers->s + 0x100] << 8;	\
+    return pc + 1;						\
+  }
+
+
+uint64_t pseudo_system_time(void)	/* CLOCK_MONOTONIC reading in hundredths of a second */
+{
+  struct timespec now;
+  if (-1 == clock_gettime(CLOCK_MONOTONIC, &now))
+  {
+    pfail("clock_gettime() failed");
+  }
+  const uint64_t whole= ((uint64_t) now.tv_sec) * 100;	/* seconds, scaled to centiseconds */
+  return whole + (now.tv_nsec / 10000000);		/* plus the centiseconds within the current second */
+}
+
+int osword(M6502 *mpu, word address, byte data)
+{
+  byte *params= mpu->memory + mpu->registers->x + (mpu->registers->y << 8);
+
+  switch (mpu->registers->a)
+    {
+    case 0x00: /* input line */
+      /* On entry: XY+0,1=>string area,
+       *	   XY+2=maximum line length,
+       *	   XY+3=minimum acceptable ASCII value,
+       *	   XY+4=maximum acceptable ASCII value.
+       * On exit:  Y is the line length (excluding CR),
+       *	   C is set if Escape terminated input.
+       */
+      {
+	word  offset= params[0] + (params[1] << 8);
+	byte *buffer= mpu->memory + offset;
+	byte  length= params[2], minVal= params[3], maxVal= params[4], b= 0;
+	if (!fgets((char *) buffer, length, stdin))
+	  {
+	    putchar('\n');
+	    exit(0);
+	  }
+	for (b= 0;  b < length;  ++b)
+	  if ((buffer[b] < minVal) || (buffer[b] > maxVal) || ('\n' == buffer[b]))
+	    break;
+	buffer[b]= 13;
+	mpu->registers->y= b;
+	mpu->registers->p &= 0xFE;
+	break;
+      }
+
+    case 0x01: /* read system time */
+      /* On exit: XY+0..4=>5 byte time in hundredths of a second
+       */
+      {
+	uint64_t system_time= pseudo_system_time() - system_time_base;
+	int i;
+	for (i= 0;  i < 5;  ++i)
+	{
+	  params[i]= system_time & 0xFF;
+	  system_time>>= 8;
+	}
+	break;
+      }
+
+    case 0x05: /* read I/O processor memory */
+      /* On entry: XY+0..3=>address to read from
+       * On exit:  XY+4   =>the byte read
+       */
+    {
+	word addr= params[0] + (params[1] << 8);
+	params[4]= mpu->memory[addr];
+	break;
+    }
+
+    default:
+      {
+	char state[64];
+	M6502_dump(mpu, state);
+	fflush(stdout);
+	fprintf(stderr, "\nOSWORD %s\n", state);
+	fail("ABORT");
+      }
+      break;
+    }
+  
+  rts;
+}
+
+
+int osbyte(M6502 *mpu, word address, byte data)
+{
+  switch (mpu->registers->a)
+    {
+    case 0x7A:	/* perform keyboard scan */
+      mpu->registers->x= 0x00;
+      break;
+
+    case 0x7E:	/* acknowledge detection of escape condition */
+      return 1;
+      break;
+
+    case 0x82:	/* read machine higher order address */
+      mpu->registers->y= 0x00;
+      mpu->registers->x= 0x00;
+      break;
+
+    case 0x83:	/* read top of OS ram address (OSHWM) */
+      mpu->registers->y= 0x0E;
+      mpu->registers->x= 0x00;
+      break;
+
+    case 0x84:	/* read bottom of display ram address */
+      mpu->registers->y= 0x80;
+      mpu->registers->x= 0x00;
+      break;
+
+    case 0x89:	/* motor control */
+      break;
+
+    case 0xDA:	/* read/write number of items in vdu queue (stored at 0x026A) */
+      return 0;
+      break;
+
+    default:
+      {
+	char state[64];
+	M6502_dump(mpu, state);
+	fflush(stdout);
+	fprintf(stderr, "\nOSBYTE %s\n", state);
+	fail("ABORT");
+      }
+      break;
+    }
+
+  rts;
+}
+
+
+int oscli(M6502 *mpu, word address, byte data)	/* call callback: run the CR-terminated command at XY via system(3) */
+{
+  byte *params= mpu->memory + mpu->registers->x + (mpu->registers->y << 8);
+  char  command[1024], *ptr= command;
+  int   ret;
+  while (('*' == *params) || (' ' == *params))	/* skip leading '*'s and spaces */
+    ++params;
+  while ((13 != *params) && (ptr < command + sizeof(command) - 1))	/* stop at CR; never overrun command[] */
+    *ptr++= *params++;
+  *ptr= '\0';
+  ret= system(command);		/* NOTE(review): executes text supplied by the emulated program on the host */
+  if ((ret == -1) || (WIFEXITED(ret) && (WEXITSTATUS(ret) == 127)))
+    {
+      fflush(stdout);
+      fprintf(stderr, "\nsystem() failed\n");
+    }
+  rts;
+}
+
+
+int oswrch(M6502 *mpu, word address, byte data)
+{
+  switch (mpu->registers->a)
+    {
+    case 0x0C:
+      fputs("\033[2J\033[H", stdout);
+      break;
+
+    default:
+      putchar(mpu->registers->a);
+      break;
+    }
+  fflush(stdout);
+  rts;
+}
+
+
+static int writeROM(M6502 *mpu, word address, byte value)	/* write callback installed by doBtraps() on the ROM ranges; does nothing */
+{
+  return 0;
+}
+
+
+static int bankSelect(M6502 *mpu, word address, byte value)	/* write callback: copy the bank picked by value's low 4 bits to 0x8000 */
+{
+  memcpy(mpu->memory + 0x8000, bank[value & 0x0F], 0x4000);
+  return 0;
+}
+
+
+static int doBtraps(int argc, char **argv, M6502 *mpu)
+{
+  unsigned addr;
+
+  /* Acorn Model B ROM and memory-mapped IO */
+
+  for (addr= 0x8000;  addr <= 0xFBFF;  ++addr)  mpu->callbacks->write[addr]= writeROM;
+  for (addr= 0xFC00;  addr <= 0xFEFF;  ++addr)  mpu->memory[addr]= 0xFF;
+  for (addr= 0xFE30;  addr <= 0xFE33;  ++addr)  mpu->callbacks->write[addr]= bankSelect;
+  for (addr= 0xFE40;  addr <= 0xFE4F;  ++addr)  mpu->memory[addr]= 0x00;
+  for (addr= 0xFF00;  addr <= 0xFFFF;  ++addr)  mpu->callbacks->write[addr]= writeROM;
+
+  /* anything already loaded at 0x8000 appears in bank 0 */
+
+  memcpy(bank[0x00], mpu->memory + 0x8000, 0x4000);
+
+  /* fake a few interesting OS calls */
+
+# define trap(vec, addr, func)   mpu->callbacks->call[addr]= (func)
+  trap(0x020C, 0xFFF1, osword);
+  trap(0x020A, 0xFFF4, osbyte);
+//trap(0x0208, 0xFFF7, oscli );	/* enable this to send '*COMMAND's to system(3) :-) */
+  trap(0x020E, 0xFFEE, oswrch);
+  trap(0x020E, 0xE0A4, oswrch);	/* NVWRCH */
+#undef trap
+
+  system_time_base= pseudo_system_time();
+
+  return 0;
+}
+
+
+static void usage(int status)	/* print the option summary (stderr if status is non-zero, else stdout) and exit(status) */
+{
+  FILE *stream= status ? stderr : stdout;
+  fprintf(stream, VERSION"\n");
+  fprintf(stream, "please send bug reports to: %s\n", PACKAGE_BUGREPORT);
+  fprintf(stream, "\n");
+  fprintf(stream, "usage: %s [option ...]\n", program);
+  fprintf(stream, "       %s [option ...] -B [image ...]\n", program);
+  fprintf(stream, "  -B                -- minimal Acorn 'BBC Model B' compatibility\n");
+  fprintf(stream, "  -d addr last      -- dump memory between addr and last\n");
+  fprintf(stream, "  -G addr           -- emulate getchar(3) at addr\n");
+  fprintf(stream, "  -h                -- help (print this message)\n");
+  fprintf(stream, "  -I addr           -- set IRQ vector\n");
+  fprintf(stream, "  -i addr file      -- load file at addr, skipping any initial '#!' line\n");
+  fprintf(stream, "  -l addr file      -- load file at addr\n");
+  fprintf(stream, "  -M addr           -- emulate memory-mapped stdio at addr\n");
+  fprintf(stream, "  -mc               -- use compiled emulation mode\n");
+  fprintf(stream, "  -mh               -- use hybrid emulation mode (default)\n");
+  fprintf(stream, "  -mi               -- use interpreted emulation mode\n");
+  fprintf(stream, "  -mx count         -- maximum instructions to JIT (-mc/-mh)\n");
+  fprintf(stream, "  -N addr           -- set NMI vector\n");
+  fprintf(stream, "  -P addr           -- emulate putchar(3) at addr\n");
+  fprintf(stream, "  -R addr           -- set RST vector\n");
+  fprintf(stream, "  -s addr last file -- save memory from addr to last in file\n");
+  fprintf(stream, "  -v                -- print version number then exit\n");
+  fprintf(stream, "  -X addr           -- terminate emulation if PC reaches addr\n");
+  fprintf(stream, "  -x                -- exit without further ado\n");
+  fprintf(stream, "  image             -- '-l 8000 image' in available ROM slot\n");
+  fprintf(stream, "\n");
+  fprintf(stream, "'last' can be an address (non-inclusive) or '+size' (in bytes)\n");
+  exit(status);
+}
+
+static int doHelp(int argc, char **argv, M6502 *mpu)
+{
+  usage(0);
+  return 0;
+}
+
+
+static int doVersion(int argc, char **argv, M6502 *mpu)
+{
+  puts(VERSION);
+  exit(0);
+  return 0;
+}
+
+
+static unsigned long htol(char *hex)	/* parse a complete hexadecimal argument; fail() if it is not one */
+{
+  char *end;
+  unsigned long l= strtoul(hex, &end, 16);
+  if ((end == hex) || *end) fail("bad hex number: %s", hex);	/* reject "" (no digits) as well as trailing junk */
+  return l;
+}
+
+
+static int loadInterpreter(M6502 *mpu, word start, const char *path)	/* load a '#!' script image at start; 1 if loaded, else 0 */
+{
+  FILE   *file= 0;
+  int     count= 0;
+  byte   *memory= mpu->memory + start;
+  size_t  max= 0x10000 - start;		/* room left before address 0x10000 */
+  int     c= 0;
+  if (!(file= fopen(path, "r")))
+    return 0;
+  if (('#' != fgetc(file)) || ('!' != fgetc(file)))
+    { fclose(file);  return 0; }	/* not a '#!' file: close it rather than leak the stream */
+  while ((c= fgetc(file)) >= ' ') ;	/* skip the interpreter line up to its first control character (or EOF) */
+  while ((count= fread(memory, 1, max, file)) > 0)
+    {
+      memory += count;
+      max -= count;
+    }
+  fclose(file);
+  return 1;
+}
+
+
+static int save(M6502 *mpu, word address, unsigned length, const char *path)	/* write length bytes of memory from address to path; 1 on success */
+{
+  FILE   *file= 0;
+  size_t  count= 0;
+  if (length > 0x10000u - address)  length= 0x10000u - address;	/* truncate at the end of memory, as load() does */
+  if (!(file= fopen(path, "w")))
+    return 0;
+  while (length && (count= fwrite(mpu->memory + address, 1, length, file)))
+    {
+      address += count;
+      length -= count;
+    }
+  return (0 == fclose(file)) && (0 == length);	/* a short write or a failed close is reported as failure */
+}
+
+
+static int load(M6502 *mpu, word address, const char *path)	/* read path into memory at address; 0 if it cannot be opened, else 1 */
+{
+  size_t room= 0x10000 - address;	/* bytes left before address 0x10000 */
+  FILE  *in= fopen(path, "r");
+  if (!in) return 0;
+  for (;;)
+    {
+      int got= fread(mpu->memory + address, 1, room, in);
+      if (got <= 0) break;		/* end of file, error, or no room left */
+      address += got;
+      room -= got;
+    }
+  fclose(in);
+  return 1;
+}
+
+
+static int doLoadInterpreter(int argc, char **argv, M6502 *mpu)
+{
+  if (argc < 3) usage(1);
+  if (!loadInterpreter(mpu, htol(argv[1]), argv[2])) pfail(argv[2]);
+  return 2;
+}
+
+
+static int doLoad(int argc, char **argv, M6502 *mpu)	/* -l addr file */
+{
+  if (argc < 3) usage(1);
+  if (!load(mpu, htol(argv[1]), argv[2])) pfail(argv[2]);
+  return 2;
+}
+
+
+static int doSave(int argc, char **argv, M6502 *mpu)	/* -s addr end file; end is exclusive: absolute, or '+' and a byte count */
+{
+  if (argc < 4) usage(1);
+  unsigned addr= htol(argv[1]), last= ('+' == *argv[2]) ? addr + htol(1 + argv[2]) : htol(argv[2]);
+  if (!save(mpu, addr, (last > addr) ? last - addr : 0, argv[3])) pfail(argv[3]);	/* same range rule as -d */
+  return 3;
+}
+
+static int doMode(M6502_Mode m)
+{
+  mode= m;
+  return 0;
+}
+
+
+static int doMaxInsns(int argc, char **argv, M6502 *mpu)	/* -mx count: decimal count stored in max_insns */
+{
+  if (argc < 2) usage(1);
+  char *end;
+  long l= strtol(argv[1], &end, 10);
+  if ((end == argv[1]) || *end || (l < 0) || (l != (long) (int) l)) fail("bad number: %s", argv[1]);	/* empty, junk, negative, or too big for int */
+  max_insns= (int) l;
+  return 1;
+}
+
+
+#define doVEC(VEC)					\
+  static int do##VEC(int argc, char **argv, M6502 *mpu)	\
+    {							\
+      unsigned addr= 0;					\
+      if (argc < 2) usage(1);				\
+      addr= htol(argv[1]);				\
+      M6502_setVector(mpu, VEC, addr);			\
+      return 1;						\
+    }
+
+doVEC(IRQ);
+doVEC(NMI);
+doVEC(RST);
+
+#undef doVEC
+
+
+static int gTrap(M6502 *mpu, word addr, byte data)	{ mpu->registers->a= getchar();  rts; }
+static int pTrap(M6502 *mpu, word addr, byte data)	{ putchar(mpu->registers->a);  rts; }
+
+static int doGtrap(int argc, char **argv, M6502 *mpu)
+{
+  unsigned addr;
+  if (argc < 2) usage(1);
+  addr= htol(argv[1]);
+  M6502_setCallback(mpu, call, addr, gTrap);
+  return 1;
+}
+
+static int doPtrap(int argc, char **argv, M6502 *mpu)
+{
+  unsigned addr;
+  if (argc < 2) usage(1);
+  addr= htol(argv[1]);
+  M6502_setCallback(mpu, call, addr, pTrap);
+  return 1;
+}
+
+
+static int mTrapRead(M6502 *mpu, word addr, byte data)	{ return getchar(); }
+static int mTrapWrite(M6502 *mpu, word addr, byte data)	{ return putchar(data); }
+
+static int doMtrap(int argc, char **argv, M6502 *mpu)
+{
+  unsigned addr= 0;
+  if (argc < 2) usage(1);
+  addr= htol(argv[1]);
+  M6502_setCallback(mpu, read,  addr, mTrapRead);
+  M6502_setCallback(mpu, write, addr, mTrapWrite);
+  return 1;
+}
+
+
+static int xTrap(M6502 *mpu, word addr, byte data)	{ exit(0);  return 0; }
+
+static int doXtrap(int argc, char **argv, M6502 *mpu)
+{
+  unsigned addr= 0;
+  if (argc < 2) usage(1);
+  addr= htol(argv[1]);
+  M6502_setCallback(mpu, call, addr, xTrap);
+  return 1;
+}
+
+
+static int doDisassemble(int argc, char **argv, M6502 *mpu)	/* -d addr end: list the instructions in [addr, end) */
+{
+  unsigned addr= 0, last= 0;
+  if (argc < 3) usage(1);
+  addr= htol(argv[1]);
+  last= ('+' == *argv[2]) ? addr + htol(1 + argv[2]) : htol(argv[2]);	/* end is absolute, or '+' and a byte count */
+  if (last > 0x10000) last= 0x10000;	/* stop at the end of the 64K image */
+  while (addr < last)
+    {
+      char insn[64];
+      int  i= 0, size= M6502_disassemble(mpu, addr, insn);
+      printf("%04X ", addr);
+      while (i++ < size)  printf("%02X", mpu->memory[(addr + i - 1) & 0xFFFF]);	/* wrap rather than read past memory */
+      while (i++ < 4)     printf("  ");
+      putchar(' ');
+      i= 0;
+      while (i++ < size)  putchar(isgraph(mpu->memory[(addr + i - 1) & 0xFFFF]) ? mpu->memory[(addr + i - 1) & 0xFFFF] : ' ');
+      while (i++ < 4)     putchar(' ');
+      printf(" %s\n", insn);
+      addr += size;
+    }
+  return 2;
+}
+
+int main(int argc, char **argv)	/* process the command line, then reset and run the emulated 6502 */
+{
+  M6502 *mpu= M6502_new(0, 0, 0);
+  int bTraps= 0;
+
+  program= argv[0];
+
+  if ((2 == argc) && ('-' != *argv[1]))	/* lone non-option argument: load it at address 0 and install the -B traps */
+    {
+      if ((!loadInterpreter(mpu, 0, argv[1])) && (!load(mpu, 0, argv[1])))
+	pfail(argv[1]);
+      doBtraps(0, 0, mpu);
+    }
+  else
+    while (++argv, --argc > 0)
+      {
+	int n= 0;	/* number of extra arguments consumed by this option */
+	if      (!strcmp(*argv, "-B"))  bTraps= 1;
+	else if (!strcmp(*argv, "-d"))	n= doDisassemble(argc, argv, mpu);
+	else if (!strcmp(*argv, "-G"))	n= doGtrap(argc, argv, mpu);
+	else if (!strcmp(*argv, "-h"))	n= doHelp(argc, argv, mpu);
+	else if (!strcmp(*argv, "-i"))	n= doLoadInterpreter(argc, argv, mpu);
+	else if (!strcmp(*argv, "-I"))	n= doIRQ(argc, argv, mpu);
+	else if (!strcmp(*argv, "-l"))	n= doLoad(argc, argv, mpu);
+	else if (!strcmp(*argv, "-M"))	n= doMtrap(argc, argv, mpu);
+	else if (!strcmp(*argv, "-mc")) n= doMode(M6502_ModeCompiled);
+	else if (!strcmp(*argv, "-mh")) n= doMode(M6502_ModeHybrid);
+	else if (!strcmp(*argv, "-mi")) n= doMode(M6502_ModeInterpreted);
+	else if (!strcmp(*argv, "-mx")) n= doMaxInsns(argc, argv, mpu);
+	else if (!strcmp(*argv, "-N"))	n= doNMI(argc, argv, mpu);
+	else if (!strcmp(*argv, "-P"))	n= doPtrap(argc, argv, mpu);
+	else if (!strcmp(*argv, "-R"))	n= doRST(argc, argv, mpu);
+	else if (!strcmp(*argv, "-s"))	n= doSave(argc, argv, mpu);
+	else if (!strcmp(*argv, "-v"))	n= doVersion(argc, argv, mpu);
+	else if (!strcmp(*argv, "-X"))	n= doXtrap(argc, argv, mpu);
+	else if (!strcmp(*argv, "-x"))	exit(0);
+	else if ('-' == **argv)		usage(1);
+	else
+	  {
+	    /* stage each image at 0x8000 and copy it into the next bank, from 15 down; */
+	    /* doBtraps(), run after this loop, copies what is then at 0x8000 into bank 0 */
+	    static int bankSel= 0x0F;
+	    if (!bTraps)			usage(1);
+	    if (bankSel < 0)			fail("too many images");
+	    if (!load(mpu, 0x8000, argv[0]))	pfail(argv[0]);
+	    memcpy(bank[bankSel--],
+		   0x8000 + mpu->memory,
+		   0x4000);
+	    n= 0;
+	  }
+	argc -= n;
+	argv += n;
+      }
+
+  M6502_setMode(mpu, mode, max_insns);
+
+  if (bTraps)	/* -B was seen: install the traps now that every option has been processed */
+    doBtraps(0, 0, mpu);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);
+
+  return 0;
+}
diff --git a/test/addr-wrap-1.mst b/test/addr-wrap-1.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/addr-wrap-1.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/addr-wrap-1.xa b/test/addr-wrap-1.xa
new file mode 100644
index 0000000..a49e9d4
--- /dev/null
+++ b/test/addr-wrap-1.xa
@@ -0,0 +1,25 @@
+#include "config.xa"
+
+	LDA #1
+	STA $00
+	STA $05
+	STA $0A
+	LDY #$80
+	CLC
+	LDA #0
+LOOP
+	ADC $FF80,Y
+	INY
+	BNE LOOP
+	CMP #3
+	BNE FAIL
+
+SUCCESS
+	LDA #'Y'
+	JSR OSWRCH
+	JMP QUIT
+
+FAIL
+	LDA #'N'
+	JSR OSWRCH
+	JMP QUIT
diff --git a/test/basic-callback.c b/test/basic-callback.c
new file mode 100644
index 0000000..d2ffb27
--- /dev/null
+++ b/test/basic-callback.c
@@ -0,0 +1,122 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+int done(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  char buffer[64];
+  M6502_dump_masked(mpu, buffer);
+  printf("\nBRK instruction: address %04X opcode %02X\n%s\n", address, data, buffer);
+  exit(0);
+}
+
+int call(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  char buffer[64];
+  M6502_dump_masked(mpu, buffer);
+  printf("\ncall: address %04X opcode %02X\n%s\n", address, data, buffer);
+  return 0;
+}
+
+int rd(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  char buffer[64];
+  M6502_dump_masked(mpu, buffer);
+  printf("\nrd: address %04X opcode %02X\n%s\n", address, data, buffer);
+  return 0;
+}
+
+int wr(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  char buffer[64];
+  M6502_dump_masked(mpu, buffer);
+  printf("\nwr: address %04X opcode %02X\n%s\n", address, data, buffer);
+  return 0;
+}
+
+int ill(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  char buffer[64];
+  M6502_dump_masked(mpu, buffer);
+  printf("\nill: address %04X opcode %02X memory %02X\n%s\n", address, data, mpu->memory[address], buffer);
+  return 0;
+}
+
+int main(int argc, char *argv[])
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+
+  unsigned  pc  = 0x1000;
+
+  M6502_setCallback(mpu, call,                     0, done);
+  M6502_setCallback(mpu, call,                0x2000, call);
+  M6502_setCallback(mpu, call,                0x3000, call);
+  M6502_setCallback(mpu, call,                0x4000, call);
+  M6502_setCallback(mpu, read,                0x5000, rd  );
+  M6502_setCallback(mpu, write,               0x5000, wr  );
+  M6502_setCallback(mpu, illegal_instruction,   0x13, ill );
+  M6502_setCallback(mpu, illegal_instruction,   0x44, ill );
+  M6502_setCallback(mpu, illegal_instruction,   0x5c, ill );
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))
+# define gen2(X,Y)	gen1(X); gen1(Y)
+# define gen3(X,Y,Z)	gen1(X); gen2(Y,Z)
+
+  gen1(0x13          );
+  gen1(0x44          );
+  gen1(0x13          ); // not executed, 0x44 is a two-byte illegal instruction
+  gen1(0x5C          );
+  gen1(0x13          ); // not executed, 0x5C is a two-byte illegal instruction
+  gen1(0x13          ); // not executed, 0x5C is a two-byte illegal instruction
+  gen3(0x20,0x00,0x20); // JSR &2000
+  gen3(0xad,0x00,0x50); // LDA &5000
+  gen2(0x64,0x70     ); // STZ &70
+  gen2(0xa9,0x50     ); // LDA #&50
+  gen2(0x85,0x71     ); // STA &71
+  gen2(0xb2,0x70     ); // LDA (&70)
+  gen2(0x92,0x70     ); // STA (&70)
+  gen2(0x00,0x00     ); // BRK
+
+  pc = 0x2000;
+  gen3(0x8d,0x00,0x50); // STA &5000
+  gen3(0x4c,0x00,0x30); // JMP &3000
+
+  pc = 0x3000;
+  gen2(0xa9,0x00     ); // LDA #0
+  gen3(0x8d,0x76,0x32); // STA &3276
+  gen2(0xa9,0x40     ); // LDA #&40
+  gen3(0x8d,0x77,0x32); // STA &3277
+  gen3(0x6c,0x76,0x32); // JMP (&3276)
+
+  pc = 0x4000;
+  gen1(0x60          ); // RTS
+
+  M6502_setVector(mpu, RST, 0x1000);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/test/basic-callback.mst b/test/basic-callback.mst
new file mode 100644
index 0000000..2c713d3
--- /dev/null
+++ b/test/basic-callback.mst
@@ -0,0 +1,33 @@
+
+ill: address 1000 opcode 13 memory 13
+PC=1001 SP=0100 A=00 X=00 Y=00 P=04 -----I--
+
+ill: address 1001 opcode 44 memory 44
+PC=1003 SP=0100 A=00 X=00 Y=00 P=04 -----I--
+
+ill: address 1003 opcode 5C memory 5C
+PC=1006 SP=0100 A=00 X=00 Y=00 P=04 -----I--
+
+call: address 2000 opcode 20
+PC=1009 SP=01FE A=00 X=00 Y=00 P=04 -----I--
+
+wr: address 5000 opcode 00
+PC=1009 SP=01FE A=00 X=00 Y=00 P=04 -----I--
+
+call: address 3000 opcode 4C
+PC=3000 SP=01FE A=00 X=00 Y=00 P=04 -----I--
+
+call: address 4000 opcode 6C
+PC=4000 SP=01FE A=40 X=00 Y=00 P=04 -----I--
+
+rd: address 5000 opcode 00
+PC=4000 SP=01FE A=40 X=00 Y=00 P=04 -----I--
+
+rd: address 5000 opcode 00
+PC=4000 SP=01FE A=40 X=00 Y=00 P=04 -----I--
+
+wr: address 5000 opcode 00
+PC=4000 SP=01FE A=40 X=00 Y=00 P=04 -----I--
+
+BRK instruction: address 1016 opcode 00
+PC=1018 SP=01FD A=00 X=00 Y=00 P=06 -----IZ-
diff --git a/test/call-illegal-callback-modify-code.c b/test/call-illegal-callback-modify-code.c
new file mode 100644
index 0000000..bf5ec76
--- /dev/null
+++ b/test/call-illegal-callback-modify-code.c
@@ -0,0 +1,121 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+static uint16_t call_modify1_addr; /* emulated address of the byte call() increments by 1 */
+static uint16_t call_modify2_addr; /* emulated address of the byte call() increments by 2 */
+static uint16_t ill_modify1_addr; /* emulated address of the byte ill() increments by 1 */
+static uint16_t ill_modify2_addr; /* emulated address of the byte ill() increments by 2 */
+
+int done(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for address 0 */
+{
+  char buffer[64]; /* same size as the buffer[64] parameter of M6502_dump_masked() */
+  M6502_dump_masked(mpu, buffer);
+  printf("\nBRK instruction: address %04X opcode %02X\n%s\n", address, data, buffer);
+  exit(0); /* ends the test process once the BRK has been reported */
+}
+
+int call(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for &2000 */
+{
+  char buffer[64];
+  M6502_dump_masked(mpu, buffer);
+  printf("\ncall: address %04X opcode %02X\n%s\n", address, data, buffer);
+  mpu->memory[call_modify1_addr] += 1; /* patch two of the LDA # operands main() recorded, */
+  mpu->memory[call_modify2_addr] += 2; /* changing the letters the routine at &3000 prints */
+  return 0;
+}
+
+int ill(M6502 *mpu, uint16_t address, uint8_t data) /* illegal-instruction callback for &13 */
+{
+  char buffer[64];
+  M6502_dump_masked(mpu, buffer);
+  printf("\nill: address %04X opcode %02X memory %02X\n%s\n", address, data, mpu->memory[address], buffer);
+  mpu->memory[ill_modify1_addr] += 1; /* patch the other two LDA # operands main() recorded, */
+  mpu->memory[ill_modify2_addr] += 2; /* changing the letters the routine at &3000 prints */
+  return 0;
+}
+
+int oswrch(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for &FFEE */
+{
+  putchar(mpu->registers->a); /* write the emulated A register to stdout as one character */
+  mpu->memory[0xffee] = 0x60; // RTS
+  return 0;
+}
+
+int main(int argc, char *argv[])
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+  /* pc: address in emulated memory where the gen macros below store the next byte. */
+  unsigned  pc  = 0x1000;
+
+  M6502_setCallback(mpu, call,                     0, done  );
+  M6502_setCallback(mpu, call,                0x2000, call  );
+  M6502_setCallback(mpu, illegal_instruction,   0x13, ill   );
+  M6502_setCallback(mpu, call,                0xffee, oswrch);
+  /* gen1/gen2/gen3 store one, two or three bytes at pc, advancing it after each byte. */
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))
+# define gen2(X,Y)	gen1(X); gen1(Y)
+# define gen3(X,Y,Z)	gen1(X); gen2(Y,Z)
+
+  gen3(0x20,0x00,0x30); // JSR &3000
+  gen1(0x13          ); // ill &13
+  gen3(0x20,0x00,0x30); // JSR &3000
+  gen1(0x13          ); // ill &13
+  gen3(0x20,0x00,0x30); // JSR &3000
+  gen3(0x20,0x00,0x20); // JSR &2000
+  gen3(0x20,0x00,0x30); // JSR &3000
+  gen3(0x20,0x00,0x20); // JSR &2000
+  gen3(0x20,0x00,0x30); // JSR &3000
+  gen2(0x00,0x00     ); // BRK
+
+  pc = 0x2000;
+  gen1(0x60          ); // RTS
+  /* Routine at &3000: prints 'C', four letters the callbacks patch, and a newline. */
+  pc = 0x3000;
+  gen2(0xa9,'C'      ); // LDA #'C'
+  gen3(0x20,0xee,0xff); // JSR &FFEE
+  call_modify1_addr = pc + 1; // operand byte of the LDA # generated on the next line
+  gen2(0xa9,'A'      ); // LDA #'A'
+  gen3(0x20,0xee,0xff); // JSR &FFEE
+  call_modify2_addr = pc + 1; // operand byte of the LDA # generated on the next line
+  gen2(0xa9,'A'      ); // LDA #'A'
+  gen3(0x20,0xee,0xff); // JSR &FFEE
+  ill_modify1_addr = pc + 1; // operand byte of the LDA # generated on the next line
+  gen2(0xa9,'A'      ); // LDA #'A'
+  gen3(0x20,0xee,0xff); // JSR &FFEE
+  ill_modify2_addr = pc + 1; // operand byte of the LDA # generated on the next line
+  gen2(0xa9,'A'      ); // LDA #'A'
+  gen3(0x20,0xee,0xff); // JSR &FFEE
+  gen2(0xa9,'\n'     ); // LDA #'\n'
+  gen3(0x20,0xee,0xff); // JSR &FFEE
+  gen1(0x60          ); // RTS
+
+  M6502_setVector(mpu, RST, 0x1000);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/test/call-illegal-callback-modify-code.mst b/test/call-illegal-callback-modify-code.mst
new file mode 100644
index 0000000..cc5acff
--- /dev/null
+++ b/test/call-illegal-callback-modify-code.mst
@@ -0,0 +1,20 @@
+CAAAA
+
+ill: address 1003 opcode 13 memory 13
+PC=1004 SP=0100 A=0A X=00 Y=00 P=04 -----I--
+CAABC
+
+ill: address 1007 opcode 13 memory 13
+PC=1008 SP=0100 A=0A X=00 Y=00 P=04 -----I--
+CAACE
+
+call: address 2000 opcode 20
+PC=100E SP=01FE A=0A X=00 Y=00 P=04 -----I--
+CBCCE
+
+call: address 2000 opcode 20
+PC=1014 SP=01FE A=0A X=00 Y=00 P=04 -----I--
+CCECE
+
+BRK instruction: address 1017 opcode 00
+PC=1019 SP=01FD A=0A X=00 Y=00 P=04 -----I--
diff --git a/test/config.xa b/test/config.xa
new file mode 100644
index 0000000..a7e0560
--- /dev/null
+++ b/test/config.xa
@@ -0,0 +1,4 @@
+OSWRCH = $FFEE ; same address as the -P ffee argument run-run6502-tests.py gives run6502
+QUIT = $F000 ; same address as the -X f000 argument run-run6502-tests.py gives run6502
+; Assemble at 1E00, the address run-run6502-tests.py passes to both -l and -R.
+*= $1E00
diff --git a/test/interleave.mst b/test/interleave.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/interleave.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/interleave.xa b/test/interleave.xa
new file mode 100644
index 0000000..8fb5ee0
--- /dev/null
+++ b/test/interleave.xa
@@ -0,0 +1,38 @@
+#include "config.xa"
+; Enter the overlapping routine below at each label in turn and check X after every call.
+	JSR SETX10
+	CPX #10
+	BNE FAIL
+	JSR SETX30
+	CPX #30
+	BNE FAIL
+	JSR SETX20
+	CPX #20
+	BNE FAIL
+	JSR SETX30
+	CPX #30
+	BNE FAIL
+	JSR SETX10
+	CPX #10
+	BNE FAIL
+	JSR SETX20
+	CPX #20
+	BNE FAIL
+
+SUCCESS
+	LDA #'Y'
+	JSR OSWRCH
+	JMP QUIT
+
+FAIL
+	LDA #'N'
+	JSR OSWRCH
+	JMP QUIT
+
+; example taken from http://www.6502.org/tutorials/6502opcodes.html
+SETX10	LDX #10
+	.byte $2C ; BIT absolute opcode, it takes the two bytes of the next LDX as its operand
+SETX20	LDX #20
+	.byte $2C ; same trick, so LDX #30 is skipped when entered at SETX10 or SETX20
+SETX30	LDX #30
+	RTS
diff --git a/test/irq-nmi.c b/test/irq-nmi.c
new file mode 100644
index 0000000..ae95352
--- /dev/null
+++ b/test/irq-nmi.c
@@ -0,0 +1,116 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+int brk(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for &3000 */
+{
+  char buffer[64]; /* same size as the buffer[64] parameter of M6502_dump_masked() */
+  M6502_dump_masked(mpu, buffer);
+  printf("\nBRK: address %04X opcode %02X\n%s\n", address, data, buffer);
+  exit(0); /* the BRK is the last instruction main() generates, so the test ends here */
+}
+
+int ill(M6502 *mpu, uint16_t address, uint8_t data) /* illegal-instruction callback (&03, &13) */
+{
+  char buffer[64];
+  M6502_dump_masked(mpu, buffer);
+  printf("\nill: address %04X opcode %02X memory %02X\n%s\n", address, data, mpu->memory[address], buffer);
+  if (data == 0x03) /* main() uses opcode &03 to mean "raise an NMI now" */
+  {
+    M6502_nmi(mpu);
+  }
+  else if (data == 0x13) /* main() uses opcode &13 to mean "raise an IRQ now" */
+  {
+    M6502_irq(mpu);
+  } 
+
+  return 0;
+}
+
+int oswrch(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for &FFEE */
+{
+  putchar(mpu->registers->a); /* write the emulated A register to stdout as one character */
+  mpu->memory[0xffee] = 0x60; // RTS
+  return 0;
+}
+
+
+int main(int argc, char *argv[])
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+  /* pc: address in emulated memory where the gen macros below store the next byte. */
+  unsigned  pc  = 0x1000;
+
+  /* 0x3000 is the IRQ/BRK vector, but call callbacks don't trigger on
+   * interrupts, so this is only called on BRK.
+   */
+  M6502_setCallback(mpu, call,                0x3000, brk   );
+
+  M6502_setCallback(mpu, illegal_instruction,   0x03, ill   );
+  M6502_setCallback(mpu, illegal_instruction,   0x13, ill   );
+  M6502_setCallback(mpu, call,                0xffee, oswrch);
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))
+# define gen2(X,Y)	gen1(X); gen1(Y)
+# define gen3(X,Y,Z)	gen1(X); gen2(Y,Z)
+
+  gen1(0x58          ); // CLI
+  gen2(0xa9,'A'      ); // LDA #'A'
+  gen3(0x20,0xee,0xff); // JSR &ffee
+  gen1(0x03          ); // NMI
+  gen2(0xa9,'B'      ); // LDA #'B'
+  gen3(0x20,0xee,0xff); // JSR &ffee
+  gen1(0x13          ); // IRQ
+  gen2(0xa9,'C'      ); // LDA #'C'
+  gen3(0x20,0xee,0xff); // JSR &ffee
+  gen1(0x78          ); // SEI
+  gen1(0x13          ); // IRQ (ignored)
+  gen1(0x03          ); // NMI
+  gen1(0x13          ); // IRQ (ignored)
+  gen2(0xa9,'D'      ); // LDA #'D'
+  gen3(0x20,0xee,0xff); // JSR &ffee
+  gen1(0x58          ); // CLI
+  gen1(0x13          ); // IRQ
+  gen2(0x00,0x00     ); // BRK
+  /* NMI handler (the NMI vector is set to &2000 below): print 'N', then RTI. */
+  pc = 0x2000;
+  gen2(0xa9,'N'      ); // LDA #'N'
+  gen3(0x20,0xee,0xff); // JSR &ffee
+  gen1(0x40          ); // RTI
+  /* IRQ handler (the IRQ vector is set to &3000 below): print 'I', then RTI. */
+  pc = 0x3000;
+  gen2(0xa9,'I'      ); // LDA #'I'
+  gen3(0x20,0xee,0xff); // JSR &ffee
+  gen1(0x40          ); // RTI
+
+  M6502_setVector(mpu, RST, 0x1000);
+  M6502_setVector(mpu, NMI, 0x2000);
+  M6502_setVector(mpu, IRQ, 0x3000);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/test/irq-nmi.mst b/test/irq-nmi.mst
new file mode 100644
index 0000000..bf7d32b
--- /dev/null
+++ b/test/irq-nmi.mst
@@ -0,0 +1,21 @@
+A
+ill: address 1006 opcode 03 memory 03
+PC=1007 SP=0100 A=41 X=00 Y=00 P=00 --------
+NB
+ill: address 100C opcode 13 memory 13
+PC=100D SP=0100 A=42 X=00 Y=00 P=00 --------
+IC
+ill: address 1013 opcode 13 memory 13
+PC=1014 SP=0100 A=43 X=00 Y=00 P=04 -----I--
+
+ill: address 1014 opcode 03 memory 03
+PC=1015 SP=0100 A=43 X=00 Y=00 P=04 -----I--
+N
+ill: address 1015 opcode 13 memory 13
+PC=1016 SP=0100 A=4E X=00 Y=00 P=04 -----I--
+D
+ill: address 101C opcode 13 memory 13
+PC=101D SP=0100 A=44 X=00 Y=00 P=00 --------
+I
+BRK: address 101D opcode 00
+PC=101F SP=01FD A=49 X=00 Y=00 P=04 -----I--
diff --git a/test/pc-wrap-1.mst b/test/pc-wrap-1.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/pc-wrap-1.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/pc-wrap-1.xa b/test/pc-wrap-1.xa
new file mode 100644
index 0000000..c703803
--- /dev/null
+++ b/test/pc-wrap-1.xa
@@ -0,0 +1,28 @@
+#include "config.xa"
+
+; It's not important this is self-modifying code, this is just the easiest way
+; to get code at the relevant addresses without fighting with the assembler and
+; the fact run6502 will clobber the top of memory to set up various vectors.
+
+	LDA #$A9 ; LDA #n
+	STA $FFFE
+	STA $00
+	LDA #'N'
+	STA $FFFF
+	LDA #'Y'
+	STA $01
+; Addresses 2-7 receive JSR OSWRCH then JMP QUIT, which follow the LDA stored at address 0.
+	LDA #$20 ; JSR abs
+	STA $02
+	LDA #$EE
+	STA $03
+	LDA #$FF
+	STA $04
+	LDA #$4C ; JMP abs
+	STA $05
+	LDA #<QUIT
+	STA $06
+	LDA #>QUIT
+	STA $07
+; LDA #N now sits at FFFE-FFFF, so reaching the LDA #Y at 0000 needs the PC to wrap.
+	JMP $FFFE
diff --git a/test/pc-wrap-2.mst b/test/pc-wrap-2.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/pc-wrap-2.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/pc-wrap-2.xa b/test/pc-wrap-2.xa
new file mode 100644
index 0000000..c70763f
--- /dev/null
+++ b/test/pc-wrap-2.xa
@@ -0,0 +1,28 @@
+#include "config.xa"
+
+; It's not important this is self-modifying code, this is just the easiest way
+; to get code at the relevant addresses without fighting with the assembler and
+; the fact run6502 will clobber the top of memory to set up various vectors.
+
+	LDA #$A9 ; LDA #n
+	STA $FFFD
+	STA $FFFF
+	LDA #'N'
+	STA $FFFE
+	LDA #'Y'
+	STA $00
+; Addresses 1-6 receive JSR OSWRCH then JMP QUIT, which follow the operand stored at address 0.
+	LDA #$20 ; JSR abs
+	STA $01
+	LDA #$EE
+	STA $02
+	LDA #$FF
+	STA $03
+	LDA #$4C ; JMP abs
+	STA $04
+	LDA #<QUIT
+	STA $05
+	LDA #>QUIT
+	STA $06
+; The LDA opcode at FFFF has its operand (Y) at 0000, so this instruction straddles the wrap.
+	JMP $FFFD
diff --git a/test/run-c-tests.py b/test/run-c-tests.py
new file mode 100755
index 0000000..b4a628e
--- /dev/null
+++ b/test/run-c-tests.py
@@ -0,0 +1,33 @@
+#!/usr/bin/python
+
+from __future__ import print_function
+import subprocess
+
+# Test programs under test/; the expected stdout of <name> is stored in test/<name>.mst.
+tests = [
+    'basic-callback',
+    'call-illegal-callback-modify-code',
+    'irq-nmi',
+    'setjmp-trick',
+    'stack-code-brk',
+    'stack-code-jsr',
+    'write-callback-modify-code'
+]
+# Emulator-mode arguments; every test is run once per entry.
+test_args = ['-mi', '-mh', '-mc -mx 1', '-mc']
+
+# Emit TAP: the plan line, then one "ok"/"not ok" line per (mode, test) pair.
+print('1..', len(tests) * len(test_args), sep='')
+i = 1
+for test_arg in test_args:
+    for test in tests:
+        # Popen rather than check_output: a test that exits with a non-zero status
+        # must be reported as "not ok", not abort the whole run with an exception.
+        proc = subprocess.Popen(['test/' + test] + test_arg.split(),
+                                stdout=subprocess.PIPE)
+        result = proc.communicate()[0]
+        with open('test/' + test + '.mst', 'rb') as f:
+            expected_result = f.read()
+        passed = (proc.returncode == 0 and result == expected_result)
+        print('ok' if passed else 'not ok', i, test, test_arg)
+        i += 1
diff --git a/test/run-c-tests.sh b/test/run-c-tests.sh
new file mode 100755
index 0000000..7c60f3c
--- /dev/null
+++ b/test/run-c-tests.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+python test/run-c-tests.py # uses test/-relative paths, so run from the directory containing test/
diff --git a/test/run-run6502-tests.py b/test/run-run6502-tests.py
new file mode 100755
index 0000000..378989e
--- /dev/null
+++ b/test/run-run6502-tests.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+
+from __future__ import print_function
+import glob
+import os
+import subprocess
+
+os.chdir('test')
+
+# It's quite likely the "xa" assembler is not installed; don't generate
+# scary test failures if that's the case.
+try:
+    xa_installed = b'xa65' in subprocess.check_output(['xa', '--version'])
+except (OSError, subprocess.CalledProcessError):
+    # xa is missing or refused to run. Only these errors are caught, so that
+    # e.g. KeyboardInterrupt is not swallowed the way a bare "except:" would.
+    xa_installed = False
+
+# By default we skip slow tests (those with names starting z-) in '-mc'
+# modes.
+skip_slow_mc = (os.getenv('RUN_SLOW_TESTS', '0') == '0')
+
+# Since we didn't have to hard-code the test names in the Makefile.am, we
+# use wildcards here.
+tests = sorted([t for t in glob.glob('*.xa') if t != 'config.xa'])
+
+# Emulator-mode arguments; every test is run once per entry.
+test_args = ['-mi', '-mh', '-mc -mx 1', '-mc']
+
+# Emit TAP: the plan line, then one result line per (mode, test) pair.
+print('1..', len(tests) * len(test_args), sep='')
+i = 0
+for test_arg in test_args:
+    for test in tests:
+        i += 1
+        basename = test[0:-3]
+
+        if not xa_installed:
+            print('ok', i, '# skipped (xa not installed):', test, test_arg)
+            continue
+
+        if skip_slow_mc and basename[0:2] == 'z-' and test_arg[0:3] == '-mc':
+            print('ok', i, '# skipped (slow -mc):', test, test_arg)
+            continue
+
+        # Assemble the test, then run it. A failure of either step is reported
+        # as "not ok" instead of aborting the whole run with a traceback.
+        xa_out = basename + '.mc'
+        try:
+            subprocess.check_call(['xa', '-o', xa_out, test])
+            result = subprocess.check_output(
+                ['../run6502', '-l', '1e00', xa_out, '-R', '1e00', '-G', 'ffe0',
+                 '-P', 'ffee', '-X', 'f000'] + test_arg.split())
+        except (OSError, subprocess.CalledProcessError):
+            result = None
+        with open(basename + '.mst', 'rb') as f:
+            expected_result = f.read()
+        passed = (result == expected_result)
+        print('ok' if passed else 'not ok', i, test, test_arg)
diff --git a/test/run-run6502-tests.sh b/test/run-run6502-tests.sh
new file mode 100755
index 0000000..c0e21dd
--- /dev/null
+++ b/test/run-run6502-tests.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+python test/run-run6502-tests.py # the script does os.chdir('test'), so run from the directory containing test/
diff --git a/test/setjmp-trick.c b/test/setjmp-trick.c
new file mode 100644
index 0000000..f363d2e
--- /dev/null
+++ b/test/setjmp-trick.c
@@ -0,0 +1,125 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <setjmp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+static jmp_buf env; /* set by setjmp() in main(); every callback longjmp()s back through it */
+
+int done(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for address 0 */
+{
+  char buffer[64];
+  M6502_dump_masked(mpu, buffer);
+  printf("\nBRK instruction: address %04X opcode %02X\n%s\n", address, data, buffer);
+  longjmp(env, 1); /* back to setjmp() in main(), whose loop breaks on the value 1 */
+  exit(0); /* not reached: longjmp() does not return */
+}
+
+int call(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for &2000, &3000, &4000 */
+{
+  char buffer[64];
+  M6502_dump_masked(mpu, buffer);
+  printf("\ncall: address %04X opcode %02X\n%s\n", address, data, buffer);
+  mpu->registers->pc = address; /* store the callback address as the CPU's PC before leaving */
+  longjmp(env, 2); /* back to setjmp() in main(), which then calls M6502_run() again */
+  return 0; /* not reached: longjmp() does not return */
+}
+
+int ill(M6502 *mpu, uint16_t address, uint8_t data) /* illegal-instruction callback (&13, &44, &5C) */
+{
+  char buffer[64];
+  M6502_dump_masked(mpu, buffer);
+  printf("\nill: address %04X opcode %02X memory %02X\n%s\n", address, data, mpu->memory[address], buffer);
+  longjmp(env, 3); /* back to setjmp() in main(), which then calls M6502_run() again */
+  return 0; /* not reached: longjmp() does not return */
+}
+
+int main(int argc, char *argv[])
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+  /* pc: address in emulated memory where the gen macros below store the next byte. */
+  unsigned  pc  = 0x1000;
+
+  /* Read and write callbacks don't provide the correct, up-to-date CPU state
+   * in the M6502 object, so this trick is a non-starter with them.
+   */
+
+  M6502_setCallback(mpu, call,                     0, done);
+  M6502_setCallback(mpu, call,                0x2000, call);
+  M6502_setCallback(mpu, call,                0x3000, call);
+  M6502_setCallback(mpu, call,                0x4000, call);
+  M6502_setCallback(mpu, illegal_instruction,   0x13, ill );
+  M6502_setCallback(mpu, illegal_instruction,   0x44, ill );
+  M6502_setCallback(mpu, illegal_instruction,   0x5c, ill );
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))
+# define gen2(X,Y)	gen1(X); gen1(Y)
+# define gen3(X,Y,Z)	gen1(X); gen2(Y,Z)
+
+  gen1(0x13          );
+  gen1(0x44          );
+  gen1(0x13          ); // not executed, 0x44 is a two-byte illegal instruction
+  gen1(0x5C          );
+  gen1(0x13          ); // not executed, 0x5C is a two-byte illegal instruction
+  gen1(0x13          ); // not executed, 0x5C is a two-byte illegal instruction
+  gen3(0x20,0x00,0x20); // JSR &2000
+  gen3(0xad,0x00,0x50); // LDA &5000
+  gen2(0x00,0x00     ); // BRK
+
+  pc = 0x2000;
+  gen3(0x8d,0x00,0x50); // STA &5000
+  gen3(0x4c,0x00,0x30); // JMP &3000
+
+  pc = 0x3000;
+  gen2(0xa9,0x00     ); // LDA #0
+  gen3(0x8d,0x76,0x32); // STA &3276
+  gen2(0xa9,0x40     ); // LDA #&40
+  gen3(0x8d,0x77,0x32); // STA &3277
+  gen3(0x6c,0x76,0x32); // JMP (&3276)
+
+  pc = 0x4000;
+  gen1(0x60          ); // RTS
+
+  M6502_setVector(mpu, RST, 0x1000);
+  /* Every callback leaves M6502_run() via longjmp(); keep re-entering it until done() sends 1. */
+  M6502_reset(mpu);
+  while (1)
+  {
+    volatile int result = setjmp(env); /* NOTE(review): ISO C allows setjmp() only in a few contexts and an initializer is not one - confirm */
+    if (result == 0)
+    {
+    	M6502_run(mpu);
+    }
+    else
+    {
+      printf("\nsetjmp() returned %d\n", result);
+      if (result == 1)
+      {
+	break;
+      }
+    }
+  }
+  M6502_delete(mpu);
+
+  return 0;
+}
diff --git a/test/setjmp-trick.mst b/test/setjmp-trick.mst
new file mode 100644
index 0000000..ac0bcd5
--- /dev/null
+++ b/test/setjmp-trick.mst
@@ -0,0 +1,35 @@
+
+ill: address 1000 opcode 13 memory 13
+PC=1001 SP=0100 A=00 X=00 Y=00 P=04 -----I--
+
+setjmp() returned 3
+
+ill: address 1001 opcode 44 memory 44
+PC=1003 SP=0100 A=00 X=00 Y=00 P=04 -----I--
+
+setjmp() returned 3
+
+ill: address 1003 opcode 5C memory 5C
+PC=1006 SP=0100 A=00 X=00 Y=00 P=04 -----I--
+
+setjmp() returned 3
+
+call: address 2000 opcode 20
+PC=1009 SP=01FE A=00 X=00 Y=00 P=04 -----I--
+
+setjmp() returned 2
+
+call: address 3000 opcode 4C
+PC=3000 SP=01FE A=00 X=00 Y=00 P=04 -----I--
+
+setjmp() returned 2
+
+call: address 4000 opcode 6C
+PC=4000 SP=01FE A=40 X=00 Y=00 P=04 -----I--
+
+setjmp() returned 2
+
+BRK instruction: address 100C opcode 00
+PC=100E SP=01FD A=00 X=00 Y=00 P=06 -----IZ-
+
+setjmp() returned 1
diff --git a/test/stack-code-brk.c b/test/stack-code-brk.c
new file mode 100644
index 0000000..8ac2b75
--- /dev/null
+++ b/test/stack-code-brk.c
@@ -0,0 +1,108 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+int done(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for &F000 */
+{
+  exit(0); /* the generated program ends with JMP &F000, so this ends the test */
+}
+
+int oswrch(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for &FFEE */
+{
+  putchar(mpu->registers->a); /* write the emulated A register to stdout as one character */
+  mpu->memory[0xffee] = 0x60; // RTS
+  return 0;
+}
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X)) /* store one byte at pc and advance pc; needs mpu and pc in scope */
+# define gen2(X,Y)	gen1(X); gen1(Y) /* two statements: do not use as the body of an unbraced if/else */
+# define gen3(X,Y,Z)	gen1(X); gen2(Y,Z) /* three statements: same caveat as gen2 */
+
+int main(int argc, char *argv[])
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+  /* pc: address in emulated memory where the gen macros store the next byte. */
+  unsigned  pc  = 0x1000;
+  unsigned saved_pc; /* remembers where main-line code generation resumes */
+
+  M6502_setCallback(mpu, call,  0xf000, done  );
+  M6502_setCallback(mpu, call,  0xffee, oswrch);
+
+  gen2(0xa2, 0xff      ); // LDX #&FF
+  gen1(0x9a            ); // TXS
+  gen2(0xa9, 'A'       ); // LDA #'A'
+
+  // LDA #'B' is 0xa9, 0x42. So if we execute a BRK at 0x42a7, it will
+  // push 0x42, 0xa9 and the flags onto the stack. Since the stack grows
+  // downwards those bytes will be in the right order for execution. We'll
+  // additionally push an LDX immediate opcode so we can "execute" the flags
+  // value. We can nearly force the flags to be whatever we like using PLP,
+  // although the BRK will set the B and X bits in the stacked value. We
+  // demonstrate this by explicitly masking off those bits in the values we
+  // force into the flags.
+  enum {
+    flagX= (1<<5),	/* unused   	 */
+    flagB= (1<<4) 	/* irq from brk  */
+  };
+  uint8_t mask = ~(flagX | flagB);
+  gen2(0xa0, '0' & mask); // LDY #('0' with B/X masked off)
+  gen1(0x5a            ); // PHY
+  gen1(0x28            ); // PLP
+  gen3(0x4c, 0xa7, 0x42); // JMP &42A7
+  pc = 0x42a7;
+  gen2(0x00, 0x00      ); // BRK
+  saved_pc = pc; // &42A9, the return address the BRK stacks; generation resumes here later
+  pc = 0x0; // BRK vector
+  gen2(0xa9, 0xa2      ); // LDA #<LDX # opcode>
+  gen1(0x48            ); // PHA
+  gen3(0x4c, 0xfc, 0x01); // JMP &01FC
+  pc = 0x200; // execution runs off the end of the stacked bytes (&01FC-&01FF) into this code
+  gen3(0x20, 0xee, 0xff); // JSR &FFEE
+  gen1(0x8a            ); // TXA
+  gen3(0x20, 0xee, 0xff); // JSR &FFEE
+  gen1(0x68            ); // PLA
+  gen1(0x40            ); // RTI
+  pc = saved_pc;
+
+  // Let's do the same thing again, but this time code has already been
+  // executed from that address on the stack, so we're verifying the change
+  // is picked up. We do LDA #'C' this time, so we execute the BRK from
+  // 0x43a7.
+  gen2(0xa0, '1' & mask); // LDY #('1' with B/X masked off)
+  gen1(0x5a            ); // PHY
+  gen1(0x28            ); // PLP
+  gen3(0x4c, 0xa7, 0x43); // JMP &43A7
+  pc = 0x43a7;
+  gen2(0x00, 0x00      ); // BRK
+
+  gen3(0x4c, 0x00, 0xf0); // JMP &F000 (quit)
+
+  M6502_setVector(mpu, RST, 0x1000);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/test/stack-code-brk.mst b/test/stack-code-brk.mst
new file mode 100644
index 0000000..467dbb8
--- /dev/null
+++ b/test/stack-code-brk.mst
@@ -0,0 +1 @@
+B0C1
\ No newline at end of file
diff --git a/test/stack-code-jsr.c b/test/stack-code-jsr.c
new file mode 100644
index 0000000..5cac6bf
--- /dev/null
+++ b/test/stack-code-jsr.c
@@ -0,0 +1,90 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+int done(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for address 0 */
+{
+  char buffer[64]; /* same size as the buffer[64] parameter of M6502_dump_masked() */
+  M6502_dump_masked(mpu, buffer);
+  printf("\nBRK instruction: address %04X opcode %02X\n%s\n", address, data, buffer);
+  exit(0); /* ends the test process once the BRK has been reported */
+}
+
+int oswrch(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for &FFEE */
+{
+  putchar(mpu->registers->a); /* write the emulated A register to stdout as one character */
+  mpu->memory[0xffee] = 0x60; // RTS
+  return 0;
+}
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X)) /* store one byte at pc and advance pc; needs mpu and pc in scope */
+# define gen2(X,Y)	gen1(X); gen1(Y) /* two statements: do not use as the body of an unbraced if/else */
+# define gen3(X,Y,Z)	gen1(X); gen2(Y,Z) /* three statements: same caveat as gen2 */
+
+int main(int argc, char *argv[])
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+  /* pc: address in emulated memory where the gen macros store the next byte. */
+  unsigned  pc  = 0x1000;
+  unsigned saved_pc; /* remembers where main-line code generation resumes */
+
+  M6502_setCallback(mpu, call,       0, done  );
+  M6502_setCallback(mpu, call,  0xffee, oswrch);
+
+  gen2(0xa2, 0xff      ); // LDX #&FF
+  gen1(0x9a            ); // TXS
+  gen2(0xa9, 'A'       ); // LDA #'A'
+
+  // LDA #'B' is 0xa9, 0x42. So if we execute a JSR at 0x42a7, it will
+  // push 0x42 and then 0xa9 onto the stack. Since the stack grows downwards
+  // those bytes will be in the right order for execution.
+  gen3(0x4c, 0xa7, 0x42); // JMP &42A7
+  pc = 0x42a7;
+  gen3(0x20, 0x00, 0x30); // JSR &3000
+  saved_pc = pc; // &42AA, the byte after the JSR; generation resumes here later
+  pc = 0x3000;
+  gen3(0x4c, 0xfe, 0x01); // JMP &01FE
+  pc = 0x200; // execution runs off the end of the stacked bytes (&01FE-&01FF) into this code
+  gen3(0x20, 0xee, 0xff); // JSR &FFEE
+  gen1(0x60            ); // RTS
+  pc = saved_pc;
+
+  // Let's do the same thing again, but this time code has already been
+  // executed from that address on the stack, so we're verifying the change
+  // is picked up. We do LDA #'C' this time, so we execute the JSR from
+  // 0x43a7.
+  gen3(0x4c, 0xa7, 0x43); // JMP &43A7
+  pc = 0x43a7;
+  gen3(0x20, 0x00, 0x30); // JSR &3000
+
+  gen2(0x00, 0x00      ); // BRK
+
+  M6502_setVector(mpu, RST, 0x1000);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/test/stack-code-jsr.mst b/test/stack-code-jsr.mst
new file mode 100644
index 0000000..62ee1a3
--- /dev/null
+++ b/test/stack-code-jsr.mst
@@ -0,0 +1,3 @@
+BC
+BRK instruction: address 43AA opcode 00
+PC=43AC SP=01FC A=43 X=FF Y=00 P=04 -----I--
diff --git a/test/test-utils.c b/test/test-utils.c
new file mode 100644
index 0000000..b17819c
--- /dev/null
+++ b/test/test-utils.c
@@ -0,0 +1,106 @@
+/* test-utils.c -- utility functions for C test programs */
+
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+/* Some of this code is copy-and-pasted from run6502.c, but there's not enough
+ * of it for me to want to complicate things even slightly by trying to share
+ * it, especially since this is test code and somewhat distinct. 
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lib6502.h"
+
+static const char *program= 0; /* argv[0]; set by parse_args(), printed by usage() */
+static M6502_Mode mode= M6502_ModeHybrid; /* emulation mode; changed by -mc, -mh or -mi */
+static int max_insns= 0; /* -mx count; 0 is the default passed to M6502_setMode() */
+/* Bits of the P register that M6502_dump_masked() clears before dumping. */
+enum {
+  flagX= (1<<5),	/* unused   	 */
+  flagB= (1<<4) 	/* irq from brk  */
+};
+
+void fail(const char *fmt, ...) /* print a printf-style message on stderr, then exit(1) */
+{
+  va_list args;
+  fflush(stdout); /* flush pending stdout output before writing to stderr */
+  va_start(args, fmt);
+  vfprintf(stderr, fmt, args);
+  va_end(args);
+  fputc('\n', stderr);
+  exit(1);
+}
+
+/* Print the option summary to stderr, then exit with the given status. */
+static void usage(int status)
+{
+  fprintf(stderr, "usage: %s [option ...]\n", program);
+  fputs("  -h        -- help (print this message)\n", stderr);
+  fputs("  -mc       -- use compiled emulation mode\n", stderr);
+  fputs("  -mh       -- use hybrid emulation mode (default)\n", stderr);
+  fputs("  -mi       -- use interpreted emulation mode\n", stderr);
+  fputs("  -mx count -- maximum instructions to JIT (-mc/-mh)\n", stderr);
+  exit(status);
+}
+
+static int doMode(M6502_Mode m) /* handler for the -mc, -mh and -mi options */
+{
+  mode= m; /* applied later by parse_args() through M6502_setMode() */
+  return 0; /* no extra argv entries consumed */
+}
+
+static int doMaxInsns(int argc, char **argv, M6502 *mpu) /* handler for "-mx count" */
+{
+  if (argc < 2) usage(1); /* "-mx" was given without a count */
+  char *end; /* first character strtol() could not convert */
+  long l= strtol(argv[1], &end, 10); /* signed, so a negative count is detectable */
+  if (end == argv[1] || *end || l < 0 || l != (int)l) fail("bad number: %s", argv[1]);
+  max_insns= (int)l; /* safe: empty, junk, negative and over-int counts failed above */
+  return 1; /* one extra argv entry (the count) consumed; parse_args() skips it */
+}
+
+void parse_args(int argc, char *argv[], M6502 *mpu) /* options are listed in usage() */
+{
+    program= argv[0];
+    while (++argv, --argc > 0) /* step to the next argument; stop when none remain */
+    {
+	int n= 0; /* extra argv entries the handler consumed beyond the option itself */
+	if      (!strcmp(*argv, "-h"))  usage(0);
+	else if (!strcmp(*argv, "-mc")) n= doMode(M6502_ModeCompiled);
+	else if (!strcmp(*argv, "-mh")) n= doMode(M6502_ModeHybrid);
+	else if (!strcmp(*argv, "-mi")) n= doMode(M6502_ModeInterpreted);
+	else if (!strcmp(*argv, "-mx")) n= doMaxInsns(argc, argv, mpu);
+	else				usage(1);
+	argc -= n;
+	argv += n;
+    }
+    /* Hand the selected mode and -mx count (defaults if not given) to the emulator. */
+    M6502_setMode(mpu, mode, max_insns);
+}
+
+void M6502_dump_masked(M6502 *mpu, char buffer[64]) /* M6502_dump() with B and X cleared in P */
+{
+    const uint8_t saved_p = mpu->registers->p;
+    mpu->registers->p = saved_p & ~(flagB | flagX);
+    M6502_dump(mpu, buffer);
+    mpu->registers->p = saved_p; /* put the real flags back after dumping */
+}
diff --git a/test/test-utils.h b/test/test-utils.h
new file mode 100644
index 0000000..5b15dd7
--- /dev/null
+++ b/test/test-utils.h
@@ -0,0 +1,30 @@
+/* test-utils.h -- utility functions for C test programs */
+
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef PARSEARGS_H
+#define PARSEARGS_H
+
+#include "lib6502.h"
+
+void parse_args(int argc, char *argv[], M6502 *mpu); /* handle -h/-mc/-mh/-mi/-mx and call M6502_setMode() on mpu */
+
+void M6502_dump_masked(M6502 *mpu, char buffer[64]); /* M6502_dump() with the B and unused flag bits cleared in P */
+
+#endif
diff --git a/test/trivial-test.mst b/test/trivial-test.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/trivial-test.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/trivial-test.xa b/test/trivial-test.xa
new file mode 100644
index 0000000..1448a22
--- /dev/null
+++ b/test/trivial-test.xa
@@ -0,0 +1,5 @@
+#include "config.xa"
+
+	LDA #'Y'
+	JSR OSWRCH
+	JMP QUIT
diff --git a/test/write-callback-modify-code.c b/test/write-callback-modify-code.c
new file mode 100644
index 0000000..cb35317
--- /dev/null
+++ b/test/write-callback-modify-code.c
@@ -0,0 +1,100 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+int done(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for address 0 (installed in main): report and stop */
+{
+  char buffer[64];
+  M6502_dump_masked(mpu, buffer); /* register dump with the B/unused flag bits hidden */
+  printf("\nBRK instruction: address %04X opcode %02X\n%s\n", address, data, buffer);
+  exit(0); /* ends the test; this function never returns to the emulator */
+}
+
+int oswrch(M6502 *mpu, uint16_t address, uint8_t data) /* call callback for &FFEE (installed in main): print A as a character */
+{
+  putchar(mpu->registers->a);
+  mpu->memory[0xffee] = 0x60; // RTS (written into emulated memory at the callback address on every call)
+  return 0;
+}
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))	/* emit one byte at pc; uses the mpu and pc in scope */
+# define gen2(X,Y)	(gen1(X), gen1(Y))	/* single expression, so safe under an unbraced if/else */
+# define gen3(X,Y,Z)	(gen1(X), gen2(Y,Z))
+
+int wr(M6502 *mpu, uint16_t address, uint8_t data) /* write callback for &42 (installed in main): regenerate the code at &6000 */
+{
+    if (address != 0x42) /* the callback is only registered for &42; anything else is a failure */
+    {
+    	abort();
+    }
+
+    unsigned pc = 0x6000; /* local emit pointer used by the gen macros */
+    gen2(0xa9, data);       // LDA #data (the byte that was being written)
+    gen3(0x4c, 0x00, 0x20); // JMP &2000
+    return 0;
+}
+
+int main(int argc, char *argv[]) /* build a small 6502 program whose stores to &42 rewrite the code at &6000, then run it */
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu); /* selects the emulation mode from the command line */
+
+  unsigned  pc  = 0x1000; /* emit pointer used by the gen macros */
+
+  M6502_setCallback(mpu, call,      0, done);   /* call to address 0: print the report and exit */
+  M6502_setCallback(mpu, call, 0xffee, oswrch); /* call to &FFEE: print A */
+  M6502_setCallback(mpu, write,  0x42, wr  );   /* write to &42: regenerate the code at &6000 */
+
+  gen2(0xa9, '>'       ); // LDA #'>'
+  gen3(0x20, 0xee, 0xff); // JSR &FFEE
+  gen2(0xa2, 'A'       ); // LDX #'A'
+  gen3(0x8e, 0x42, 0x00); // STX &0042 (absolute store at &1007; triggers wr)
+  gen3(0x20, 0x00, 0x60); // JSR &6000
+  gen1(0xe8            ); // INX
+  gen2(0xe0, 'Z'+1     ); // CPX #('Z'+1)
+  gen2(0x90, 0xf5      ); // BCC to STX (offset -11 from &1012 = &1007)
+
+  gen2(0xa0, 0x05      ); // LDY #&05
+  gen2(0xa9, '>'       ); // LDA #'>'
+  gen3(0x20, 0xee, 0xff); // JSR &FFEE
+  gen2(0xa2, 'A'       ); // LDX #'A'
+  gen2(0x96, 0x42-0x05 ); // STX (&42-&05),Y i.e. zero page,Y store at &101B; with Y=&05 it writes &42
+  gen3(0x20, 0x00, 0x60); // JSR &6000
+  gen1(0xe8            ); // INX
+  gen2(0xe0, 'Z'+1     ); // CPX #('Z'+1)
+  gen2(0x90, 0xf6      ); // BCC to STX (offset -10 from &1025 = &101B)
+
+  gen2(0x00, 0x00      ); // BRK (at &1025, matching the address in the expected output)
+
+  pc = 0x2000; /* target of the JMP that wr() writes at &6000 */
+  gen3(0x20, 0xee, 0xff); // JSR &FFEE
+  gen1(0x60            ); // RTS
+
+  M6502_setVector(mpu, RST, 0x1000); /* reset vector points at the start of the program above */
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/test/write-callback-modify-code.mst b/test/write-callback-modify-code.mst
new file mode 100644
index 0000000..65de187
--- /dev/null
+++ b/test/write-callback-modify-code.mst
@@ -0,0 +1,3 @@
+>ABCDEFGHIJKLMNOPQRSTUVWXYZ>ABCDEFGHIJKLMNOPQRSTUVWXYZ
+BRK instruction: address 1025 opcode 00
+PC=1027 SP=01FD A=5A X=5B Y=05 P=07 -----IZC
diff --git a/test/z-self-modify-1.mst b/test/z-self-modify-1.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/z-self-modify-1.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/z-self-modify-1.xa b/test/z-self-modify-1.xa
new file mode 100644
index 0000000..cdf31e2
--- /dev/null
+++ b/test/z-self-modify-1.xa
@@ -0,0 +1,94 @@
+; This test attempts to confirm that in hybrid mode, the JITted code is
+; discarded correctly if it's modified by the interpreter.
+
+#include "config.xa"
+
+COUNT1 = $71
+COUNT2 = $72
+COUNT3 = $73
+
+; We loop lots to get as much chance of a problem occurring as possible.
+	STZ COUNT1
+LOOP1
+	LDY #0
+LOOP2
+	LDX #0
+LOOP3
+
+; The heart of the test. We LDA #n, then CMP <address of n>. If the two don't
+; match we have a problem.
+LDAOP
+	LDA #3
+	CMP LDAOP+1
+	BNE FAIL
+
+; We now modify the LDA operand...
+	INC LDAOP+1
+
+; ... and occupy as much of the interpreter's time as possible while the JIT
+; thread picks up the modified version (if it's not working from the snapshot).
+; In reality we probably go round multiple times before the JIT completes.
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+
+; And round and round we go.
+	DEX
+	BNE LOOP3
+	DEY
+	BNE LOOP2
+	DEC COUNT1
+	BNE LOOP1
+
+OK
+	LDA #'Y'
+	JSR OSWRCH
+	JMP QUIT
+FAIL
+	LDA #'N'
+	JSR OSWRCH
+	JMP QUIT
diff --git a/test/z-self-modify-2.mst b/test/z-self-modify-2.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/z-self-modify-2.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/z-self-modify-2.xa b/test/z-self-modify-2.xa
new file mode 100644
index 0000000..81d21c4
--- /dev/null
+++ b/test/z-self-modify-2.xa
@@ -0,0 +1,125 @@
+; This test attempts to confirm that a subtle potential bug in the hybrid JIT
+; implementation is not present.
+;
+; The potential problem is as follows:
+; - we decide to JIT some code
+; - we take a snapshot of memory
+; - we kick off a JIT thread which *works off the main memory array*, not the 
+;   snapshot
+; - in the meantime the interpreter executes some code which modifies the code
+;   being JITted before it is actually jitted.
+; - we JIT the modified version of the code
+; - the interpreter then executes some code which reverts the change (A)
+; - we decide to execute the JITted function. We check memory against the memory
+;   snapshot taken when we started JITting and find no differences in any 
+;   addresses which contain code, because of the previous step marked (A).
+; - boom, our JITted code is not doing what it should.
+;
+; The fix for this problem is simply to ensure that the JIT thread works off
+; the snapshot of memory taken when we launched the JIT thread. Note that even
+; if we fail to do this, self-modifying code which doesn't "undo" itself will
+; be noticed when we use the memory snapshot to decide if the JITted code is
+; still valid.
+;
+; This test case should execute correctly in all modes (of course), but in
+; hybrid mode it should *fail* if the implementation is temporarily changed to
+; JIT from mpu->memory and not memory_snapshot. At the time of writing it does.
+
+
+
+#include "config.xa"
+
+COUNT1 = $71
+COUNT2 = $72
+COUNT3 = $73
+
+; We loop lots to get as much chance of a problem occurring as possible.
+	STZ COUNT1
+LOOP1
+	LDY #0
+LOOP2
+	LDX #0
+LOOP3
+
+; The heart of the test. We LDA #n, then CMP <address of n>. If the two don't
+; match we have a problem.
+LDAOP
+	LDA #3
+	CMP LDAOP+1
+	BNE FAIL
+
+; We now modify the LDA operand...
+	INC LDAOP+1
+
+; ... and occupy as much of the interpreter's time as possible while the JIT
+; thread picks up the modified version (if it's not working from the snapshot).
+; In reality we probably go round multiple times before the JIT completes.
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+
+; We now put the operand back. Since we only switch from interpreting to JITting
+; on a control transfer, we know the transition will occur at a point when we've
+; put the operand back, which is helpful.
+	DEC LDAOP+1
+
+; And round and round we go.
+	DEX
+	BNE LOOP3
+	DEY
+	BNE LOOP2
+	DEC COUNT1
+	BNE LOOP1
+
+OK
+	LDA #'Y'
+	JSR OSWRCH
+	JMP QUIT
+FAIL
+	LDA #'N'
+	JSR OSWRCH
+	JMP QUIT
diff --git a/util.cpp b/util.cpp
new file mode 100644
index 0000000..dbcecab
--- /dev/null
+++ b/util.cpp
@@ -0,0 +1,57 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "util.h"
+
+#include <boost/thread/thread.hpp>
+#include <iostream>
+#include <stdio.h>
+
+boost::mutex log_mutex;
+
+void log(const std::string &s) // write s plus a newline to std::cerr while holding log_mutex
+{
+    boost::mutex::scoped_lock scoped_lock(log_mutex); // released automatically when this function returns
+    std::cerr << s << std::endl;
+}
+
+void die(const char *s) // print s on stderr (blank line first) and abort() the process
+{
+  fflush(stdout); // push out any pending stdout text before the message
+  fprintf(stderr, "\n%s\n", s);
+  abort();
+}
+
+std::string spaces(int n) // returns a string of 4 * n space characters
+{
+    return std::string(4 * n, ' '); // NOTE(review): a negative n is not guarded here; presumably callers never pass one
+}
+
+std::string apply_prefix(const std::string &prefix, const std::string &s) // returns s with prefix at the start of every line
+{
+    std::string result = prefix; // the first line is always prefixed, even when s is empty
+    for (std::string::size_type i = 0; i < s.length(); ++i)
+    {
+        result += s[i];
+        if ((s[i] == '\n') && ((i + 1) < s.length())) // a newline that is not the last character starts a new line
+        {
+            result.append(prefix);
+        }
+    }
+    return result;
+}
+
diff --git a/util.h b/util.h
new file mode 100644
index 0000000..c7967c6
--- /dev/null
+++ b/util.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <assert.h>
+#include <iomanip>
+#include <sstream>
+#include <stdexcept>
+
+#include <boost/thread/mutex.hpp>
+#include <boost/thread/thread.hpp>
+
+#define CANT_HAPPEN(s) /* throws std::runtime_error("<file>:<line>:" followed by s; s may be a chain of << operands) */ \
+    do { \
+        std::stringstream stream; \
+        stream << __FILE__ << ":" << __LINE__ << ":" << s; \
+        throw std::runtime_error(stream.str()); \
+    } \
+    while (false)
+
+#ifdef LOG /* TRACE(s) logs s (a chain of << operands) only when LOG is defined */
+    #define TRACE(s) \
+        do { \
+            std::stringstream prefix; /* "<file>:<line> TAB <thread id> TAB", repeated on every line of the message */ \
+            prefix << __FILE__ << ":" << __LINE__ << "\t" <<  \
+                      boost::this_thread::get_id() << "\t"; \
+            std::stringstream message; \
+            message << s; \
+            log(apply_prefix(prefix.str(), message.str())); \
+        } \
+        while (false)
+#else /* without LOG, TRACE(s) expands to an empty statement and s is never evaluated */
+    #define TRACE(s) \
+        do { \
+        } \
+        while (false)
+#endif
+
+// Avoid spurious "unused variable" warnings from regular assert(): with NDEBUG, x is still referenced but nothing is compared.
+#ifndef NDEBUG
+    #define ASSERT_EQUAL(x, y) assert((x) == (y))
+#else
+    #define ASSERT_EQUAL(x, y) \
+        do { \
+            (void) (x); /* counts as a use of x without assigning to it; y is not evaluated */ \
+        } \
+        while (0) /* no trailing semicolon: the caller supplies it, as with the debug variant */
+#endif
+
+extern boost::mutex log_mutex;
+void log(const std::string &s);
+void die(const char *s);
+
+std::string spaces(int n);
+std::string apply_prefix(const std::string &prefix, const std::string &s);
+
+#endif
diff --git a/valgrind.h b/valgrind.h
new file mode 100644
index 0000000..222a58e
--- /dev/null
+++ b/valgrind.h
@@ -0,0 +1,4060 @@
+/* -*- c -*-
+   ----------------------------------------------------------------
+
+   Notice that the following BSD-style license applies to this one
+   file (valgrind.h) only.  The rest of Valgrind is licensed under the
+   terms of the GNU General Public License, version 2, unless
+   otherwise indicated.  See the COPYING file in the source
+   distribution for details.
+
+   ----------------------------------------------------------------
+
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2000-2011 Julian Seward.  All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. The origin of this software must not be misrepresented; you must 
+      not claim that you wrote the original software.  If you use this 
+      software in a product, an acknowledgment in the product 
+      documentation would be appreciated but is not required.
+
+   3. Altered source versions must be plainly marked as such, and must
+      not be misrepresented as being the original software.
+
+   4. The name of the author may not be used to endorse or promote 
+      products derived from this software without specific prior written 
+      permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   ----------------------------------------------------------------
+
+   Notice that the above BSD-style license applies to this one file
+   (valgrind.h) only.  The entire rest of Valgrind is licensed under
+   the terms of the GNU General Public License, version 2.  See the
+   COPYING file in the source distribution for details.
+
+   ---------------------------------------------------------------- 
+*/
+
+
+/* This file is for inclusion into client (your!) code.
+
+   You can use these macros to manipulate and query Valgrind's 
+   execution inside your own programs.
+
+   The resulting executables will still run without Valgrind, just a
+   little bit more slowly than they otherwise would, but otherwise
+   unchanged.  When not running on valgrind, each client request
+   consumes very few (eg. 7) instructions, so the resulting performance
+   loss is negligible unless you plan to execute client requests
+   millions of times per second.  Nevertheless, if that is still a
+   problem, you can compile with the NVALGRIND symbol defined (gcc
+   -DNVALGRIND) so that client requests are not even compiled in.  */
+
+#ifndef __VALGRIND_H
+#define __VALGRIND_H
+
+
+/* ------------------------------------------------------------------ */
+/* VERSION NUMBER OF VALGRIND                                         */
+/* ------------------------------------------------------------------ */
+
+/* Specify Valgrind's version number, so that user code can
+   conditionally compile based on our version number.  Note that these
+   were introduced at version 3.6 and so do not exist in version 3.5
+   or earlier.  The recommended way to use them to check for "version
+   X.Y or later" is (eg)
+
+#if defined(__VALGRIND_MAJOR__) && defined(__VALGRIND_MINOR__)   \
+    && (__VALGRIND_MAJOR__ > 3                                   \
+        || (__VALGRIND_MAJOR__ == 3 && __VALGRIND_MINOR__ >= 6))
+*/
+#define __VALGRIND_MAJOR__    3
+#define __VALGRIND_MINOR__    6
+
+
+#include <stdarg.h>
+
+/* Nb: this file might be included in a file compiled with -ansi.  So
+   we can't use C++ style "//" comments nor the "asm" keyword (instead
+   use "__asm__"). */
+
+/* Derive some tags indicating what the target platform is.  Note
+   that in this file we're using the compiler's CPP symbols for
+   identifying architectures, which are different to the ones we use
+   within the rest of Valgrind.  Note, __powerpc__ is active for both
+   32 and 64-bit PPC, whereas __powerpc64__ is only active for the
+   latter (on Linux, that is).
+
+   Misc note: how to find out what's predefined in gcc by default:
+   gcc -Wp,-dM somefile.c
+*/
+#undef PLAT_x86_darwin
+#undef PLAT_amd64_darwin
+#undef PLAT_x86_win32
+#undef PLAT_x86_linux
+#undef PLAT_amd64_linux
+#undef PLAT_ppc32_linux
+#undef PLAT_ppc64_linux
+#undef PLAT_arm_linux
+#undef PLAT_s390x_linux
+
+
+#if defined(__APPLE__) && defined(__i386__)
+#  define PLAT_x86_darwin 1
+#elif defined(__APPLE__) && defined(__x86_64__)
+#  define PLAT_amd64_darwin 1
+#elif defined(__MINGW32__) || defined(__CYGWIN32__) \
+      || (defined(_WIN32) && defined(_M_IX86))
+#  define PLAT_x86_win32 1
+#elif defined(__linux__) && defined(__i386__)
+#  define PLAT_x86_linux 1
+#elif defined(__linux__) && defined(__x86_64__)
+#  define PLAT_amd64_linux 1
+#elif defined(__linux__) && defined(__powerpc__) && !defined(__powerpc64__)
+#  define PLAT_ppc32_linux 1
+#elif defined(__linux__) && defined(__powerpc__) && defined(__powerpc64__)
+#  define PLAT_ppc64_linux 1
+#elif defined(__linux__) && defined(__arm__)
+#  define PLAT_arm_linux 1
+#elif defined(__linux__) && defined(__s390__) && defined(__s390x__)
+#  define PLAT_s390x_linux 1
+#else
+/* If we're not compiling for our target platform, don't generate
+   any inline asms.  */
+#  if !defined(NVALGRIND)
+#    define NVALGRIND 1
+#  endif
+#endif
+
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE SPECIFICS for SPECIAL INSTRUCTIONS.  There is nothing */
+/* in here of use to end-users -- skip to the next section.           */
+/* ------------------------------------------------------------------ */
+
+/*
+ * VALGRIND_DO_CLIENT_REQUEST(): a statement that invokes a Valgrind client
+ * request. Accepts both pointers and integers as arguments.
+ *
+ * VALGRIND_DO_CLIENT_REQUEST_STMT(): a statement that invokes a Valgrind
+ * client request that does not return a value.
+
+ * VALGRIND_DO_CLIENT_REQUEST_EXPR(): a C expression that invokes a Valgrind
+ * client request and whose value equals the client request result.  Accepts
+ * both pointers and integers as arguments.  Note that such calls are not
+ * necessarily pure functions -- they may have side effects.
+ */
+
+#define VALGRIND_DO_CLIENT_REQUEST(_zzq_rlval, _zzq_default,            \
+                                   _zzq_request, _zzq_arg1, _zzq_arg2,  \
+                                   _zzq_arg3, _zzq_arg4, _zzq_arg5)     \
+  do { (_zzq_rlval) = VALGRIND_DO_CLIENT_REQUEST_EXPR((_zzq_default),   \
+                        (_zzq_request), (_zzq_arg1), (_zzq_arg2),       \
+                        (_zzq_arg3), (_zzq_arg4), (_zzq_arg5)); } while (0)
+
+#define VALGRIND_DO_CLIENT_REQUEST_STMT(_zzq_request, _zzq_arg1,        \
+                           _zzq_arg2,  _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+  do { (void) VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                        \
+                    (_zzq_request), (_zzq_arg1), (_zzq_arg2),           \
+                    (_zzq_arg3), (_zzq_arg4), (_zzq_arg5)); } while (0)
+
+#if defined(NVALGRIND)
+
+/* Define NVALGRIND to completely remove the Valgrind magic sequence
+   from the compiled code (analogous to NDEBUG's effects on
+   assert()) */
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+      (_zzq_default)
+
+#else  /* ! NVALGRIND */
+
+/* The following defines the magic code sequences which the JITter
+   spots and handles magically.  Don't look too closely at them as
+   they will rot your brain.
+
+   The assembly code sequences for all architectures is in this one
+   file.  This is because this file must be stand-alone, and we don't
+   want to have multiple files.
+
+   For VALGRIND_DO_CLIENT_REQUEST, we must ensure that the default
+   value gets put in the return slot, so that everything works when
+   this is executed not under Valgrind.  Args are passed in a memory
+   block, and so there's no intrinsic limit to the number that could
+   be passed, but it's currently five.
+   
+   The macro args are: 
+      _zzq_rlval    result lvalue
+      _zzq_default  default value (result returned when running on real CPU)
+      _zzq_request  request code
+      _zzq_arg1..5  request params
+
+   The other two macros are used to support function wrapping, and are
+   a lot simpler.  VALGRIND_GET_NR_CONTEXT returns the value of the
+   guest's NRADDR pseudo-register and whatever other information is
+   needed to safely run the call original from the wrapper: on
+   ppc64-linux, the R2 value at the divert point is also needed.  This
+   information is abstracted into a user-visible type, OrigFn.
+
+   VALGRIND_CALL_NOREDIR_* behaves the same as the following on the
+   guest, but guarantees that the branch instruction will not be
+   redirected: x86: call *%eax, amd64: call *%rax, ppc32/ppc64:
+   branch-and-link-to-r11.  VALGRIND_CALL_NOREDIR is just text, not a
+   complete inline asm, since it needs to be combined with more magic
+   inline asm stuff to be useful.
+*/
+
+/* ------------------------- x86-{linux,darwin} ---------------- */
+
+#if defined(PLAT_x86_linux)  ||  defined(PLAT_x86_darwin)  \
+    ||  (defined(PLAT_x86_win32) && defined(__GNUC__))
+
+typedef
+   struct { 
+      unsigned int nraddr; /* where's the code? */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE                            \
+                     "roll $3,  %%edi ; roll $13, %%edi\n\t"      \
+                     "roll $29, %%edi ; roll $19, %%edi\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+  __extension__                                                   \
+  ({volatile unsigned int _zzq_args[6];                           \
+    volatile unsigned int _zzq_result;                            \
+    _zzq_args[0] = (unsigned int)(_zzq_request);                  \
+    _zzq_args[1] = (unsigned int)(_zzq_arg1);                     \
+    _zzq_args[2] = (unsigned int)(_zzq_arg2);                     \
+    _zzq_args[3] = (unsigned int)(_zzq_arg3);                     \
+    _zzq_args[4] = (unsigned int)(_zzq_arg4);                     \
+    _zzq_args[5] = (unsigned int)(_zzq_arg5);                     \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %EDX = client_request ( %EAX ) */         \
+                     "xchgl %%ebx,%%ebx"                          \
+                     : "=d" (_zzq_result)                         \
+                     : "a" (&_zzq_args[0]), "0" (_zzq_default)    \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_result;                                                  \
+  })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    volatile unsigned int __addr;                                 \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %EAX = guest_NRADDR */                    \
+                     "xchgl %%ecx,%%ecx"                          \
+                     : "=a" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_CALL_NOREDIR_EAX                                 \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* call-noredir *%EAX */                     \
+                     "xchgl %%edx,%%edx\n\t"
+#endif /* PLAT_x86_linux || PLAT_x86_darwin || (PLAT_x86_win32 && __GNUC__) */
+
+/* ------------------------- x86-Win32 ------------------------- */
+
+#if defined(PLAT_x86_win32) && !defined(__GNUC__)
+
+typedef
+   struct { 
+      unsigned int nraddr; /* where's the code? */
+   }
+   OrigFn;
+
+#if defined(_MSC_VER)
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE                            \
+                     __asm rol edi, 3  __asm rol edi, 13          \
+                     __asm rol edi, 29 __asm rol edi, 19
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+    valgrind_do_client_request_expr((uintptr_t)(_zzq_default),    \
+        (uintptr_t)(_zzq_request), (uintptr_t)(_zzq_arg1),        \
+        (uintptr_t)(_zzq_arg2), (uintptr_t)(_zzq_arg3),           \
+        (uintptr_t)(_zzq_arg4), (uintptr_t)(_zzq_arg5))
+
+static __inline uintptr_t
+valgrind_do_client_request_expr(uintptr_t _zzq_default, uintptr_t _zzq_request,
+                                uintptr_t _zzq_arg1, uintptr_t _zzq_arg2,
+                                uintptr_t _zzq_arg3, uintptr_t _zzq_arg4,
+                                uintptr_t _zzq_arg5)
+{
+    volatile uintptr_t _zzq_args[6];
+    volatile unsigned int _zzq_result;
+    _zzq_args[0] = (uintptr_t)(_zzq_request);
+    _zzq_args[1] = (uintptr_t)(_zzq_arg1);
+    _zzq_args[2] = (uintptr_t)(_zzq_arg2);
+    _zzq_args[3] = (uintptr_t)(_zzq_arg3);
+    _zzq_args[4] = (uintptr_t)(_zzq_arg4);
+    _zzq_args[5] = (uintptr_t)(_zzq_arg5);
+    __asm { __asm lea eax, _zzq_args __asm mov edx, _zzq_default
+            __SPECIAL_INSTRUCTION_PREAMBLE
+            /* %EDX = client_request ( %EAX ) */
+            __asm xchg ebx,ebx
+            __asm mov _zzq_result, edx
+    }
+    return _zzq_result;
+}
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    volatile unsigned int __addr;                                 \
+    __asm { __SPECIAL_INSTRUCTION_PREAMBLE                        \
+            /* %EAX = guest_NRADDR */                             \
+            __asm xchg ecx,ecx                                    \
+            __asm mov __addr, eax                                 \
+    }                                                             \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_CALL_NOREDIR_EAX ERROR
+
+#else
+#error Unsupported compiler.
+#endif
+
+#endif /* PLAT_x86_win32 */
+
+/* ------------------------ amd64-{linux,darwin} --------------- */
+
+#if defined(PLAT_amd64_linux)  ||  defined(PLAT_amd64_darwin)
+
+typedef
+   struct { 
+      unsigned long long int nraddr; /* where's the code? */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE                            \
+                     "rolq $3,  %%rdi ; rolq $13, %%rdi\n\t"      \
+                     "rolq $61, %%rdi ; rolq $51, %%rdi\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+    __extension__                                                 \
+    ({ volatile unsigned long long int _zzq_args[6];              \
+    volatile unsigned long long int _zzq_result;                  \
+    _zzq_args[0] = (unsigned long long int)(_zzq_request);        \
+    _zzq_args[1] = (unsigned long long int)(_zzq_arg1);           \
+    _zzq_args[2] = (unsigned long long int)(_zzq_arg2);           \
+    _zzq_args[3] = (unsigned long long int)(_zzq_arg3);           \
+    _zzq_args[4] = (unsigned long long int)(_zzq_arg4);           \
+    _zzq_args[5] = (unsigned long long int)(_zzq_arg5);           \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %RDX = client_request ( %RAX ) */         \
+                     "xchgq %%rbx,%%rbx"                          \
+                     : "=d" (_zzq_result)                         \
+                     : "a" (&_zzq_args[0]), "0" (_zzq_default)    \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_result;                                                  \
+    })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval)                       \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    volatile unsigned long long int __addr;                       \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %RAX = guest_NRADDR */                    \
+                     "xchgq %%rcx,%%rcx"                          \
+                     : "=a" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_CALL_NOREDIR_RAX /* expands to adjacent asm string literals */ \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* call-noredir *%RAX */                     \
+                     "xchgq %%rdx,%%rdx\n\t"
+#endif /* PLAT_amd64_linux || PLAT_amd64_darwin */
+
+/* ------------------------ ppc32-linux ------------------------ */
+
+#if defined(PLAT_ppc32_linux)
+
+typedef
+   struct { 
+      unsigned int nraddr; /* where's the code? (filled in by VALGRIND_GET_NR_CONTEXT) */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* emitted ahead of each request opcode below */ \
+                     "rlwinm 0,0,3,0,0  ; rlwinm 0,0,13,0,0\n\t"  \
+                     "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+  /* Pack request + 5 args into _zzq_args, issue request, yield result */ \
+    __extension__                                                 \
+  ({         unsigned int  _zzq_args[6];                          \
+             unsigned int  _zzq_result;                           \
+             unsigned int* _zzq_ptr;                              \
+    _zzq_args[0] = (unsigned int)(_zzq_request);  /* slot 0: request code */ \
+    _zzq_args[1] = (unsigned int)(_zzq_arg1);                     \
+    _zzq_args[2] = (unsigned int)(_zzq_arg2);                     \
+    _zzq_args[3] = (unsigned int)(_zzq_arg3);                     \
+    _zzq_args[4] = (unsigned int)(_zzq_arg4);                     \
+    _zzq_args[5] = (unsigned int)(_zzq_arg5);                     \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile("mr 3,%1\n\t" /*default*/                    \
+                     "mr 4,%2\n\t" /*ptr*/                        \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1\n\t"                               \
+                     "mr %0,3"     /*result*/                     \
+                     : "=b" (_zzq_result)                         \
+                     : "b" (_zzq_default), "b" (_zzq_ptr)         \
+                     : "cc", "memory", "r3", "r4");  /* r3/r4 are written above */ \
+    _zzq_result;  /* value of the whole statement expression */   \
+    })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) /* _zzq_rlval: OrigFn lvalue */ \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    unsigned int __addr;                                          \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2\n\t"                               \
+                     "mr %0,3"  /* copy r3 into the output operand */ \
+                     : "=b" (__addr)                              \
+                     :          /* no input operands */           \
+                     : "cc", "memory", "r3"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;  /* record it in the caller's OrigFn */ \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 /* expands to asm string literals */ \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R11 */       \
+                     "or 3,3,3\n\t"
+#endif /* PLAT_ppc32_linux */
+
+/* ------------------------ ppc64-linux ------------------------ */
+
+#if defined(PLAT_ppc64_linux)
+
+typedef
+   struct { 
+      unsigned long long int nraddr; /* where's the code? (guest_NRADDR) */
+      unsigned long long int r2;  /* what tocptr do we need? (guest_NRADDR_GPR2) */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* emitted ahead of each request opcode below */ \
+                     "rotldi 0,0,3  ; rotldi 0,0,13\n\t"          \
+                     "rotldi 0,0,61 ; rotldi 0,0,51\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+  /* Pack request + 5 args into _zzq_args, issue request, yield result */ \
+  __extension__                                                   \
+  ({         unsigned long long int  _zzq_args[6];                \
+             unsigned long long int  _zzq_result;                 \
+             unsigned long long int* _zzq_ptr;                    \
+    _zzq_args[0] = (unsigned long long int)(_zzq_request);  /* slot 0: request code */ \
+    _zzq_args[1] = (unsigned long long int)(_zzq_arg1);           \
+    _zzq_args[2] = (unsigned long long int)(_zzq_arg2);           \
+    _zzq_args[3] = (unsigned long long int)(_zzq_arg3);           \
+    _zzq_args[4] = (unsigned long long int)(_zzq_arg4);           \
+    _zzq_args[5] = (unsigned long long int)(_zzq_arg5);           \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile("mr 3,%1\n\t" /*default*/                    \
+                     "mr 4,%2\n\t" /*ptr*/                        \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1\n\t"                               \
+                     "mr %0,3"     /*result*/                     \
+                     : "=b" (_zzq_result)                         \
+                     : "b" (_zzq_default), "b" (_zzq_ptr)         \
+                     : "cc", "memory", "r3", "r4");  /* r3/r4 are written above */ \
+    _zzq_result;  /* value of the whole statement expression */   \
+  })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) /* _zzq_rlval: OrigFn lvalue */ \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    unsigned long long int __addr;  /* reused by both asm blocks */ \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory", "r3"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;  /* first request: code address */ \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR_GPR2 */                \
+                     "or 4,4,4\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory", "r3"                       \
+                    );                                            \
+    _zzq_orig->r2 = __addr;  /* second request: value for OrigFn.r2 */ \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 /* expands to asm string literals */ \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R11 */       \
+                     "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc64_linux */
+
+/* ------------------------- arm-linux ------------------------- */
+
+#if defined(PLAT_arm_linux)
+
+typedef
+   struct { 
+      unsigned int nraddr; /* where's the code? (filled in by VALGRIND_GET_NR_CONTEXT) */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* emitted ahead of each request opcode below */ \
+            "mov r12, r12, ror #3  ; mov r12, r12, ror #13 \n\t"  \
+            "mov r12, r12, ror #29 ; mov r12, r12, ror #19 \n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+  /* Pack request + 5 args into _zzq_args, issue request, yield result */ \
+  __extension__                                                   \
+  ({volatile unsigned int  _zzq_args[6];                          \
+    volatile unsigned int  _zzq_result;                           \
+    _zzq_args[0] = (unsigned int)(_zzq_request);  /* slot 0: request code */ \
+    _zzq_args[1] = (unsigned int)(_zzq_arg1);                     \
+    _zzq_args[2] = (unsigned int)(_zzq_arg2);                     \
+    _zzq_args[3] = (unsigned int)(_zzq_arg3);                     \
+    _zzq_args[4] = (unsigned int)(_zzq_arg4);                     \
+    _zzq_args[5] = (unsigned int)(_zzq_arg5);                     \
+    __asm__ volatile("mov r3, %1\n\t" /*default*/                 \
+                     "mov r4, %2\n\t" /*ptr*/                     \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* R3 = client_request ( R4 ) */             \
+                     "orr r10, r10, r10\n\t"                      \
+                     "mov %0, r3"     /*result*/                  \
+                     : "=r" (_zzq_result)                         \
+                     : "r" (_zzq_default), "r" (&_zzq_args[0])    \
+                     : "cc","memory", "r3", "r4");  /* r3/r4 are written above */ \
+    _zzq_result;  /* value of the whole statement expression */   \
+  })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) /* _zzq_rlval: OrigFn lvalue */ \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    unsigned int __addr;                                          \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* R3 = guest_NRADDR */                      \
+                     "orr r11, r11, r11\n\t"                      \
+                     "mov %0, r3"  /* copy r3 into the output operand */ \
+                     : "=r" (__addr)                              \
+                     :             /* no input operands */        \
+                     : "cc", "memory", "r3"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;  /* record it in the caller's OrigFn */ \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 /* expands to asm string literals */ \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R4 */        \
+                     "orr r12, r12, r12\n\t"
+
+#endif /* PLAT_arm_linux */
+
+/* ------------------------ s390x-linux ------------------------ */
+
+#if defined(PLAT_s390x_linux)
+
+typedef
+  struct {
+     unsigned long long int nraddr; /* where's the code? (filled in by VALGRIND_GET_NR_CONTEXT) */
+  }
+  OrigFn;
+
+/* __SPECIAL_INSTRUCTION_PREAMBLE will be used to identify Valgrind specific
+ * code. This detection is implemented in platform specific toIR.c
+ * (e.g. VEX/priv/guest_s390_decoder.c).
+ */
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* followed by one of the __*_CODE strings */ \
+                     "lr 15,15\n\t"                              \
+                     "lr 1,1\n\t"                                \
+                     "lr 2,2\n\t"                                \
+                     "lr 3,3\n\t"
+
+#define __CLIENT_REQUEST_CODE "lr 2,2\n\t"  /* used by VALGRIND_DO_CLIENT_REQUEST_EXPR */
+#define __GET_NR_CONTEXT_CODE "lr 3,3\n\t"  /* used by VALGRIND_GET_NR_CONTEXT */
+#define __CALL_NO_REDIR_CODE  "lr 4,4\n\t"  /* used by VALGRIND_CALL_NOREDIR_R1 */
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                         \
+       _zzq_default, _zzq_request,                               \
+       _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+  __extension__  /* statement expression; yields _zzq_result */  \
+ ({volatile unsigned long long int _zzq_args[6];                 \
+   volatile unsigned long long int _zzq_result;                  \
+   _zzq_args[0] = (unsigned long long int)(_zzq_request);  /* slot 0: request code */ \
+   _zzq_args[1] = (unsigned long long int)(_zzq_arg1);           \
+   _zzq_args[2] = (unsigned long long int)(_zzq_arg2);           \
+   _zzq_args[3] = (unsigned long long int)(_zzq_arg3);           \
+   _zzq_args[4] = (unsigned long long int)(_zzq_arg4);           \
+   _zzq_args[5] = (unsigned long long int)(_zzq_arg5);           \
+   __asm__ volatile(/* r2 = args */                              \
+                    "lgr 2,%1\n\t"                               \
+                    /* r3 = default */                           \
+                    "lgr 3,%2\n\t"                               \
+                    __SPECIAL_INSTRUCTION_PREAMBLE               \
+                    __CLIENT_REQUEST_CODE                        \
+                    /* results = r3 */                           \
+                    "lgr %0, 3\n\t"                              \
+                    : "=d" (_zzq_result)                         \
+                    : "a" (&_zzq_args[0]), "0" (_zzq_default)  /* "0": tied to operand 0 */ \
+                    : "cc", "2", "3", "memory"  /* r2/r3 are written above */ \
+                   );                                            \
+   _zzq_result;  /* value of the whole statement expression */   \
+ })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) /* _zzq_rlval: OrigFn lvalue */ \
+ { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+   volatile unsigned long long int __addr;                       \
+   __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                    __GET_NR_CONTEXT_CODE                        \
+                    "lgr %0, 3\n\t"  /* copy r3 into the output operand */ \
+                    : "=a" (__addr)                              \
+                    :                /* no input operands */     \
+                    : "cc", "3", "memory"                        \
+                   );                                            \
+   _zzq_orig->nraddr = __addr;  /* record it in the caller's OrigFn */ \
+ }
+
+#define VALGRIND_CALL_NOREDIR_R1 /* expands to adjacent asm string literals */ \
+                    __SPECIAL_INSTRUCTION_PREAMBLE               \
+                    __CALL_NO_REDIR_CODE
+
+#endif /* PLAT_s390x_linux */
+
+/* Insert assembly code for other platforms here... */
+
+#endif /* NVALGRIND */
+
+
+/* ------------------------------------------------------------------ */
+/* PLATFORM SPECIFICS for FUNCTION WRAPPING.  This is all very        */
+/* ugly.  It's the least-worst tradeoff I can think of.               */
+/* ------------------------------------------------------------------ */
+
+/* This section defines magic (a.k.a. appalling-hack) macros for doing
+   guaranteed-no-redirection calls, so as to get from function
+   wrappers to the functions they are wrapping.  The whole point is to
+   construct standard call sequences, but to do the call itself with a
+   special no-redirect call pseudo-instruction that the JIT
+   understands and handles specially.  This section is long and
+   repetitious, and I can't see a way to make it shorter.
+
+   The naming scheme is as follows:
+
+      CALL_FN_{W,v}_{v,W,WW,WWW,WWWW,5W,6W,7W,etc}
+
+   'W' stands for "word" and 'v' for "void".  Hence there are
+   different macros for calling arity 0, 1, 2, 3, 4, etc, functions,
+   and for each, the possibility of returning a word-typed result, or
+   no result.
+*/
+
+/* Use these to write the name of your wrapper.  NOTE: duplicates
+   VG_WRAP_FUNCTION_Z{U,Z} in pub_tool_redir.h.  NOTE also: inserts
+   the default behaviour equivalence class tag "0000" into the name.
+   See pub_tool_redir.h for details -- normally you don't need to
+   think about this, though. */
+
+/* Use an extra level of macroisation so as to ensure the soname/fnname
+   args are fully macro-expanded before pasting them together. */
+#define VG_CONCAT4(_aa,_bb,_cc,_dd) _aa##_bb##_cc##_dd  /* pastes the four pieces into one token */
+
+#define I_WRAP_SONAME_FNNAME_ZU(soname,fnname) /* -> _vgw00000ZU_<soname>_<fnname> */ \
+   VG_CONCAT4(_vgw00000ZU_,soname,_,fnname)
+
+#define I_WRAP_SONAME_FNNAME_ZZ(soname,fnname) /* -> _vgw00000ZZ_<soname>_<fnname> */ \
+   VG_CONCAT4(_vgw00000ZZ_,soname,_,fnname)
+
+/* Use this macro from within a wrapper function to collect the
+   context (address and possibly other info) of the original function.
+   Once you have that you can then use it in one of the CALL_FN_
+   macros.  The type of the argument _lval is OrigFn. */
+#define VALGRIND_GET_ORIG_FN(_lval)  VALGRIND_GET_NR_CONTEXT(_lval)  /* plain alias for the per-platform macro */
+
+/* Derivatives of the main macros below, for calling functions
+   returning void. */
+
+#define CALL_FN_v_v(fnptr)                                        \
+   do { volatile unsigned long _junk;  /* receives and drops the result */ \
+        CALL_FN_W_v(_junk,fnptr); } while (0)
+
+#define CALL_FN_v_W(fnptr, arg1)                                  \
+   do { volatile unsigned long _junk;  /* receives and drops the result */ \
+        CALL_FN_W_W(_junk,fnptr,arg1); } while (0)
+
+#define CALL_FN_v_WW(fnptr, arg1,arg2)                            \
+   do { volatile unsigned long _junk;  /* receives and drops the result */ \
+        CALL_FN_W_WW(_junk,fnptr,arg1,arg2); } while (0)
+
+#define CALL_FN_v_WWW(fnptr, arg1,arg2,arg3)                      \
+   do { volatile unsigned long _junk;  /* receives and drops the result */ \
+        CALL_FN_W_WWW(_junk,fnptr,arg1,arg2,arg3); } while (0)
+
+#define CALL_FN_v_WWWW(fnptr, arg1,arg2,arg3,arg4)                \
+   do { volatile unsigned long _junk;  /* receives and drops the result */ \
+        CALL_FN_W_WWWW(_junk,fnptr,arg1,arg2,arg3,arg4); } while (0)
+
+#define CALL_FN_v_5W(fnptr, arg1,arg2,arg3,arg4,arg5)             \
+   do { volatile unsigned long _junk;  /* receives and drops the result */ \
+        CALL_FN_W_5W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5); } while (0)
+
+#define CALL_FN_v_6W(fnptr, arg1,arg2,arg3,arg4,arg5,arg6)        \
+   do { volatile unsigned long _junk;  /* receives and drops the result */ \
+        CALL_FN_W_6W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5,arg6); } while (0)
+
+#define CALL_FN_v_7W(fnptr, arg1,arg2,arg3,arg4,arg5,arg6,arg7)   \
+   do { volatile unsigned long _junk;  /* receives and drops the result */ \
+        CALL_FN_W_7W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5,arg6,arg7); } while (0)
+
+/* ------------------------- x86-{linux,darwin} ---------------- */
+
+#if defined(PLAT_x86_linux)  ||  defined(PLAT_x86_darwin)
+
+/* These regs are trashed by the hidden call.  There is no need to mention
+   eax, as gcc can already see that; listing it also causes gcc to bomb. */
+#define __CALLER_SAVED_REGS /*"eax"*/ "ecx", "edx"
+
+/* These CALL_FN_ macros assume that on x86-linux, sizeof(unsigned
+   long) == 4. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do {  /* call orig.nraddr with no args; result goes to lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* slot 0: call target */ \
+      __asm__ volatile(                                           \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;  /* cast to lval's own type */ \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do {  /* call orig.nraddr with 1 word arg; result goes to lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* slot 0: call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         "subl $12, %%esp\n\t"  /* 12 pad + 4 pushed = 16 bytes */ \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $16, %%esp\n"  /* release pad and pushed arg */    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;  /* cast to lval's own type */ \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do {  /* call orig.nraddr with 2 word args; result goes to lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* slot 0: call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         "subl $8, %%esp\n\t"  /* 8 pad + 8 pushed = 16 bytes */  \
+         "pushl 8(%%eax)\n\t"  /* args are pushed last-to-first */ \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $16, %%esp\n"  /* release pad and pushed args */   \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;  /* cast to lval's own type */ \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do {  /* call orig.nraddr with 3 word args; result goes to lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* slot 0: call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         "subl $4, %%esp\n\t"  /* 4 pad + 12 pushed = 16 bytes */ \
+         "pushl 12(%%eax)\n\t"  /* args are pushed last-to-first */ \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $16, %%esp\n"  /* release pad and pushed args */   \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;  /* cast to lval's own type */ \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do {  /* call orig.nraddr with 4 word args; result goes to lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* slot 0: call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         "pushl 16(%%eax)\n\t"  /* last-to-first; 16 bytes, no pad */ \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $16, %%esp\n"  /* release the 4 pushed args */     \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;  /* cast to lval's own type */ \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do {  /* call orig.nraddr with 5 word args; result goes to lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* slot 0: call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile(                                           \
+         "subl $12, %%esp\n\t"  /* 12 pad + 20 pushed = 32 bytes */ \
+         "pushl 20(%%eax)\n\t"  /* args are pushed last-to-first */ \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $32, %%esp\n"  /* release pad and pushed args */   \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;  /* cast to lval's own type */ \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do {  /* call orig.nraddr with 6 word args; result goes to lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* slot 0: call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         "subl $8, %%esp\n\t"  /* 8 pad + 24 pushed = 32 bytes */ \
+         "pushl 24(%%eax)\n\t"  /* args are pushed last-to-first */ \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $32, %%esp\n"  /* release pad and pushed args */   \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;  /* cast to lval's own type */ \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do {  /* call orig.nraddr with 7 word args; result goes to lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* slot 0: call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile(                                           \
+         "subl $4, %%esp\n\t"  /* 4 pad + 28 pushed = 32 bytes */ \
+         "pushl 28(%%eax)\n\t"  /* args are pushed last-to-first */ \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $32, %%esp\n"  /* release pad and pushed args */   \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;  /* cast to lval's own type */ \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {  /* call orig.nraddr with 8 word args; result goes to lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* slot 0: call target */ \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         "pushl 32(%%eax)\n\t"  /* last-to-first; 32 bytes, no pad */ \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $32, %%esp\n"  /* release the 8 pushed args */     \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;  /* cast to lval's own type */ \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {  /* x86: call orig with 9 word args, all on stack */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile(                                           \
+         "subl $12, %%esp\n\t"  /* 12 pad + 36 = 48 (align?) */   \
+         "pushl 36(%%eax)\n\t"  /* push arg9..arg1, R-to-L */     \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $48, %%esp\n"  /* pop pad + 9 args */              \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do {  /* x86: call orig with 10 word args, all on stack */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      __asm__ volatile(                                           \
+         "subl $8, %%esp\n\t"  /* 8 pad + 40 = 48 (align?) */     \
+         "pushl 40(%%eax)\n\t"  /* push arg10..arg1, R-to-L */    \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $48, %%esp\n"  /* pop pad + 10 args */             \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,       \
+                                  arg6,arg7,arg8,arg9,arg10,      \
+                                  arg11)                          \
+   do {  /* x86: call orig with 11 word args, all on stack */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      __asm__ volatile(                                           \
+         "subl $4, %%esp\n\t"  /* 4 pad + 44 = 48 (align?) */     \
+         "pushl 44(%%eax)\n\t"  /* push arg11..arg1, R-to-L */    \
+         "pushl 40(%%eax)\n\t"                                    \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $48, %%esp\n"  /* pop pad + 11 args */             \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,       \
+                                  arg6,arg7,arg8,arg9,arg10,      \
+                                  arg11,arg12)                    \
+   do {  /* x86: call orig with 12 word args, all on stack */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      _argvec[12] = (unsigned long)(arg12);                       \
+      __asm__ volatile(                                           \
+         "pushl 48(%%eax)\n\t"  /* push arg12..arg1, R-to-L */    \
+         "pushl 44(%%eax)\n\t"                                    \
+         "pushl 40(%%eax)\n\t"                                    \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $48, %%esp\n"  /* pop 12 args * 4 bytes */         \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_x86_linux || PLAT_x86_darwin */
+
+/* ------------------------ amd64-{linux,darwin} --------------- */
+
+#if defined(PLAT_amd64_linux)  ||  defined(PLAT_amd64_darwin)
+
+/* ARGREGS: rdi rsi rdx rcx r8 r9 (the rest on stack in R-to-L order) */
+
+/* Regs trashed by the hidden call; rax is omitted (it is the macros' "=a" output). */
+#define __CALLER_SAVED_REGS /*"rax",*/ "rcx", "rdx", "rsi",       \
+                            "rdi", "r8", "r9", "r10", "r11"
+
+/* This is all pretty complex.  It's so as to make stack unwinding
+   work reliably.  See bug 243270.  The basic problem is the sub and
+   add of 128 of %rsp in all of the following macros.  If gcc believes
+   the CFA is in %rsp, then unwinding may fail, because what's at the
+   CFA is not what gcc "expected" when it constructs the CFIs for the
+   places where the macros are instantiated.
+
+   But we can't just add a CFI annotation to increase the CFA offset
+   by 128, to match the sub of 128 from %rsp, because we don't know
+   whether gcc has chosen %rsp as the CFA at that point, or whether it
+   has chosen some other register (eg, %rbp).  In the latter case,
+   adding a CFI annotation to change the CFA offset is simply wrong.
+
+   So the solution is to get hold of the CFA using
+   __builtin_dwarf_cfa(), put it in a known register, and add a
+   CFI annotation to say what the register is.  We choose %rbp for
+   this (perhaps perversely), because:
+
+   (1) %rbp is already subject to unwinding.  If a new register was
+       chosen then the unwinder would have to unwind it in all stack
+       traces, which is expensive, and
+
+   (2) %rbp is already subject to precise exception updates in the
+       JIT.  If a new register was chosen, we'd have to have precise
+       exceptions for it too, which reduces performance of the
+       generated code.
+
+   However .. one extra complication.  We can't just whack the result
+   of __builtin_dwarf_cfa() into %rbp and then add %rbp to the
+   list of trashed registers at the end of the inline assembly
+   fragments; gcc won't allow %rbp to appear in that list.  Hence
+   instead we need to stash %rbp in %r15 for the duration of the asm,
+   and say that %r15 is trashed instead.  gcc seems happy to go with
+   that.
+
+   Oh .. and this all needs to be conditionalised so that it is
+   unchanged from before this commit, when compiled with older gccs
+   that don't support __builtin_dwarf_cfa.  Furthermore, since
+   this header file is freestanding, it has to be independent of
+   config.h, and so the following conditionalisation cannot depend on
+   configure time checks.
+
+   Although it's not clear from
+   'defined(__GNUC__) && defined(__GCC_HAVE_DWARF2_CFI_ASM)',
+   this expression excludes Darwin.
+   .cfi directives in Darwin assembly appear to be completely
+   different and I haven't investigated how they work.
+
+   For even more entertainment value, note we have to use the
+   completely undocumented __builtin_dwarf_cfa(), which appears to
+   really compute the CFA, whereas __builtin_frame_address(0) claims
+   to but actually doesn't.  See
+   https://bugs.kde.org/show_bug.cgi?id=243270#c47
+*/
+#if defined(__GNUC__) && defined(__GCC_HAVE_DWARF2_CFI_ASM)
+#  define __FRAME_POINTER  /* extra asm input: CFA (%2) */        \
+      ,"r"(__builtin_dwarf_cfa())
+#  define VALGRIND_CFI_PROLOGUE                                   \
+      "movq %%rbp, %%r15\n\t"  /* stash caller's rbp */           \
+      "movq %2, %%rbp\n\t"  /* rbp = CFA */                       \
+      ".cfi_remember_state\n\t"                                   \
+      ".cfi_def_cfa rbp, 0\n\t"  /* CFA is now rbp+0 */
+#  define VALGRIND_CFI_EPILOGUE                                   \
+      "movq %%r15, %%rbp\n\t"  /* restore caller's rbp */         \
+      ".cfi_restore_state\n\t"
+#else  /* no CFI asm support: all three expand to nothing */
+#  define __FRAME_POINTER
+#  define VALGRIND_CFI_PROLOGUE
+#  define VALGRIND_CFI_EPILOGUE
+#endif /* __GNUC__ && __GCC_HAVE_DWARF2_CFI_ASM */
+
+
+/* These CALL_FN_ macros assume that on amd64-linux, sizeof(unsigned
+   long) == 8. */
+
+/* NB 9 Sept 07.  There is a nasty kludge here in all these CALL_FN_
+   macros.  In order not to trash the stack redzone, we need to drop
+   %rsp by 128 before the hidden call, and restore afterwards.  The
+   nastiness is that it is only by luck that the stack still appears
+   to be unwindable during the hidden call - since then the behaviour
+   of any routine using this macro does not match what the CFI data
+   says.  Sigh.
+
+   Why is this important?  Imagine that a wrapper has a stack
+   allocated local, and passes to the hidden call, a pointer to it.
+   Because gcc does not know about the hidden call, it may allocate
+   that local in the redzone.  Unfortunately the hidden call may then
+   trash it before it comes to use it.  So we must step clear of the
+   redzone, for the duration of the hidden call, to make it safe.
+
+   Probably the same problem afflicts the other redzone-style ABIs too
+   (ppc64-linux); but for those, the stack is
+   self describing (none of this CFI nonsense) so at least messing
+   with the stack pointer doesn't give a danger of non-unwindable
+   stack. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do {  /* amd64: call orig with no args */                      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"  /* step over redzone */           \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo redzone skip */           \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do {  /* amd64: call orig, arg1 in rdi */                      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"  /* step over redzone */           \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo redzone skip */           \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do {  /* amd64: call orig, args in rdi rsi */                  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"  /* step over redzone */           \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo redzone skip */           \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do {  /* amd64: call orig, args in rdi rsi rdx */              \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"  /* step over redzone */           \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo redzone skip */           \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do {  /* amd64: call orig, args in rdi rsi rdx rcx */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"  /* step over redzone */           \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo redzone skip */           \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do {  /* amd64: call orig, args in rdi rsi rdx rcx r8 */       \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"  /* step over redzone */           \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo redzone skip */           \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do {  /* amd64: call orig, args in rdi rsi rdx rcx r8 r9 */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"  /* step over redzone */           \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo redzone skip */           \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do {  /* amd64: 6 args in regs, arg7 on stack */               \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $136,%%rsp\n\t"  /* redzone 128 + 8 (align?) */    \
+         "pushq 56(%%rax)\n\t"  /* arg7 */                        \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $8, %%rsp\n"  /* pop arg7 */                       \
+         "addq $136,%%rsp\n\t"  /* undo the subq above */         \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {  /* amd64: 6 args in regs, arg7-8 on stack */             \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;  /* target */     \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"  /* step over redzone */           \
+         "pushq 64(%%rax)\n\t"  /* arg8 */                        \
+         "pushq 56(%%rax)\n\t"  /* arg7 */                        \
+         "movq 48(%%rax), %%r9\n\t"                               \
+         "movq 40(%%rax), %%r8\n\t"                               \
+         "movq 32(%%rax), %%rcx\n\t"                              \
+         "movq 24(%%rax), %%rdx\n\t"                              \
+         "movq 16(%%rax), %%rsi\n\t"                              \
+         "movq 8(%%rax), %%rdi\n\t"                               \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $16, %%rsp\n"  /* pop arg7, arg8 */                \
+         "addq $128,%%rsp\n\t"  /* undo redzone skip */           \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $136,%%rsp\n\t"       /* 136+3*8 = 160 = 10*16 */  \
+         "pushq 72(%%rax)\n\t"       /* arg9 -> stack */          \
+         "pushq 64(%%rax)\n\t"       /* arg8 -> stack */          \
+         "pushq 56(%%rax)\n\t"       /* arg7 -> stack */          \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6 -> %r9 */            \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5 -> %r8 */            \
+         "movq 32(%%rax), %%rcx\n\t" /* arg4 -> %rcx */           \
+         "movq 24(%%rax), %%rdx\n\t" /* arg3 -> %rdx */           \
+         "movq 16(%%rax), %%rsi\n\t" /* arg2 -> %rsi */           \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1 -> %rdi */           \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $24, %%rsp\n"         /* drop 3 stack args */      \
+         "addq $136,%%rsp\n\t"       /* undo the subq */          \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"       /* 128+4*8 = 160 = 10*16 */  \
+         "pushq 80(%%rax)\n\t"       /* arg10 -> stack */         \
+         "pushq 72(%%rax)\n\t"       /* arg9 -> stack */          \
+         "pushq 64(%%rax)\n\t"       /* arg8 -> stack */          \
+         "pushq 56(%%rax)\n\t"       /* arg7 -> stack */          \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6 -> %r9 */            \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5 -> %r8 */            \
+         "movq 32(%%rax), %%rcx\n\t" /* arg4 -> %rcx */           \
+         "movq 24(%%rax), %%rdx\n\t" /* arg3 -> %rdx */           \
+         "movq 16(%%rax), %%rsi\n\t" /* arg2 -> %rsi */           \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1 -> %rdi */           \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $32, %%rsp\n"         /* drop 4 stack args */      \
+         "addq $128,%%rsp\n\t"       /* undo the subq */          \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $136,%%rsp\n\t"       /* 136+5*8 = 176 = 11*16 */  \
+         "pushq 88(%%rax)\n\t"       /* arg11 -> stack */         \
+         "pushq 80(%%rax)\n\t"       /* arg10 -> stack */         \
+         "pushq 72(%%rax)\n\t"       /* arg9 -> stack */          \
+         "pushq 64(%%rax)\n\t"       /* arg8 -> stack */          \
+         "pushq 56(%%rax)\n\t"       /* arg7 -> stack */          \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6 -> %r9 */            \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5 -> %r8 */            \
+         "movq 32(%%rax), %%rcx\n\t" /* arg4 -> %rcx */           \
+         "movq 24(%%rax), %%rdx\n\t" /* arg3 -> %rdx */           \
+         "movq 16(%%rax), %%rsi\n\t" /* arg2 -> %rsi */           \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1 -> %rdi */           \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $40, %%rsp\n"         /* drop 5 stack args */      \
+         "addq $136,%%rsp\n\t"       /* undo the subq */          \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      _argvec[12] = (unsigned long)(arg12);                       \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"       /* 128+6*8 = 176 = 11*16 */  \
+         "pushq 96(%%rax)\n\t"       /* arg12 -> stack */         \
+         "pushq 88(%%rax)\n\t"       /* arg11 -> stack */         \
+         "pushq 80(%%rax)\n\t"       /* arg10 -> stack */         \
+         "pushq 72(%%rax)\n\t"       /* arg9 -> stack */          \
+         "pushq 64(%%rax)\n\t"       /* arg8 -> stack */          \
+         "pushq 56(%%rax)\n\t"       /* arg7 -> stack */          \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6 -> %r9 */            \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5 -> %r8 */            \
+         "movq 32(%%rax), %%rcx\n\t" /* arg4 -> %rcx */           \
+         "movq 24(%%rax), %%rdx\n\t" /* arg3 -> %rdx */           \
+         "movq 16(%%rax), %%rsi\n\t" /* arg2 -> %rsi */           \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1 -> %rdi */           \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $48, %%rsp\n"         /* drop 6 stack args */      \
+         "addq $128,%%rsp\n\t"       /* undo the subq */          \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_amd64_linux || PLAT_amd64_darwin */
+
+/* ------------------------ ppc32-linux ------------------------ */
+
+#if defined(PLAT_ppc32_linux)
+
+/* This is useful for finding out about the on-stack stuff:
+
+   extern int f9  ( int,int,int,int,int,int,int,int,int );
+   extern int f10 ( int,int,int,int,int,int,int,int,int,int );
+   extern int f11 ( int,int,int,int,int,int,int,int,int,int,int );
+   extern int f12 ( int,int,int,int,int,int,int,int,int,int,int,int );
+
+   int g9 ( void ) {
+      return f9(11,22,33,44,55,66,77,88,99);
+   }
+   int g10 ( void ) {
+      return f10(11,22,33,44,55,66,77,88,99,110);
+   }
+   int g11 ( void ) {
+      return f11(11,22,33,44,55,66,77,88,99,110,121);
+   }
+   int g12 ( void ) {
+      return f12(11,22,33,44,55,66,77,88,99,110,121,132);
+   }
+*/
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS   /* used in the asm clobber lists */  \
+   "lr", "ctr", "xer",                                            \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13"  /* r1 is not listed */
+
+/* These CALL_FN_ macros assume that on ppc32-linux, 
+   sizeof(unsigned long) == 4. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3 -> _res */                     \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3 -> _res */                     \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3 -> _res */                     \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3 -> _res */                     \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3 -> _res */                     \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"  /* arg5->r7 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3 -> _res */                     \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"  /* arg5->r7 */                       \
+         "lwz 8,24(11)\n\t"  /* arg6->r8 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3 -> _res */                     \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"  /* arg5->r7 */                       \
+         "lwz 8,24(11)\n\t"  /* arg6->r8 */                       \
+         "lwz 9,28(11)\n\t"  /* arg7->r9 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3 -> _res */                     \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"  /* arg5->r7 */                       \
+         "lwz 8,24(11)\n\t"  /* arg6->r8 */                       \
+         "lwz 9,28(11)\n\t"  /* arg7->r9 */                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3 -> _res */                     \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "addi 1,1,-16\n\t"  /* r1 -= 16 */                       \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"  /* r3 = arg9 */                      \
+         "stw 3,8(1)\n\t"    /* r3 -> 8(r1) */                    \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"  /* arg5->r7 */                       \
+         "lwz 8,24(11)\n\t"  /* arg6->r8 */                       \
+         "lwz 9,28(11)\n\t"  /* arg7->r9 */                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,16\n\t"   /* r1 += 16 */                       \
+         "mr %0,3"           /* r3 -> _res */                     \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do { /* call _orig.nraddr with 10 word args; result -> lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11]; /* target + 10 args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      _argvec[7] = (unsigned long)arg7;                           \
+      _argvec[8] = (unsigned long)arg8;                           \
+      _argvec[9] = (unsigned long)arg9;                           \
+      _argvec[10] = (unsigned long)arg10;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "addi 1,1,-16\n\t"  /* r1 -= 16: room for args 9-10 */   \
+         /* arg10 -> 12(r1) */                                    \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,12(1)\n\t"                                        \
+         /* arg9 -> 8(r1) */                                      \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"                                         \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"  /* arg5->r7 */                       \
+         "lwz 8,24(11)\n\t"  /* arg6->r8 */                       \
+         "lwz 9,28(11)\n\t"  /* arg7->r9 */                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,16\n\t"   /* r1 += 16: undo adjustment */      \
+         "mr %0,3"           /* _res = r3 */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do { /* call _orig.nraddr with 11 word args; result -> lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12]; /* target + 11 args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      _argvec[7] = (unsigned long)arg7;                           \
+      _argvec[8] = (unsigned long)arg8;                           \
+      _argvec[9] = (unsigned long)arg9;                           \
+      _argvec[10] = (unsigned long)arg10;                         \
+      _argvec[11] = (unsigned long)arg11;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "addi 1,1,-32\n\t"  /* r1 -= 32: room for args 9-11 */   \
+         /* arg11 -> 16(r1) */                                    \
+         "lwz 3,44(11)\n\t"                                       \
+         "stw 3,16(1)\n\t"                                        \
+         /* arg10 -> 12(r1) */                                    \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,12(1)\n\t"                                        \
+         /* arg9 -> 8(r1) */                                      \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"                                         \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"  /* arg5->r7 */                       \
+         "lwz 8,24(11)\n\t"  /* arg6->r8 */                       \
+         "lwz 9,28(11)\n\t"  /* arg7->r9 */                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,32\n\t"   /* r1 += 32: undo adjustment */      \
+         "mr %0,3"           /* _res = r3 */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do { /* call _orig.nraddr with 12 word args; result -> lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13]; /* target + 12 args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */ \
+      _argvec[1] = (unsigned long)arg1;                           \
+      _argvec[2] = (unsigned long)arg2;                           \
+      _argvec[3] = (unsigned long)arg3;                           \
+      _argvec[4] = (unsigned long)arg4;                           \
+      _argvec[5] = (unsigned long)arg5;                           \
+      _argvec[6] = (unsigned long)arg6;                           \
+      _argvec[7] = (unsigned long)arg7;                           \
+      _argvec[8] = (unsigned long)arg8;                           \
+      _argvec[9] = (unsigned long)arg9;                           \
+      _argvec[10] = (unsigned long)arg10;                         \
+      _argvec[11] = (unsigned long)arg11;                         \
+      _argvec[12] = (unsigned long)arg12;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "addi 1,1,-32\n\t"  /* r1 -= 32: room for args 9-12 */   \
+         /* arg12 -> 20(r1) */                                    \
+         "lwz 3,48(11)\n\t"                                       \
+         "stw 3,20(1)\n\t"                                        \
+         /* arg11 -> 16(r1) */                                    \
+         "lwz 3,44(11)\n\t"                                       \
+         "stw 3,16(1)\n\t"                                        \
+         /* arg10 -> 12(r1) */                                    \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,12(1)\n\t"                                        \
+         /* arg9 -> 8(r1) */                                      \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"                                         \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"  /* arg5->r7 */                       \
+         "lwz 8,24(11)\n\t"  /* arg6->r8 */                       \
+         "lwz 9,28(11)\n\t"  /* arg7->r9 */                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,32\n\t"   /* r1 += 32: undo adjustment */      \
+         "mr %0,3"           /* _res = r3 */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#endif /* PLAT_ppc32_linux */
+
+/* ------------------------ ppc64-linux ------------------------ */
+
+#if defined(PLAT_ppc64_linux)
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* Regs trashed by the hidden call; used in the asm clobber lists. */
+#define __CALLER_SAVED_REGS                                       \
+   "lr", "ctr", "xer",   /* special-purpose registers */          \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13"   /* note: r1 is not in this list */
+
+/* These CALL_FN_ macros assume that on ppc64-linux, sizeof(unsigned
+   long) == 8. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do { /* call _orig.nraddr, no args; result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+0];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1] = (unsigned long)_orig.r2; /* target tocptr */   \
+      _argvec[2] = (unsigned long)_orig.nraddr; /* target */      \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload r11 = &_argvec[2] */      \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do { /* call _orig.nraddr with 1 word arg; result -> lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+1];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2; /* target tocptr */ \
+      _argvec[2]   = (unsigned long)_orig.nraddr; /* target */    \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload r11 = &_argvec[2] */      \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do { /* call _orig.nraddr with 2 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+2];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2; /* target tocptr */ \
+      _argvec[2]   = (unsigned long)_orig.nraddr; /* target */    \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload r11 = &_argvec[2] */      \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do { /* call _orig.nraddr with 3 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+3];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2; /* target tocptr */ \
+      _argvec[2]   = (unsigned long)_orig.nraddr; /* target */    \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload r11 = &_argvec[2] */      \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do { /* call _orig.nraddr with 4 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+4];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2; /* target tocptr */ \
+      _argvec[2]   = (unsigned long)_orig.nraddr; /* target */    \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload r11 = &_argvec[2] */      \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do { /* call _orig.nraddr with 5 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+5];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2; /* target tocptr */ \
+      _argvec[2]   = (unsigned long)_orig.nraddr; /* target */    \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload r11 = &_argvec[2] */      \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* call _orig.nraddr with 6 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+6];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2; /* target tocptr */ \
+      _argvec[2]   = (unsigned long)_orig.nraddr; /* target */    \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload r11 = &_argvec[2] */      \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do { /* call _orig.nraddr with 7 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+7];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2; /* target tocptr */ \
+      _argvec[2]   = (unsigned long)_orig.nraddr; /* target */    \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload r11 = &_argvec[2] */      \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do { /* call _orig.nraddr with 8 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+8];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2; /* target tocptr */ \
+      _argvec[2]   = (unsigned long)_orig.nraddr; /* target */    \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload r11 = &_argvec[2] */      \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do { /* call _orig.nraddr with 9 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+9];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2; /* target tocptr */ \
+      _argvec[2]   = (unsigned long)_orig.nraddr; /* target */    \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-128\n\t"  /* expand stack frame */            \
+         /* arg9 -> 112(r1) */                                    \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* reload r11 = &_argvec[2] */      \
+         "mr %0,3\n\t"        /* _res = r3 */                     \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,128"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* cast result to lval */   \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do { /* call _orig.nraddr with 10 word args; result -> lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+10];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1); /* cast whole expr */ \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      _argvec[2+3] = (unsigned long)(arg3);                       \
+      _argvec[2+4] = (unsigned long)(arg4);                       \
+      _argvec[2+5] = (unsigned long)(arg5);                       \
+      _argvec[2+6] = (unsigned long)(arg6);                       \
+      _argvec[2+7] = (unsigned long)(arg7);                       \
+      _argvec[2+8] = (unsigned long)(arg8);                       \
+      _argvec[2+9] = (unsigned long)(arg9);                       \
+      _argvec[2+10] = (unsigned long)(arg10);                     \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"  /* r11 = &_argvec[2] */                  \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-128\n\t"  /* expand stack frame */            \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"  /* r11 = &_argvec[2] again */            \
+         "mr %0,3\n\t"   /* result r3 -> _res */                  \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,128"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do { /* call _orig.nraddr with 11 word args; result -> lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+11];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1); /* cast whole expr */ \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      _argvec[2+3] = (unsigned long)(arg3);                       \
+      _argvec[2+4] = (unsigned long)(arg4);                       \
+      _argvec[2+5] = (unsigned long)(arg5);                       \
+      _argvec[2+6] = (unsigned long)(arg6);                       \
+      _argvec[2+7] = (unsigned long)(arg7);                       \
+      _argvec[2+8] = (unsigned long)(arg8);                       \
+      _argvec[2+9] = (unsigned long)(arg9);                       \
+      _argvec[2+10] = (unsigned long)(arg10);                     \
+      _argvec[2+11] = (unsigned long)(arg11);                     \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"  /* r11 = &_argvec[2] */                  \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-144\n\t"  /* expand stack frame */            \
+         /* arg11 */                                              \
+         "ld  3,88(11)\n\t"                                       \
+         "std 3,128(1)\n\t"                                       \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"  /* r11 = &_argvec[2] again */            \
+         "mr %0,3\n\t"   /* result r3 -> _res */                  \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,144"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do { /* call _orig.nraddr with 12 word args; result -> lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+12];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1); /* cast whole expr */ \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      _argvec[2+3] = (unsigned long)(arg3);                       \
+      _argvec[2+4] = (unsigned long)(arg4);                       \
+      _argvec[2+5] = (unsigned long)(arg5);                       \
+      _argvec[2+6] = (unsigned long)(arg6);                       \
+      _argvec[2+7] = (unsigned long)(arg7);                       \
+      _argvec[2+8] = (unsigned long)(arg8);                       \
+      _argvec[2+9] = (unsigned long)(arg9);                       \
+      _argvec[2+10] = (unsigned long)(arg10);                     \
+      _argvec[2+11] = (unsigned long)(arg11);                     \
+      _argvec[2+12] = (unsigned long)(arg12);                     \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"  /* r11 = &_argvec[2] */                  \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-144\n\t"  /* expand stack frame */            \
+         /* arg12 */                                              \
+         "ld  3,96(11)\n\t"                                       \
+         "std 3,136(1)\n\t"                                       \
+         /* arg11 */                                              \
+         "ld  3,88(11)\n\t"                                       \
+         "std 3,128(1)\n\t"                                       \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"                                       \
+         "std 3,120(1)\n\t"                                       \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"                                       \
+         "std 3,112(1)\n\t"                                       \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"  /* r11 = &_argvec[2] again */            \
+         "mr %0,3\n\t"   /* result r3 -> _res */                  \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,144"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_ppc64_linux */
+
+/* ------------------------- arm-linux ------------------------- */
+
+#if defined(PLAT_arm_linux)
+
+/* Trashed by the hidden call: r0-r3, r4 (target), r12 (ip), r14 (lr). */
+#define __CALLER_SAVED_REGS "r0", "r1", "r2", "r3","r4","r12","r14"
+
+/* These CALL_FN_ macros assume that on arm-linux, sizeof(unsigned
+   long) == 4. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do { /* call _orig.nraddr with no args; result -> lval */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "mov %0, r0\n"  /* result r0 -> _res */                  \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])  /* same reg as %0 */      \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do { /* call _orig.nraddr with 1 word arg; result -> lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #4] \n\t"  /* arg1->r0 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "mov %0, r0\n"  /* result r0 -> _res */                  \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])  /* same reg as %0 */      \
+         : /*trash*/ "cc", "memory",  __CALLER_SAVED_REGS         \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do { /* call _orig.nraddr with 2 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #4] \n\t"  /* arg1->r0 */                  \
+         "ldr r1, [%1, #8] \n\t"  /* arg2->r1 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "mov %0, r0\n"  /* result r0 -> _res */                  \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])  /* same reg as %0 */      \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do { /* call _orig.nraddr with 3 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #4] \n\t"  /* arg1->r0 */                  \
+         "ldr r1, [%1, #8] \n\t"  /* arg2->r1 */                  \
+         "ldr r2, [%1, #12] \n\t" /* arg3->r2 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "mov %0, r0\n"  /* result r0 -> _res */                  \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])  /* same reg as %0 */      \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do { /* call _orig.nraddr with 4 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #4] \n\t"  /* arg1->r0 */                  \
+         "ldr r1, [%1, #8] \n\t"  /* arg2->r1 */                  \
+         "ldr r2, [%1, #12] \n\t" /* arg3->r2 */                  \
+         "ldr r3, [%1, #16] \n\t" /* arg4->r3 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "mov %0, r0"  /* result r0 -> _res */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])  /* same reg as %0 */      \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do { /* call _orig.nraddr with 5 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile( /* AAPCS: sp 8-byte aligned at calls */   \
+         "ldr r0, [%1, #20] \n\t" /* arg5 */                      \
+         "sub sp, sp, #4 \n\t" "push {r0} \n\t" /* pad, arg5 */   \
+         "ldr r0, [%1, #4] \n\t"  /* arg1->r0 */                  \
+         "ldr r1, [%1, #8] \n\t"  /* arg2->r1 */                  \
+         "ldr r2, [%1, #12] \n\t" /* arg3->r2 */                  \
+         "ldr r3, [%1, #16] \n\t" /* arg4->r3 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #8 \n\t"  /* drop arg5 + pad */             \
+         "mov %0, r0"  /* result r0 -> _res */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])  /* same reg as %0 */      \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      ); /* assumes sp was 8-byte aligned on entry to the asm */  \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* call _orig.nraddr with 6 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #20] \n\t" /* arg5 */                      \
+         "ldr r1, [%1, #24] \n\t" /* arg6 */                      \
+         "push {r0, r1} \n\t"  /* arg5,arg6 -> stack */           \
+         "ldr r0, [%1, #4] \n\t"  /* arg1->r0 */                  \
+         "ldr r1, [%1, #8] \n\t"  /* arg2->r1 */                  \
+         "ldr r2, [%1, #12] \n\t" /* arg3->r2 */                  \
+         "ldr r3, [%1, #16] \n\t" /* arg4->r3 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #8 \n\t"  /* drop stack args */             \
+         "mov %0, r0"  /* result r0 -> _res */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])  /* same reg as %0 */      \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do { /* call _orig.nraddr with 7 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile( /* AAPCS: sp 8-byte aligned at calls */   \
+         "ldr r0, [%1, #20] \n\t" /* arg5 */                      \
+         "ldr r1, [%1, #24] \n\t" /* arg6 */                      \
+         "ldr r2, [%1, #28] \n\t" /* arg7 */                      \
+         "sub sp, sp, #4 \n\t" "push {r0, r1, r2} \n\t" /* pad */ \
+         "ldr r0, [%1, #4] \n\t"  /* arg1->r0 */                  \
+         "ldr r1, [%1, #8] \n\t"  /* arg2->r1 */                  \
+         "ldr r2, [%1, #12] \n\t" /* arg3->r2 */                  \
+         "ldr r3, [%1, #16] \n\t" /* arg4->r3 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #16 \n\t" /* drop args + pad */             \
+         "mov %0, r0"  /* result r0 -> _res */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])  /* same reg as %0 */      \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      ); /* assumes sp was 8-byte aligned on entry to the asm */  \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do { /* call _orig.nraddr with 8 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #20] \n\t" /* arg5 */                      \
+         "ldr r1, [%1, #24] \n\t" /* arg6 */                      \
+         "ldr r2, [%1, #28] \n\t" /* arg7 */                      \
+         "ldr r3, [%1, #32] \n\t" /* arg8 */                      \
+         "push {r0, r1, r2, r3} \n\t"  /* arg5..arg8 -> stack */  \
+         "ldr r0, [%1, #4] \n\t"  /* arg1->r0 */                  \
+         "ldr r1, [%1, #8] \n\t"  /* arg2->r1 */                  \
+         "ldr r2, [%1, #12] \n\t" /* arg3->r2 */                  \
+         "ldr r3, [%1, #16] \n\t" /* arg4->r3 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #16 \n\t" /* drop stack args */             \
+         "mov %0, r0"  /* result r0 -> _res */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])  /* same reg as %0 */      \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do { /* call _orig.nraddr with 9 word args; result -> lval */  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile( /* AAPCS: sp 8-byte aligned at calls */   \
+         "ldr r0, [%1, #20] \n\t" /* arg5 */                      \
+         "ldr r1, [%1, #24] \n\t" /* arg6 */                      \
+         "ldr r2, [%1, #28] \n\t" /* arg7 */                      \
+         "ldr r3, [%1, #32] \n\t" /* arg8 */                      \
+         "ldr r4, [%1, #36] \n\t" /* arg9; pad word, then push */ \
+         "sub sp, sp, #4 \n\t" "push {r0, r1, r2, r3, r4} \n\t"   \
+         "ldr r0, [%1, #4] \n\t"  /* arg1->r0 */                  \
+         "ldr r1, [%1, #8] \n\t"  /* arg2->r1 */                  \
+         "ldr r2, [%1, #12] \n\t" /* arg3->r2 */                  \
+         "ldr r3, [%1, #16] \n\t" /* arg4->r3 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #24 \n\t" /* drop args + pad */             \
+         "mov %0, r0"  /* result r0 -> _res */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])  /* same reg as %0 */      \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      ); /* assumes sp was 8-byte aligned on entry to the asm */  \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do { /* call _orig.nraddr with 10 word args; result -> lval */ \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #40] \n\t" /* arg10 */                     \
+         "push {r0} \n\t"  /* arg10 -> stack */                   \
+         "ldr r0, [%1, #20] \n\t" /* arg5 */                      \
+         "ldr r1, [%1, #24] \n\t" /* arg6 */                      \
+         "ldr r2, [%1, #28] \n\t" /* arg7 */                      \
+         "ldr r3, [%1, #32] \n\t" /* arg8 */                      \
+         "ldr r4, [%1, #36] \n\t" /* arg9 */                      \
+         "push {r0, r1, r2, r3, r4} \n\t"  /* arg5..arg9 */       \
+         "ldr r0, [%1, #4] \n\t"  /* arg1->r0 */                  \
+         "ldr r1, [%1, #8] \n\t"  /* arg2->r1 */                  \
+         "ldr r2, [%1, #12] \n\t" /* arg3->r2 */                  \
+         "ldr r3, [%1, #16] \n\t" /* arg4->r3 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #24 \n\t" /* drop stack args */             \
+         "mov %0, r0"  /* result r0 -> _res */                    \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])  /* same reg as %0 */      \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,       \
+                                  arg6,arg7,arg8,arg9,arg10,      \
+                                  arg11)                          \
+   do { /* arm-linux: call orig with 11 word args; res->lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12]; /* target + 11 args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      __asm__ volatile(                                           \
+         "sub sp, sp, #4 \n\t"  /* pad: keep sp 8-aligned */      \
+         "ldr r0, [%1, #40] \n\t" "ldr r1, [%1, #44] \n\t"        \
+         "push {r0, r1} \n\t"  /* args 10-11 */                   \
+         "ldr r0, [%1, #20] \n\t"  /* arg5 */                     \
+         "ldr r1, [%1, #24] \n\t"  /* arg6 */                     \
+         "ldr r2, [%1, #28] \n\t"  /* arg7 */                     \
+         "ldr r3, [%1, #32] \n\t"  /* arg8 */                     \
+         "ldr r4, [%1, #36] \n\t"  /* arg9 */                     \
+         "push {r0, r1, r2, r3, r4} \n\t"  /* args 5-9 */         \
+         "ldr r0, [%1, #4] \n\t"  /* arg1 */                      \
+         "ldr r1, [%1, #8] \n\t"  /* arg2 */                      \
+         "ldr r2, [%1, #12] \n\t"  /* arg3 */                     \
+         "ldr r3, [%1, #16] \n\t"  /* arg4 */                     \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #32 \n\t"  /* pop 7 args + 1 pad word */    \
+         "mov %0, r0"  /* result */                               \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory",__CALLER_SAVED_REGS           \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,       \
+                                  arg6,arg7,arg8,arg9,arg10,      \
+                                  arg11,arg12)                    \
+   do { /* arm-linux: call orig with 12 word args; res->lval */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13]; /* target + 12 args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      _argvec[12] = (unsigned long)(arg12);                       \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #40] \n\t"  /* arg10 */                    \
+         "ldr r1, [%1, #44] \n\t"  /* arg11 */                    \
+         "ldr r2, [%1, #48] \n\t"  /* arg12 */                    \
+         "push {r0, r1, r2} \n\t"  /* args 10-12 */               \
+         "ldr r0, [%1, #20] \n\t"  /* arg5 */                     \
+         "ldr r1, [%1, #24] \n\t"  /* arg6 */                     \
+         "ldr r2, [%1, #28] \n\t"  /* arg7 */                     \
+         "ldr r3, [%1, #32] \n\t"  /* arg8 */                     \
+         "ldr r4, [%1, #36] \n\t"  /* arg9 */                     \
+         "push {r0, r1, r2, r3, r4} \n\t"  /* args 5-9 */         \
+         "ldr r0, [%1, #4] \n\t"  /* arg1 */                      \
+         "ldr r1, [%1, #8] \n\t"  /* arg2 */                      \
+         "ldr r2, [%1, #12] \n\t"  /* arg3 */                     \
+         "ldr r3, [%1, #16] \n\t"  /* arg4 */                     \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #32 \n\t"  /* pop 8 stack words */          \
+         "mov %0, r0"  /* result */                               \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_arm_linux */
+
+/* ------------------------- s390x-linux ------------------------- */
+
+#if defined(PLAT_s390x_linux)
+
+/* Similar workaround as amd64 (see above), but we use r11 as frame
+   pointer and save the old r11 in r7. r11 might be used for
+   argvec, therefore we copy argvec in r1 since r1 is clobbered
+   after the call anyway.  */
+#if defined(__GNUC__) && defined(__GCC_HAVE_DWARF2_CFI_ASM)
+#  define __FRAME_POINTER                                         \
+      ,"d"(__builtin_dwarf_cfa())  /* extra asm input, used as %2 */
+#  define VALGRIND_CFI_PROLOGUE                                   \
+      ".cfi_remember_state\n\t"                                   \
+      "lgr 1,%1\n\t" /* copy the argvec pointer in r1 */          \
+      "lgr 7,11\n\t"  /* save old r11 in r7 */                    \
+      "lgr 11,%2\n\t"  /* r11 = CFA (input %2) */                 \
+      ".cfi_def_cfa r11, 0\n\t"
+#  define VALGRIND_CFI_EPILOGUE                                   \
+      "lgr 11, 7\n\t"  /* restore r11 from r7 */                  \
+      ".cfi_restore_state\n\t"
+#else  /* no CFI asm support: only copy the argvec pointer to r1 */
+#  define __FRAME_POINTER
+#  define VALGRIND_CFI_PROLOGUE                                   \
+      "lgr 1,%1\n\t"
+#  define VALGRIND_CFI_EPILOGUE
+#endif /* __GNUC__ && __GCC_HAVE_DWARF2_CFI_ASM */
+
+
+
+
+/* These regs are trashed by the hidden call. Note that we overwrite
+   r14 in s390_irgen_noredir (VEX/priv/guest_s390_irgen.c) to give the
+   function a proper return address. All others are ABI defined call
+   clobbers. r6/r7 are added by the individual macros that use them. */
+#define __CALLER_SAVED_REGS "0","1","2","3","4","5","14", \
+                           "f0","f1","f2","f3","f4","f5","f6","f7"
+
+
+#define CALL_FN_W_v(lval, orig)                                  \
+   do { /* s390x: call orig(), store result in lval */           \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long  _argvec[1];                        \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE  /* argvec->r1 */                 \
+         "aghi 15,-160\n\t"  /* allocate stack */                \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "aghi 15,160\n\t"  /* release stack */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         "lgr %0, 2\n\t"  /* after epilogue: %0 may be r11 */    \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "d" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7"     \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+/* The call ABI passes arguments in r2-r6 and then on the stack. */
+#define CALL_FN_W_W(lval, orig, arg1)                            \
+   do { /* s390x: call orig with 1 word arg; result in lval */   \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[2];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)(arg1);                        \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE  /* argvec->r1 */                 \
+         "aghi 15,-160\n\t"  /* allocate stack */                \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "aghi 15,160\n\t"  /* release stack */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         "lgr %0, 2\n\t"  /* after epilogue: %0 may be r11 */    \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7"     \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1, arg2)                     \
+   do { /* s390x: call orig with 2 word args; result in lval */  \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[3];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)(arg1);                        \
+      _argvec[2] = (unsigned long)(arg2);                        \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE  /* argvec->r1 */                 \
+         "aghi 15,-160\n\t"  /* allocate stack */                \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "aghi 15,160\n\t"  /* release stack */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         "lgr %0, 2\n\t"  /* after epilogue: %0 may be r11 */    \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7"     \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1, arg2, arg3)              \
+   do { /* s390x: call orig with 3 word args; result in lval */  \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[4];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)(arg1);                        \
+      _argvec[2] = (unsigned long)(arg2);                        \
+      _argvec[3] = (unsigned long)(arg3);                        \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE  /* argvec->r1 */                 \
+         "aghi 15,-160\n\t"  /* allocate stack */                \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "aghi 15,160\n\t"  /* release stack */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         "lgr %0, 2\n\t"  /* after epilogue: %0 may be r11 */    \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7"     \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1, arg2, arg3, arg4)       \
+   do { /* s390x: call orig with 4 word args; result in lval */  \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[5];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)(arg1);                        \
+      _argvec[2] = (unsigned long)(arg2);                        \
+      _argvec[3] = (unsigned long)(arg3);                        \
+      _argvec[4] = (unsigned long)(arg4);                        \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE  /* argvec->r1 */                 \
+         "aghi 15,-160\n\t"  /* allocate stack */                \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 5,32(1)\n\t"  /* arg4->r5 */                        \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "aghi 15,160\n\t"  /* release stack */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         "lgr %0, 2\n\t"  /* after epilogue: %0 may be r11 */    \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7"     \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1, arg2, arg3, arg4, arg5)   \
+   do { /* s390x: call orig with 5 word args; result in lval */  \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[6];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)(arg1);                        \
+      _argvec[2] = (unsigned long)(arg2);                        \
+      _argvec[3] = (unsigned long)(arg3);                        \
+      _argvec[4] = (unsigned long)(arg4);                        \
+      _argvec[5] = (unsigned long)(arg5);                        \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE  /* argvec->r1 */                 \
+         "aghi 15,-160\n\t"  /* allocate stack */                \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 5,32(1)\n\t"  /* arg4->r5 */                        \
+         "lg 6,40(1)\n\t"  /* arg5->r6 */                        \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "aghi 15,160\n\t"  /* release stack */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         "lgr %0, 2\n\t"  /* after epilogue: %0 may be r11 */    \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1, arg2, arg3, arg4, arg5,   \
+                     arg6)                                       \
+   do { /* s390x: call orig with 6 word args; result in lval */  \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[7];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)(arg1);                        \
+      _argvec[2] = (unsigned long)(arg2);                        \
+      _argvec[3] = (unsigned long)(arg3);                        \
+      _argvec[4] = (unsigned long)(arg4);                        \
+      _argvec[5] = (unsigned long)(arg5);                        \
+      _argvec[6] = (unsigned long)(arg6);                        \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE  /* argvec->r1 */                 \
+         "aghi 15,-168\n\t"  /* 160 + 1*8 (stack args) */        \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 5,32(1)\n\t"  /* arg4->r5 */                        \
+         "lg 6,40(1)\n\t"  /* arg5->r6 */                        \
+         "mvc 160(8,15), 48(1)\n\t"  /* arg6->stack */           \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "aghi 15,168\n\t"  /* release stack */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         "lgr %0, 2\n\t"  /* after epilogue: %0 may be r11 */    \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1, arg2, arg3, arg4, arg5,   \
+                     arg6, arg7)                                 \
+   do { /* s390x: call orig with 7 word args; result in lval */  \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[8];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)(arg1);                        \
+      _argvec[2] = (unsigned long)(arg2);                        \
+      _argvec[3] = (unsigned long)(arg3);                        \
+      _argvec[4] = (unsigned long)(arg4);                        \
+      _argvec[5] = (unsigned long)(arg5);                        \
+      _argvec[6] = (unsigned long)(arg6);                        \
+      _argvec[7] = (unsigned long)(arg7);                        \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE  /* argvec->r1 */                 \
+         "aghi 15,-176\n\t"  /* 160 + 2*8 (stack args) */        \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 5,32(1)\n\t"  /* arg4->r5 */                        \
+         "lg 6,40(1)\n\t"  /* arg5->r6 */                        \
+         "mvc 160(8,15), 48(1)\n\t"  /* arg6->stack */           \
+         "mvc 168(8,15), 56(1)\n\t"  /* arg7->stack */           \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "aghi 15,176\n\t"  /* release stack */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         "lgr %0, 2\n\t"  /* after epilogue: %0 may be r11 */    \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1, arg2, arg3, arg4, arg5,   \
+                     arg6, arg7 ,arg8)                           \
+   do { /* s390x: call orig with 8 word args; result in lval */  \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[9];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)(arg1);                        \
+      _argvec[2] = (unsigned long)(arg2);                        \
+      _argvec[3] = (unsigned long)(arg3);                        \
+      _argvec[4] = (unsigned long)(arg4);                        \
+      _argvec[5] = (unsigned long)(arg5);                        \
+      _argvec[6] = (unsigned long)(arg6);                        \
+      _argvec[7] = (unsigned long)(arg7);                        \
+      _argvec[8] = (unsigned long)(arg8);                        \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE  /* argvec->r1 */                 \
+         "aghi 15,-184\n\t"  /* 160 + 3*8 (stack args) */        \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 5,32(1)\n\t"  /* arg4->r5 */                        \
+         "lg 6,40(1)\n\t"  /* arg5->r6 */                        \
+         "mvc 160(8,15), 48(1)\n\t"  /* arg6->stack */           \
+         "mvc 168(8,15), 56(1)\n\t"  /* arg7->stack */           \
+         "mvc 176(8,15), 64(1)\n\t"  /* arg8->stack */           \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "aghi 15,184\n\t"  /* release stack */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         "lgr %0, 2\n\t"  /* after epilogue: %0 may be r11 */    \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1, arg2, arg3, arg4, arg5,   \
+                     arg6, arg7 ,arg8, arg9)                     \
+   do { /* s390x: call orig with 9 word args; result in lval */  \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[10];                        \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)(arg1);                        \
+      _argvec[2] = (unsigned long)(arg2);                        \
+      _argvec[3] = (unsigned long)(arg3);                        \
+      _argvec[4] = (unsigned long)(arg4);                        \
+      _argvec[5] = (unsigned long)(arg5);                        \
+      _argvec[6] = (unsigned long)(arg6);                        \
+      _argvec[7] = (unsigned long)(arg7);                        \
+      _argvec[8] = (unsigned long)(arg8);                        \
+      _argvec[9] = (unsigned long)(arg9);                        \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE  /* argvec->r1 */                 \
+         "aghi 15,-192\n\t"  /* 160 + 4*8 (stack args) */        \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 5,32(1)\n\t"  /* arg4->r5 */                        \
+         "lg 6,40(1)\n\t"  /* arg5->r6 */                        \
+         "mvc 160(8,15), 48(1)\n\t"  /* arg6->stack */           \
+         "mvc 168(8,15), 56(1)\n\t"  /* arg7->stack */           \
+         "mvc 176(8,15), 64(1)\n\t"  /* arg8->stack */           \
+         "mvc 184(8,15), 72(1)\n\t"  /* arg9->stack */           \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "aghi 15,192\n\t"  /* release stack */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         "lgr %0, 2\n\t"  /* after epilogue: %0 may be r11 */    \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1, arg2, arg3, arg4, arg5,  \
+                     arg6, arg7 ,arg8, arg9, arg10)              \
+   do { /* s390x: call orig with 10 word args; result in lval */ \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[11];                        \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)(arg1);                        \
+      _argvec[2] = (unsigned long)(arg2);                        \
+      _argvec[3] = (unsigned long)(arg3);                        \
+      _argvec[4] = (unsigned long)(arg4);                        \
+      _argvec[5] = (unsigned long)(arg5);                        \
+      _argvec[6] = (unsigned long)(arg6);                        \
+      _argvec[7] = (unsigned long)(arg7);                        \
+      _argvec[8] = (unsigned long)(arg8);                        \
+      _argvec[9] = (unsigned long)(arg9);                        \
+      _argvec[10] = (unsigned long)(arg10);                      \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE  /* argvec->r1 */                 \
+         "aghi 15,-200\n\t"  /* 160 + 5*8 (stack args) */        \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 5,32(1)\n\t"  /* arg4->r5 */                        \
+         "lg 6,40(1)\n\t"  /* arg5->r6 */                        \
+         "mvc 160(8,15), 48(1)\n\t"  /* arg6->stack */           \
+         "mvc 168(8,15), 56(1)\n\t"  /* arg7->stack */           \
+         "mvc 176(8,15), 64(1)\n\t"  /* arg8->stack */           \
+         "mvc 184(8,15), 72(1)\n\t"  /* arg9->stack */           \
+         "mvc 192(8,15), 80(1)\n\t"  /* arg10->stack */          \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "aghi 15,200\n\t"  /* release stack */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         "lgr %0, 2\n\t"  /* after epilogue: %0 may be r11 */    \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1, arg2, arg3, arg4, arg5,  \
+                     arg6, arg7 ,arg8, arg9, arg10, arg11)       \
+   do { /* s390x: call orig with 11 word args; result in lval */ \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[12];                        \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)(arg1);                        \
+      _argvec[2] = (unsigned long)(arg2);                        \
+      _argvec[3] = (unsigned long)(arg3);                        \
+      _argvec[4] = (unsigned long)(arg4);                        \
+      _argvec[5] = (unsigned long)(arg5);                        \
+      _argvec[6] = (unsigned long)(arg6);                        \
+      _argvec[7] = (unsigned long)(arg7);                        \
+      _argvec[8] = (unsigned long)(arg8);                        \
+      _argvec[9] = (unsigned long)(arg9);                        \
+      _argvec[10] = (unsigned long)(arg10);                      \
+      _argvec[11] = (unsigned long)(arg11);                      \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE  /* argvec->r1 */                 \
+         "aghi 15,-208\n\t"  /* 160 + 6*8 (stack args) */        \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 5,32(1)\n\t"  /* arg4->r5 */                        \
+         "lg 6,40(1)\n\t"  /* arg5->r6 */                        \
+         "mvc 160(8,15), 48(1)\n\t"  /* arg6->stack */           \
+         "mvc 168(8,15), 56(1)\n\t"  /* arg7->stack */           \
+         "mvc 176(8,15), 64(1)\n\t"  /* arg8->stack */           \
+         "mvc 184(8,15), 72(1)\n\t"  /* arg9->stack */           \
+         "mvc 192(8,15), 80(1)\n\t"  /* arg10->stack */          \
+         "mvc 200(8,15), 88(1)\n\t"  /* arg11->stack */          \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "aghi 15,208\n\t"  /* release stack */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         "lgr %0, 2\n\t"  /* after epilogue: %0 may be r11 */    \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1, arg2, arg3, arg4, arg5,  \
+                     arg6, arg7 ,arg8, arg9, arg10, arg11, arg12)\
+   do {                                                          \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[13]; /* fn addr, 12 args */ \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)arg1;                          \
+      _argvec[2] = (unsigned long)arg2;                          \
+      _argvec[3] = (unsigned long)arg3;                          \
+      _argvec[4] = (unsigned long)arg4;                          \
+      _argvec[5] = (unsigned long)arg5;                          \
+      _argvec[6] = (unsigned long)arg6;                          \
+      _argvec[7] = (unsigned long)arg7;                          \
+      _argvec[8] = (unsigned long)arg8;                          \
+      _argvec[9] = (unsigned long)arg9;                          \
+      _argvec[10] = (unsigned long)arg10;                        \
+      _argvec[11] = (unsigned long)arg11;                        \
+      _argvec[12] = (unsigned long)arg12;                        \
+      __asm__ volatile( /* assumes prologue sets r1=_argvec */   \
+         VALGRIND_CFI_PROLOGUE                                   \
+         "aghi 15,-216\n\t" /* r15 -= 216 */                     \
+         "lg 2, 8(1)\n\t" /* r2 = arg1 */                         \
+         "lg 3,16(1)\n\t" /* r3 = arg2 */                         \
+         "lg 4,24(1)\n\t" /* r4 = arg3 */                         \
+         "lg 5,32(1)\n\t" /* r5 = arg4 */                         \
+         "lg 6,40(1)\n\t" /* r6 = arg5 */                         \
+         "mvc 160(8,15), 48(1)\n\t" /* 160(r15) = arg6 */        \
+         "mvc 168(8,15), 56(1)\n\t" /* 168(r15) = arg7 */        \
+         "mvc 176(8,15), 64(1)\n\t" /* 176(r15) = arg8 */        \
+         "mvc 184(8,15), 72(1)\n\t" /* 184(r15) = arg9 */        \
+         "mvc 192(8,15), 80(1)\n\t" /* 192(r15) = arg10 */       \
+         "mvc 200(8,15), 88(1)\n\t" /* 200(r15) = arg11 */       \
+         "mvc 208(8,15), 96(1)\n\t" /* 208(r15) = arg12 */       \
+         "lg 1, 0(1)\n\t" /* r1 = _orig.nraddr */                 \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "lgr %0, 2\n\t" /* _res = r2 */                          \
+         "aghi 15,216\n\t" /* r15 += 216 */                      \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+
+#endif /* PLAT_s390x_linux */
+
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS.               */
+/*                                                                    */
+/* ------------------------------------------------------------------ */
+
+/* Some request codes.  There are many more of these, but most are not
+   exposed to end-user view.  These are the public ones, all of the
+   form 0x1000 + small_number.
+
+   Core ones are in the range 0x00000000--0x0000ffff.  The non-public
+   ones start at 0x2000.
+*/
+
+/* Used by tools: must be public, but don't embed in other programs.
+   Bytes are shifted as unsigned, so no overflow into int's sign bit. */
+#define VG_USERREQ_TOOL_BASE(a,b) \
+   ((unsigned int)((a)&0xff) << 24 | (unsigned int)((b)&0xff) << 16)
+#define VG_IS_TOOL_USERREQ(a, b, v) \
+   (VG_USERREQ_TOOL_BASE(a,b) == ((v) & 0xffff0000))
+
+/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !!
+   This enum comprises an ABI exported by Valgrind to programs
+   which use client requests.  DO NOT CHANGE THE ORDER OF THESE
+   ENTRIES, NOR DELETE ANY -- add new ones at the end. */
+typedef
+   enum { VG_USERREQ__RUNNING_ON_VALGRIND  = 0x1001,
+          VG_USERREQ__DISCARD_TRANSLATIONS = 0x1002,
+
+          /* These allow any function to be called from the simulated
+             CPU but run on the real CPU.  Nb: the first arg passed to
+             the function is always the ThreadId of the running
+             thread!  So CLIENT_CALL0 actually requires a 1 arg
+             function, etc. */
+          VG_USERREQ__CLIENT_CALL0 = 0x1101,
+          VG_USERREQ__CLIENT_CALL1 = 0x1102,
+          VG_USERREQ__CLIENT_CALL2 = 0x1103,
+          VG_USERREQ__CLIENT_CALL3 = 0x1104,
+
+          /* Can be useful in regression testing suites -- eg. can
+             send Valgrind's output to /dev/null and still count
+             errors. */
+          VG_USERREQ__COUNT_ERRORS = 0x1201,
+
+          /* Allows a string (gdb monitor command) to be passed to the
+             tool.  Used for interaction with vgdb/gdb. */
+          VG_USERREQ__GDB_MONITOR_COMMAND = 0x1202,
+
+          /* These are useful and can be interpreted by any tool that
+             tracks malloc() et al, by using vg_replace_malloc.c. */
+          VG_USERREQ__MALLOCLIKE_BLOCK = 0x1301,
+          VG_USERREQ__RESIZEINPLACE_BLOCK = 0x130b, /* not in numeric order */
+          VG_USERREQ__FREELIKE_BLOCK   = 0x1302,
+          /* Memory pool support. */
+          VG_USERREQ__CREATE_MEMPOOL   = 0x1303,
+          VG_USERREQ__DESTROY_MEMPOOL  = 0x1304,
+          VG_USERREQ__MEMPOOL_ALLOC    = 0x1305,
+          VG_USERREQ__MEMPOOL_FREE     = 0x1306,
+          VG_USERREQ__MEMPOOL_TRIM     = 0x1307,
+          VG_USERREQ__MOVE_MEMPOOL     = 0x1308,
+          VG_USERREQ__MEMPOOL_CHANGE   = 0x1309,
+          VG_USERREQ__MEMPOOL_EXISTS   = 0x130a,
+
+          /* Allow printfs to valgrind log. */
+          /* The first two pass the va_list argument by value, which
+             assumes it is the same size as or smaller than a UWord,
+             which generally isn't the case.  Hence they are deprecated.
+             The second two pass the vargs by reference and so are
+             immune to this problem. */
+          /* both :: char* fmt, va_list vargs (DEPRECATED) */
+          VG_USERREQ__PRINTF           = 0x1401,
+          VG_USERREQ__PRINTF_BACKTRACE = 0x1402,
+          /* both :: char* fmt, va_list* vargs */
+          VG_USERREQ__PRINTF_VALIST_BY_REF = 0x1403,
+          VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF = 0x1404,
+
+          /* Stack support. */
+          VG_USERREQ__STACK_REGISTER   = 0x1501,
+          VG_USERREQ__STACK_DEREGISTER = 0x1502,
+          VG_USERREQ__STACK_CHANGE     = 0x1503,
+
+          /* Wine support */
+          VG_USERREQ__LOAD_PDB_DEBUGINFO = 0x1601,
+
+          /* Querying of debug info. */
+          VG_USERREQ__MAP_IP_TO_SRCLOC = 0x1701,
+
+          /* Disable/enable error reporting level.  Takes a single
+             Word arg which is the delta to this thread's error
+             disablement indicator.  Hence 1 disables or further
+             disables errors, and -1 moves back towards enablement.
+             Other values are not allowed. */
+          VG_USERREQ__CHANGE_ERR_DISABLEMENT = 0x1801
+   } Vg_ClientRequest;
+
+#if !defined(__GNUC__) /* __GNUC__ is not defined by this compiler */
+#  define __extension__ /* expands to nothing */
+#endif
+
+
+/* Returns the number of Valgrinds this code is running under.  That
+   is, 0 if running natively, 1 if running under Valgrind, 2 if
+   running under Valgrind which is running under another Valgrind,
+   etc.  (The last macro line deliberately has no continuation.) */
+#define RUNNING_ON_VALGRIND                                           \
+    (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* if not */,         \
+                                    VG_USERREQ__RUNNING_ON_VALGRIND,  \
+                                    0, 0, 0, 0, 0)
+
+
+/* Discard translations of code in the range [_qzz_addr .. _qzz_addr +
+   _qzz_len - 1].  Useful if you are debugging a JIT compiler or similar,
+   since it provides a way to make sure Valgrind will retranslate the
+   invalidated area.  Returns no value. */
+#define VALGRIND_DISCARD_TRANSLATIONS(_qzz_addr,_qzz_len)              \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DISCARD_TRANSLATIONS,  \
+                                    _qzz_addr, _qzz_len, 0, 0, 0)
+
+
+/* These requests ask Valgrind itself to print a printf-style message,
+   possibly with a backtrace.  This is a really ugly hack.  The return value
+   is the number of characters printed, excluding the "**<pid>** " part at the
+   start and the backtrace (if present). */
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+/* Modern GCC will optimize the static routine out if unused,
+   and the unused attribute suppresses warnings about it.  */
+static int VALGRIND_PRINTF(const char *format, ...)
+   __attribute__((format(__printf__, 1, 2), __unused__));
+#endif
+static int
+#if defined(_MSC_VER)
+__inline
+#endif
+VALGRIND_PRINTF(const char *format, ...)
+{
+#if defined(NVALGRIND)
+   return 0; /* NVALGRIND build: no request is issued */
+#else /* NVALGRIND */
+#if defined(_MSC_VER)
+   uintptr_t _qzz_res;
+#else
+   unsigned long _qzz_res;
+#endif
+   va_list vargs; /* handed to the request by address, not by value */
+   va_start(vargs, format);
+#if defined(_MSC_VER)
+   _qzz_res = VALGRIND_DO_CLIENT_REQUEST_EXPR(0,
+                              VG_USERREQ__PRINTF_VALIST_BY_REF,
+                              (uintptr_t)format,
+                              (uintptr_t)&vargs,
+                              0, 0, 0);
+#else
+   _qzz_res = VALGRIND_DO_CLIENT_REQUEST_EXPR(0,
+                              VG_USERREQ__PRINTF_VALIST_BY_REF,
+                              (unsigned long)format,
+                              (unsigned long)&vargs, 
+                              0, 0, 0);
+#endif
+   va_end(vargs);
+   return (int)_qzz_res; /* request result, narrowed to int */
+#endif /* NVALGRIND */
+}
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+static int VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+   __attribute__((format(__printf__, 1, 2), __unused__));
+#endif
+static int
+#if defined(_MSC_VER)
+__inline
+#endif
+VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+{
+#if defined(NVALGRIND)
+   return 0; /* NVALGRIND build: no request is issued */
+#else /* NVALGRIND */
+#if defined(_MSC_VER)
+   uintptr_t _qzz_res;
+#else
+   unsigned long _qzz_res;
+#endif
+   va_list vargs; /* handed to the request by address, not by value */
+   va_start(vargs, format);
+#if defined(_MSC_VER)
+   _qzz_res = VALGRIND_DO_CLIENT_REQUEST_EXPR(0,
+                              VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF,
+                              (uintptr_t)format,
+                              (uintptr_t)&vargs,
+                              0, 0, 0);
+#else
+   _qzz_res = VALGRIND_DO_CLIENT_REQUEST_EXPR(0,
+                              VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF,
+                              (unsigned long)format,
+                              (unsigned long)&vargs, 
+                              0, 0, 0);
+#endif
+   va_end(vargs);
+   return (int)_qzz_res; /* request result, narrowed to int */
+#endif /* NVALGRIND */
+}
+
+
+/* These requests allow control to move from the simulated CPU to the
+   real CPU, calling an arbitrary function.
+   
+   Note that the current ThreadId is inserted as the first argument.
+   So this call:
+
+     VALGRIND_NON_SIMD_CALL2(f, arg1, arg2)
+
+   requires f to have this signature:
+
+     Word f(Word tid, Word arg1, Word arg2)
+
+   where "Word" is a word-sized type.
+
+   Note that these client requests are not entirely reliable.  For example,
+   if you call a function with them that subsequently calls printf(),
+   there's a high chance Valgrind will crash.  Generally, your prospects of
+   these working are made higher if the called function does not refer to
+   any global variables, and does not refer to any libc or other functions
+   (printf et al).  Any kind of entanglement with libc or dynamic linking is
+   likely to have a bad outcome, for tricky reasons which we've grappled
+   with a lot in the past.
+*/
+#define VALGRIND_NON_SIMD_CALL0(_qyy_fn)                          \
+    VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,       \
+                                    VG_USERREQ__CLIENT_CALL0,     \
+                                    _qyy_fn,                      \
+                                    0, 0, 0, 0)
+
+#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1)                    \
+    VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,            \
+                                    VG_USERREQ__CLIENT_CALL1,          \
+                                    _qyy_fn,                           \
+                                    _qyy_arg1, 0, 0, 0)
+
+#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2)         \
+    VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,            \
+                                    VG_USERREQ__CLIENT_CALL2,          \
+                                    _qyy_fn,                           \
+                                    _qyy_arg1, _qyy_arg2, 0, 0)
+
+#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \
+    VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,             \
+                                    VG_USERREQ__CLIENT_CALL3,           \
+                                    _qyy_fn,                            \
+                                    _qyy_arg1, _qyy_arg2,               \
+                                    _qyy_arg3, 0)
+
+
+/* Evaluates to the number of errors recorded so far by a tool.  Note:
+   the tool must record the errors with VG_(maybe_record_error)() or
+   VG_(unique_error)() for them to be counted. */
+#define VALGRIND_COUNT_ERRORS                                     \
+    (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(                    \
+                               0 /* default return */,            \
+                               VG_USERREQ__COUNT_ERRORS,          \
+                               0, 0, 0, 0, 0)
+
+/* Several Valgrind tools (Memcheck, Massif, Helgrind, DRD) rely on knowing
+   when heap blocks are allocated in order to give accurate results.  This
+   happens automatically for the standard allocator functions such as
+   malloc(), calloc(), realloc(), memalign(), new, new[], free(), delete,
+   delete[], etc.
+
+   But if your program uses a custom allocator, this doesn't automatically
+   happen, and Valgrind will not do as well.  For example, if you allocate
+   superblocks with mmap() and then allocate chunks of the superblocks, all
+   Valgrind's observations will be at the mmap() level and it won't know that
+   the chunks should be considered separate entities.  In Memcheck's case,
+   that means you probably won't get heap block overrun detection (because
+   there won't be redzones marked as unaddressable) and you definitely won't
+   get any leak detection.
+
+   The following client requests allow a custom allocator to be annotated so
+   that it can be handled accurately by Valgrind.
+
+   VALGRIND_MALLOCLIKE_BLOCK marks a region of memory as having been allocated
+   by a malloc()-like function.  For Memcheck (an illustrative case), this
+   does two things:
+
+   - It records that the block has been allocated.  This means any addresses
+     within the block mentioned in error messages will be
+     identified as belonging to the block.  It also means that if the block
+     isn't freed it will be detected by the leak checker.
+
+   - It marks the block as being addressable and undefined (if 'is_zeroed' is
+     not set), or addressable and defined (if 'is_zeroed' is set).  This
+     controls how accesses to the block by the program are handled.
+   
+   'addr' is the start of the usable block (ie. after any
+   redzone), 'sizeB' is its size.  'rzB' is the redzone size if the allocator
+   can apply redzones -- these are blocks of padding at the start and end of
+   each block.  Adding redzones is recommended as it makes it much more likely
+   Valgrind will spot block overruns.  'is_zeroed' indicates if the memory is
+   zeroed (or filled with another predictable value), as is the case for
+   calloc().
+   
+   VALGRIND_MALLOCLIKE_BLOCK should be put immediately after the point where a
+   heap block -- that will be used by the client program -- is allocated.
+   It's best to put it at the outermost level of the allocator if possible;
+   for example, if you have a function my_alloc() which calls
+   internal_alloc(), and the client request is put inside internal_alloc(),
+   stack traces relating to the heap block will contain entries for both
+   my_alloc() and internal_alloc(), which is probably not what you want.
+
+   For Memcheck users: if you use VALGRIND_MALLOCLIKE_BLOCK to carve out
+   custom blocks from within a heap block, B, that has been allocated with
+   malloc/calloc/new/etc, then block B will be *ignored* during leak-checking
+   -- the custom blocks will take precedence.
+
+   VALGRIND_FREELIKE_BLOCK is the partner to VALGRIND_MALLOCLIKE_BLOCK.  For
+   Memcheck, it does two things:
+
+   - It records that the block has been deallocated.  This assumes that the
+     block was annotated as having been allocated via
+     VALGRIND_MALLOCLIKE_BLOCK.  Otherwise, an error will be issued.
+
+   - It marks the block as being unaddressable.
+
+   VALGRIND_FREELIKE_BLOCK should be put immediately after the point where a
+   heap block is deallocated.
+
+   VALGRIND_RESIZEINPLACE_BLOCK informs a tool about reallocation. For
+   Memcheck, it does four things:
+
+   - It records that the size of a block has been changed.  This assumes that
+     the block was annotated as having been allocated via
+     VALGRIND_MALLOCLIKE_BLOCK.  Otherwise, an error will be issued.
+
+   - If the block shrunk, it marks the freed memory as being unaddressable.
+
+   - If the block grew, it marks the new area as undefined and defines a red
+     zone past the end of the new block.
+
+   - The V-bits of the overlap between the old and the new block are preserved.
+
+   VALGRIND_RESIZEINPLACE_BLOCK should be put after allocation of the new block
+   and before deallocation of the old block.
+
+   In many cases, these three client requests will not be enough to get your
+   allocator working well with Memcheck.  More specifically, if your allocator
+   writes to freed blocks in any way then a VALGRIND_MAKE_MEM_UNDEFINED call
+   will be necessary to mark the memory as addressable just before the zeroing
+   occurs, otherwise you'll get a lot of invalid write errors.  For example,
+   you'll need to do this if your allocator recycles freed blocks, but it
+   zeroes them before handing them back out (via VALGRIND_MALLOCLIKE_BLOCK).
+   Alternatively, if your allocator reuses freed blocks for allocator-internal
+   data structures, VALGRIND_MAKE_MEM_UNDEFINED calls will also be necessary.
+
+   Really, what's happening is a blurring of the lines between the client
+   program and the allocator... after VALGRIND_FREELIKE_BLOCK is called, the
+   memory should be considered unaddressable to the client program, but the
+   allocator knows more than the rest of the client program and so may be able
+   to safely access it.  Extra client requests are necessary for Valgrind to
+   understand the distinction between the allocator and the rest of the
+   program.
+
+   Ignored if addr == 0.
+*/
+#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed)          \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MALLOCLIKE_BLOCK,       \
+                                    addr, sizeB, rzB, is_zeroed, 0)
+
+/* Report an in-place resize of a block; see the comment for
+   VALGRIND_MALLOCLIKE_BLOCK for details.  Ignored if addr == 0.
+*/
+#define VALGRIND_RESIZEINPLACE_BLOCK(addr, oldSizeB, newSizeB, rzB)     \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__RESIZEINPLACE_BLOCK,    \
+                                    addr, oldSizeB, newSizeB, rzB, 0)
+
+/* Report deallocation of a block; see the comment for
+   VALGRIND_MALLOCLIKE_BLOCK for details.  Ignored if addr == 0.
+*/
+#define VALGRIND_FREELIKE_BLOCK(addr, rzB)                              \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__FREELIKE_BLOCK,         \
+                                    addr, rzB, 0, 0, 0)
+
+/* Create a memory pool. */
+#define VALGRIND_CREATE_MEMPOOL(pool, rzB, is_zeroed)             \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__CREATE_MEMPOOL,   \
+                                    pool, rzB, is_zeroed, 0, 0)
+
+/* Destroy a memory pool. */
+#define VALGRIND_DESTROY_MEMPOOL(pool)                            \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DESTROY_MEMPOOL,  \
+                                    pool, 0, 0, 0, 0)
+
+/* Associate a piece of memory with a memory pool. */
+#define VALGRIND_MEMPOOL_ALLOC(pool, addr, size)                  \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MEMPOOL_ALLOC,    \
+                                    pool, addr, size, 0, 0)
+
+/* Disassociate a piece of memory from a memory pool. */
+#define VALGRIND_MEMPOOL_FREE(pool, addr)                         \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MEMPOOL_FREE,     \
+                                    pool, addr, 0, 0, 0)
+
+/* Disassociate any pieces outside a particular range. */
+#define VALGRIND_MEMPOOL_TRIM(pool, addr, size)                   \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MEMPOOL_TRIM,     \
+                                    pool, addr, size, 0, 0)
+
+/* Tell the tool that the pool anchored at poolA has moved to poolB. */
+#define VALGRIND_MOVE_MEMPOOL(poolA, poolB)                       \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MOVE_MEMPOOL,     \
+                                    poolA, poolB, 0, 0, 0)
+
+/* Resize and/or move a piece associated with a memory pool. */
+#define VALGRIND_MEMPOOL_CHANGE(pool, addrA, addrB, size)         \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MEMPOOL_CHANGE,   \
+                                    pool, addrA, addrB, size, 0)
+
+/* Return 1 if a mempool exists, else 0. */
+#define VALGRIND_MEMPOOL_EXISTS(pool)                             \
+    (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default */,    \
+                               VG_USERREQ__MEMPOOL_EXISTS,        \
+                               pool, 0, 0, 0, 0)
+
+/* Mark a piece of memory (start, end) as being a stack.  Returns a stack id. */
+#define VALGRIND_STACK_REGISTER(start, end)                       \
+    (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default */,    \
+                               VG_USERREQ__STACK_REGISTER,        \
+                               start, end, 0, 0, 0)
+
+/* Unmark the piece of memory associated with a stack id as being a
+   stack.  The id is the value returned by VALGRIND_STACK_REGISTER. */
+#define VALGRIND_STACK_DEREGISTER(id)                             \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STACK_DEREGISTER, \
+                                    id, 0, 0, 0, 0)
+
+/* Change the start and end address of the stack with the given id. */
+#define VALGRIND_STACK_CHANGE(id, start, end)                     \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STACK_CHANGE,     \
+                                    id, start, end, 0, 0)
+
+/* Load PDB debug info for Wine PE image_map. */
+#define VALGRIND_LOAD_PDB_DEBUGINFO(fd, ptr, total_size, delta)     \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__LOAD_PDB_DEBUGINFO, \
+                                    fd, ptr, total_size, delta, 0)
+
+/* Map a code address to a source file name and line number.  buf64
+   must point to a 64-byte buffer in the caller's address space.  The
+   result is written there and is guaranteed to be zero
+   terminated.  If no info is found, the first byte is set to zero. */
+#define VALGRIND_MAP_IP_TO_SRCLOC(addr, buf64)                    \
+    (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default */,    \
+                               VG_USERREQ__MAP_IP_TO_SRCLOC,      \
+                               addr, buf64, 0, 0, 0)
+
+/* Disable error reporting for this thread.  Behaves in a stack like
+   way, so you can safely call this multiple times provided that
+   VALGRIND_ENABLE_ERROR_REPORTING is called the same number of times
+   to re-enable reporting.  The first call of this macro disables
+   reporting.  Subsequent calls have no effect except to increase the
+   number of VALGRIND_ENABLE_ERROR_REPORTING calls needed to re-enable
+   reporting.  Child threads do not inherit this setting from their
+   parents -- they are always created with reporting enabled. */
+#define VALGRIND_DISABLE_ERROR_REPORTING                                \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__CHANGE_ERR_DISABLEMENT, \
+                                    1 /* delta */, 0, 0, 0, 0)
+
+/* Re-enable error reporting, as per comments on
+   VALGRIND_DISABLE_ERROR_REPORTING. */
+#define VALGRIND_ENABLE_ERROR_REPORTING                                 \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__CHANGE_ERR_DISABLEMENT, \
+                                    -1 /* delta */, 0, 0, 0, 0)
+
+#undef PLAT_x86_darwin
+#undef PLAT_amd64_darwin
+#undef PLAT_x86_win32
+#undef PLAT_x86_linux
+#undef PLAT_amd64_linux
+#undef PLAT_ppc32_linux
+#undef PLAT_ppc64_linux
+#undef PLAT_arm_linux
+#undef PLAT_s390x_linux
+
+#endif   /* __VALGRIND_H */