From 5d5ddba7f10596fc574fb0639012d720b9d66c62 Mon Sep 17 00:00:00 2001
From: Steven Flintham <sgf@lemma.co.uk>
Date: Wed, 25 Jun 2014 18:47:24 +0100
Subject: [PATCH] First public release

---
 .gitignore                                 |   42 +
 AddressRange.cpp                           |   42 +
 AddressRange.h                             |  101 +
 AddressSet.cpp                             |   95 +
 AddressSet.h                               |   66 +
 COPYING                                    |   60 +
 CREDITS                                    |   33 +
 Function.cpp                               |  417 ++
 Function.h                                 |  112 +
 FunctionBuilder.cpp                        | 3571 +++++++++++++++++
 FunctionBuilder.h                          |  364 ++
 FunctionManager.cpp                        |  310 ++
 FunctionManager.h                          |  151 +
 JitBool.h                                  |   32 +
 LLVMStuff.cpp                              |   41 +
 LLVMStuff.h                                |   39 +
 M6502Internal.h                            |   43 +
 Makefile.am                                |  130 +
 README                                     |   84 +
 README.lib6502                             |  136 +
 Registers.cpp                              |   59 +
 Registers.h                                |   51 +
 TODO                                       |   67 +
 build-aux/tap-driver.sh                    |  649 ++++
 config.h.in                                |   89 +
 configure.ac                               |   94 +
 const.h                                    |   58 +
 examples/README                            |  406 ++
 examples/hex2bin                           |    6 +
 examples/lib1.c                            |  108 +
 lib6502-compatibility.txt                  |   54 +
 lib6502-jit.cpp                            |  190 +
 lib6502.c                                  |  910 +++++
 lib6502.h                                  |  120 +
 m4/boost.m4                                | 1338 +++++++
 man/M6502_delete.3                         |    1 +
 man/M6502_disassemble.3                    |    1 +
 man/M6502_dump.3                           |    1 +
 man/M6502_getCallback.3                    |    1 +
 man/M6502_getVector.3                      |    1 +
 man/M6502_irq.3                            |    1 +
 man/M6502_new.3                            |    1 +
 man/M6502_nmi.3                            |    1 +
 man/M6502_reset.3                          |    1 +
 man/M6502_run.3                            |    1 +
 man/M6502_setCallback.3                    |    1 +
 man/M6502_setMode.3                        |    1 +
 man/M6502_setVector.3                      |    1 +
 man/lib6502.3                              |  555 +++
 man/run6502.1                              |  396 ++
 run6502.c                                  |  599 +++
 test/addr-wrap-1.mst                       |    1 +
 test/addr-wrap-1.xa                        |   25 +
 test/basic-callback.c                      |  122 +
 test/basic-callback.mst                    |   33 +
 test/call-illegal-callback-modify-code.c   |  121 +
 test/call-illegal-callback-modify-code.mst |   20 +
 test/config.xa                             |    4 +
 test/interleave.mst                        |    1 +
 test/interleave.xa                         |   38 +
 test/irq-nmi.c                             |  116 +
 test/irq-nmi.mst                           |   21 +
 test/pc-wrap-1.mst                         |    1 +
 test/pc-wrap-1.xa                          |   28 +
 test/pc-wrap-2.mst                         |    1 +
 test/pc-wrap-2.xa                          |   28 +
 test/run-c-tests.py                        |   33 +
 test/run-c-tests.sh                        |    2 +
 test/run-run6502-tests.py                  |   59 +
 test/run-run6502-tests.sh                  |    2 +
 test/setjmp-trick.c                        |  125 +
 test/setjmp-trick.mst                      |   35 +
 test/stack-code-brk.c                      |  108 +
 test/stack-code-brk.mst                    |    1 +
 test/stack-code-jsr.c                      |   90 +
 test/stack-code-jsr.mst                    |    3 +
 test/test-utils.c                          |  106 +
 test/test-utils.h                          |   30 +
 test/trivial-test.mst                      |    1 +
 test/trivial-test.xa                       |    5 +
 test/write-callback-modify-code.c          |  100 +
 test/write-callback-modify-code.mst        |    3 +
 test/z-self-modify-1.mst                   |    1 +
 test/z-self-modify-1.xa                    |   94 +
 test/z-self-modify-2.mst                   |    1 +
 test/z-self-modify-2.xa                    |  125 +
 util.cpp                                   |   57 +
 util.h                                     |   73 +
 valgrind.h                                 | 4060 ++++++++++++++++++++
 89 files changed, 17305 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 AddressRange.cpp
 create mode 100644 AddressRange.h
 create mode 100644 AddressSet.cpp
 create mode 100644 AddressSet.h
 create mode 100644 COPYING
 create mode 100644 CREDITS
 create mode 100644 Function.cpp
 create mode 100644 Function.h
 create mode 100644 FunctionBuilder.cpp
 create mode 100644 FunctionBuilder.h
 create mode 100644 FunctionManager.cpp
 create mode 100644 FunctionManager.h
 create mode 100644 JitBool.h
 create mode 100644 LLVMStuff.cpp
 create mode 100644 LLVMStuff.h
 create mode 100644 M6502Internal.h
 create mode 100644 Makefile.am
 create mode 100644 README
 create mode 100644 README.lib6502
 create mode 100644 Registers.cpp
 create mode 100644 Registers.h
 create mode 100644 TODO
 create mode 100755 build-aux/tap-driver.sh
 create mode 100644 config.h.in
 create mode 100644 configure.ac
 create mode 100644 const.h
 create mode 100644 examples/README
 create mode 100755 examples/hex2bin
 create mode 100644 examples/lib1.c
 create mode 100644 lib6502-compatibility.txt
 create mode 100644 lib6502-jit.cpp
 create mode 100644 lib6502.c
 create mode 100644 lib6502.h
 create mode 100644 m4/boost.m4
 create mode 100644 man/M6502_delete.3
 create mode 100644 man/M6502_disassemble.3
 create mode 100644 man/M6502_dump.3
 create mode 100644 man/M6502_getCallback.3
 create mode 100644 man/M6502_getVector.3
 create mode 100644 man/M6502_irq.3
 create mode 100644 man/M6502_new.3
 create mode 100644 man/M6502_nmi.3
 create mode 100644 man/M6502_reset.3
 create mode 100644 man/M6502_run.3
 create mode 100644 man/M6502_setCallback.3
 create mode 100644 man/M6502_setMode.3
 create mode 100644 man/M6502_setVector.3
 create mode 100644 man/lib6502.3
 create mode 100644 man/run6502.1
 create mode 100644 run6502.c
 create mode 100644 test/addr-wrap-1.mst
 create mode 100644 test/addr-wrap-1.xa
 create mode 100644 test/basic-callback.c
 create mode 100644 test/basic-callback.mst
 create mode 100644 test/call-illegal-callback-modify-code.c
 create mode 100644 test/call-illegal-callback-modify-code.mst
 create mode 100644 test/config.xa
 create mode 100644 test/interleave.mst
 create mode 100644 test/interleave.xa
 create mode 100644 test/irq-nmi.c
 create mode 100644 test/irq-nmi.mst
 create mode 100644 test/pc-wrap-1.mst
 create mode 100644 test/pc-wrap-1.xa
 create mode 100644 test/pc-wrap-2.mst
 create mode 100644 test/pc-wrap-2.xa
 create mode 100755 test/run-c-tests.py
 create mode 100755 test/run-c-tests.sh
 create mode 100755 test/run-run6502-tests.py
 create mode 100755 test/run-run6502-tests.sh
 create mode 100644 test/setjmp-trick.c
 create mode 100644 test/setjmp-trick.mst
 create mode 100644 test/stack-code-brk.c
 create mode 100644 test/stack-code-brk.mst
 create mode 100644 test/stack-code-jsr.c
 create mode 100644 test/stack-code-jsr.mst
 create mode 100644 test/test-utils.c
 create mode 100644 test/test-utils.h
 create mode 100644 test/trivial-test.mst
 create mode 100644 test/trivial-test.xa
 create mode 100644 test/write-callback-modify-code.c
 create mode 100644 test/write-callback-modify-code.mst
 create mode 100644 test/z-self-modify-1.mst
 create mode 100644 test/z-self-modify-1.xa
 create mode 100644 test/z-self-modify-2.mst
 create mode 100644 test/z-self-modify-2.xa
 create mode 100644 util.cpp
 create mode 100644 util.h
 create mode 100644 valgrind.h

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1be6b44
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,42 @@
+*~
+*.o
+*.lo
+.deps
+.libs
+Makefile
+Makefile.in
+aclocal.m4
+autom4te.cache
+config.guess
+config.h
+config.log
+config.status
+config.sub
+configure
+depcomp
+examples/.dirstamp
+examples/lib1
+install-sh
+lib6502-jit*
+lib6502-jit*
+libtool
+ltmain.sh
+m4/libtool.m4
+m4/ltoptions.m4
+m4/ltsugar.m4
+m4/ltversion.m4
+m4/lt~obsolete.m4
+missing
+run6502
+stamp-h1
+test/.dirstamp
+test/*.mc
+test/basic-callback
+test/call-illegal-callback-modify-code
+test/irq-nmi
+test/setjmp-trick
+test/stack-code-brk
+test/stack-code-jsr
+test/write-callback-modify-code
+test/z-self-modify-1.mc
+test/z-self-modify-1.out
diff --git a/AddressRange.cpp b/AddressRange.cpp
new file mode 100644
index 0000000..1ec95cb
--- /dev/null
+++ b/AddressRange.cpp
@@ -0,0 +1,42 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "AddressRange.h"
+
+#include <assert.h>
+
+#include "const.h"
+
+AddressRange::AddressRange(uint16_t addr) // covers just [addr, addr + 1)
+: range_begin_(addr), range_end_(static_cast<uint32_t>(addr) + 1)
+{   // range_end_ is derived from addr so member declaration order can't matter
+}
+
+AddressRange::AddressRange(uint32_t range_begin, uint32_t range_end)
+: range_begin_(range_begin), range_end_(range_end)
+{
+    assert(range_begin_ < memory_size); // begin must lie below memory_size
+    assert(range_end_ <= (memory_size + 0xff)); // end may overshoot (wrapping)
+    assert(range_begin_ < range_end_); // empty and reversed ranges are rejected
+}
+
+bool AddressRange::all_memory() const
+{
+    // Only the canonical [0, memory_size) form is recognised. Degenerate
+    // wrapped ranges (e.g. 0x1 to 0x10002) are missed, but that doesn't matter.
+    return (range_end_ == memory_size) && (range_begin_ == 0);
+}
diff --git a/AddressRange.h b/AddressRange.h
new file mode 100644
index 0000000..f03744a
--- /dev/null
+++ b/AddressRange.h
@@ -0,0 +1,101 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+// An AddressRange represents a contiguous range of addresses in the emulated
+// memory, expressed as a half-open interval ("begin" is included, "end" is
+// excluded). To allow convenient handling of cases where addresses wrap around
+// at the top of memory, end may be as large as 0x100ff; this allows the
+// effective address range of an instruction like LDA &ffff,Y to be represented.
+// (The "largest" address accessed is &00fe, and since the interval is half-open
+// end needs to allow a value one larger.)
+
+#ifndef ADDRESSRANGE_H
+#define ADDRESSRANGE_H
+
+#include <stdint.h>
+
+class AddressRange
+{
+public:
+    // Convenience function; equivalent to AddressRange(addr, addr + 1) without
+    // any need to worry about whether addr + 1 will wrap to 0.
+    AddressRange(uint16_t addr);
+    // The half-open range [range_begin, range_end); range_end may exceed 0xffff.
+    AddressRange(uint32_t range_begin, uint32_t range_end);
+
+    uint32_t range_begin() const // first address in the range (inclusive)
+    {
+        return range_begin_;
+    }
+
+    uint32_t range_end() const // one past the last address; may exceed 0xffff
+    {
+        return range_end_;
+    }
+
+    // Return true iff AddressRange covers the whole of memory.
+    bool all_memory() const;
+    // Minimal forward iterator yielding each address in the range in turn.
+    class const_iterator
+    {
+    friend class AddressRange;
+
+    public:
+        uint16_t operator*() const
+        {
+            // Truncating down to 16 bits gives exactly the behaviour we
+            // require if this is a range which uses values >= 0x10000 to
+            // indicate wrapping around to the start of memory.
+            return static_cast<uint16_t>(v_);
+        }
+
+        const_iterator &operator++()
+        {
+            ++v_;
+            return *this;
+        }
+        // const, so that const iterators and temporaries can be compared too.
+        bool operator!=(const const_iterator &rhs) const
+        {
+            return v_ != rhs.v_;
+        }
+
+    private:
+        explicit const_iterator(uint32_t v)
+        : v_(v)
+        {
+        }
+
+        uint32_t v_;
+    };
+
+    const_iterator begin() const
+    {
+        return const_iterator(range_begin_);
+    }
+
+    const_iterator end() const
+    {
+        return const_iterator(range_end_);
+    }
+
+private:
+    uint32_t range_begin_;
+    uint32_t range_end_;
+};
+
+#endif
diff --git a/AddressSet.cpp b/AddressSet.cpp
new file mode 100644
index 0000000..971ba5a
--- /dev/null
+++ b/AddressSet.cpp
@@ -0,0 +1,95 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "AddressSet.h"
+
+#include <assert.h>
+#include <sstream>
+#include <stddef.h>
+
+#include "AddressRange.h"
+#include "util.h"
+
+void AddressSet::insert(uint16_t address)
+{
+    set_.insert(address); // std::set ignores an address it already holds
+}
+
+void AddressSet::insert(const AddressRange &range)
+{
+    // *i truncates to 16 bits, so ranges wrapping past the top of memory work.
+    for (AddressRange::const_iterator i = range.begin(); i != range.end(); ++i)
+    {
+        set_.insert(*i);
+    }
+}
+
+namespace
+{
+    // Render the half-open range [range_start, range_end) as hex text: just
+    // the address if it holds exactly one, otherwise "first-last".
+    std::string dump_range(uint32_t range_start, uint32_t range_end)
+    {
+        const uint32_t last = range_end - 1;
+
+        std::stringstream out;
+        out << std::hex << std::setfill('0');
+        out << "0x" << std::setw(4) << range_start;
+        if (range_start != last)
+        {
+            // An inclusive upper bound is probably more readable in a dump
+            // than the half-open form which is "natural" in the code
+            // itself.
+            out << "-" << "0x" << std::setw(4) << last;
+        }
+        return out.str();
+    }
+}
+
+std::string AddressSet::dump(int indent) const
+{
+    std::stringstream s;
+    // One output line per maximal run of consecutive addresses in the set.
+    bool in_range = false;
+    uint32_t range_start = 0; // first address of the run being accumulated
+    uint32_t range_last = 0;  // last address seen so far in that run
+    for (AddressSet::const_iterator it = set_.begin(); it != set_.end(); ++it)
+    {
+        uint16_t i = *it;
+        if (!in_range)
+        {
+            range_start = i;
+            range_last = i;
+            in_range = true;
+        }
+        else
+        {
+            if (i != (range_last + 1))
+            {   // gap found: write out the finished run and start a new one
+                s << spaces(indent) <<
+                     dump_range(range_start, range_last + 1) << "\n";
+                range_start = i;
+            }
+            range_last = i;
+        }
+    }
+    if (in_range)
+    {   // write out the final run; skipped entirely for an empty set
+        s << spaces(indent) << dump_range(range_start, range_last + 1) << "\n";
+    }
+    return s.str();
+}
diff --git a/AddressSet.h b/AddressSet.h
new file mode 100644
index 0000000..d9d8ef4
--- /dev/null
+++ b/AddressSet.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef ADDRESSSET_H
+#define ADDRESSSET_H
+
+#include <set>
+#include <stdint.h>
+#include <string>
+
+class AddressRange;
+
+class AddressSet // an ordered set of 16-bit addresses
+{
+private:
+    // This might not be the perfect representation, but it's simple and clean,
+    // so let's stick with it unless profiling shows this is a problem.
+    typedef std::set<uint16_t> Container;
+
+public:
+    AddressSet()
+    {
+    }
+    // Add a single address; nothing changes if it is already present.
+    void insert(uint16_t address);
+    // Add every address which iterating over range yields.
+    void insert(const AddressRange &range);
+
+    typedef Container::const_iterator const_iterator;
+
+    const_iterator begin() const
+    {
+        return set_.begin();
+    }
+
+    const_iterator end() const
+    {
+        return set_.end();
+    }
+
+    Container::size_type size() const
+    {
+        return set_.size();
+    }
+    // Text listing of the set's contents; each line is prefixed by 'indent'.
+    std::string dump(int indent) const;
+
+private:
+    Container set_; // declared via the typedef so all uses stay in step
+};
+
+#endif
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..2cf8818
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,60 @@
+TL;DR: If you're redistributing this you should read through the text below and
+examine the headers on the individual files, but basically the C/C++ source
+code (with the exception of valgrind.h, which can be removed if necessary) was
+all written by Ian Piumarta or Steven Flintham and is licensed under the "MIT
+(X11 flavour)" licence at the bottom of this file, just as lib6502 itself is.
+The autotools infrastructure support is GPL licensed but has exceptions for use
+(as is the case here) in autoconfigured packages.
+
+
+
+valgrind.h has its own license; see the comments at the top of that file.
+
+build-aux/tap-driver.sh (used as part of "make check") is GPLv2 licensed with
+an exception (which I believe applies to this package) allowing distribution
+under "the same distribution terms that you use for the rest of that program".
+See the comments at the top of that file for more details.
+
+m4/boost.m4 (used to autoconfigure the build against the Boost libraries) is
+GPLv3 licensed with an exception (which I believe applies to this package)
+allowing distribution under "terms of your choice". See the comments at the top
+of that file for more details.
+
+The text below is from Ian Piumarta's lib6502's COPYING file. lib6502-jit
+contains almost all of the code and documentation from lib6502 itself.
+
+As the author of the remaining parts of lib6502-jit, I am granting the same
+permissions and have added my own copyright notice, but the text below is
+otherwise unchanged. 
+
+-- Steven Flintham
+
+
+
+Distasteful though it is for me to have to induce from afar any perturbation
+into your pursuit of happiness, this MIT (X11 flavour) license is at least
+relatively benign.  Investigation into copyright stupidity reveals that it is
+effectively impossible to dedicate (formally) any software to the public
+domain (the only sure path to this most enlightened status being to leave the
+software to expire naturally from its 25-, 50-, 75- or whatever-year copyright
+rot).  I fear this is not going to change before the revolution comes.  In the
+meantime the only way I can *guarantee* you any rights at all to this software
+would (unfortunately) appear to be...
+
+  Copyright (c) 2005 Ian Piumarta
+  Copyright (c) 2014 Steven Flintham
+
+  All rights reserved.
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the 'Software'), to
+  deal in the Software without restriction, including without limitation the
+  rights to use, copy, modify, merge, publish, distribute, and/or sell copies
+  of the Software, and to permit persons to whom the Software is furnished to
+  do so, provided that the above copyright notice(s) and this permission
+  notice appear in all copies or substantial portions of the Software.
+
+  Inclusion of the above copyright notice(s) and this permission notice in
+  supporting documentation would be appreciated, but is not required.
+
+  THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
diff --git a/CREDITS b/CREDITS
new file mode 100644
index 0000000..c7d726b
--- /dev/null
+++ b/CREDITS
@@ -0,0 +1,33 @@
+lib6502-jit implements (nearly) the same API as Ian Piumarta's lib6502
+(http://www.piumarta.com/software/lib6502/) and includes virtually all of
+lib6502's code and documentation with only minor modifications; the lib6502
+emulation code is used to implement the interpreted and hybrid emulation modes
+in lib6502-jit. The contents of the examples and man directories are almost
+verbatim copies of those in lib6502. Thanks to Ian for making lib6502
+available. Please do not send bug reports regarding lib6502-jit to Ian!
+
+This distribution itself doesn't contain any LLVM code, but obviously without
+the LLVM project lib6502-jit could not exist.
+
+valgrind.h is taken from Valgrind (http://valgrind.org/).
+
+build-aux/tap-driver.sh is part of GNU Automake and was taken from
+https://raw.githubusercontent.com/kergoth/automake/master/lib/tap-driver.sh.
+
+m4/boost.m4 (used to autoconfigure the build against the Boost libraries) is
+taken from https://github.com/tsuna/boost.m4.
+
+While I'd be lying if I said I enjoyed working with Autotools, I am grateful
+for the work people have put in to make it possible to build packages portably
+on a range of different platforms.
+
+The technique (but not the code) used to translate a JITted function's machine
+code into assembly in Function::dump_machine_code() is taken from the libjit
+(https://www.gnu.org/software/libjit/) dump_object_code() function.
+
+The algorithm used to implement ADC/SBC in decimal mode is taken from
+http://www.6502.org/tutorials/decimal_mode.html. The test program on the same
+page was used to validate the implementation.
+
+Klaus Dormann's "6502 functional test" and "65C02 extended opcodes test" were
+used to validate the behaviour of the emulation.
diff --git a/Function.cpp b/Function.cpp
new file mode 100644
index 0000000..d766bec
--- /dev/null
+++ b/Function.cpp
@@ -0,0 +1,417 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "Function.h"
+
+#include <errno.h>
+#include <sstream>
+#include <stdexcept>
+#include <unistd.h>
+#include "valgrind.h"
+
+#include "const.h"
+#include "LLVMStuff.h"
+#include "M6502Internal.h"
+#include "Registers.h"
+#include "util.h"
+
+// Note that we call update_memory_snapshot() after invoking callbacks here, but
+// not before. It would be correct to do so, but it's not necessary. Firstly, we
+// arrange that the memory snapshot is kept up-to-date during execution under
+// our control (i.e. not involving callbacks), so it isn't necessary. Secondly,
+// even if it were necessary, it would be redundant, since any actions needed
+// as a result of the update can wait until after the callback is called and the
+// call after the callback would perform them.
+
+namespace
+{
+    // We have the callback_pc argument to allow us to special-case the
+    // contents of the PC register for lib6502 compatibility. Without this
+    // we would always pass registers.pc, which is "address of the next
+    // instruction to execute if the callback doesn't intervene" in PC;
+    // this agrees with lib6502 for JMP (absolute and indirect) but not for JSR
+    // or BRK.
+    uint16_t handle_call_callback(M6502 *mpu, uint16_t callback_pc, 
+                                  uint8_t opcode)
+    {
+        Registers &registers = mpu->internal->registers_;
+        uint16_t default_next_pc = registers.pc; // result if nothing intervenes
+        if (mpu->callbacks->call[registers.pc] != 0)
+        {   // a call callback is registered for the destination address
+            registers.pc = callback_pc;
+            registers.to_M6502_Registers(mpu); // presumably syncs state to mpu
+            TRACE("Call callback, mpu " << mpu << ", address 0x" << std::hex << 
+                  std::setfill('0') << std::setw(4) << default_next_pc << 
+                  ", data 0x" << std::setw(2) << static_cast<int>(opcode));
+            uint16_t address = default_next_pc;
+            if (opcode == opcode_brk)
+            {
+                address = callback_pc - 2; // lib6502 does this
+            }
+            int callback_result = 
+                mpu->callbacks->call[default_next_pc](mpu, address, opcode);
+            TRACE("Callback returned 0x" << std::hex << std::setfill('0') <<
+                  std::setw(4) << callback_result);
+            registers.from_M6502_Registers(mpu); // presumably reads state back
+            mpu->internal->function_manager_.update_memory_snapshot();
+            if (callback_result != 0)
+            {   // a non-zero result replaces the default continuation address
+                return callback_result;
+            }
+        }
+        return default_next_pc;
+    }
+
+    uint16_t get_stacked_pc(M6502 *mpu, int offset)
+    {
+        // Step over 'offset' bytes above the stack pointer (staying within
+        // page 1), then read the next two bytes as a little-endian address.
+        uint8_t sp = mpu->internal->registers_.s;
+        if (offset > 0)
+        {
+            sp = static_cast<uint8_t>(sp + (offset & 0xff));
+        }
+
+        const uint8_t lo = mpu->memory[0x100 + static_cast<uint8_t>(sp + 1)];
+        const uint8_t hi = mpu->memory[0x100 + static_cast<uint8_t>(sp + 2)];
+
+        return lo | (hi << 8);
+    }
+
+    uint16_t handle_push_and_control_transfer_opcode(
+        M6502 *mpu, uint16_t callback_pc, uint8_t opcode, int bytes_pushed)
+    {
+        assert(bytes_pushed >= 2);
+        // Report the bytes_pushed stack bytes above S as modified, in order.
+        const uint8_t sp = mpu->internal->registers_.s;
+        for (int n = 0; n < bytes_pushed; ++n)
+        {
+            const uint8_t slot = static_cast<uint8_t>(sp + 1 + (n & 0xff));
+            mpu->internal->function_manager_.code_modified_at(0x100 + slot);
+        }
+
+        return handle_call_callback(mpu, callback_pc, opcode);
+    }
+}
+
+Function::Function(
+    M6502 *mpu, uint16_t address, const AddressSet &code_range, 
+    const AddressSet &optimistic_writes, llvm::Function *llvm_function)
+: mpu_(mpu),
+  llvm_stuff_(mpu->internal->llvm_stuff_), // read by jitted_function_ below
+  address_(address),
+  code_range_(code_range),
+  optimistic_writes_(optimistic_writes),
+  llvm_function_(llvm_function),
+  jitted_function_(reinterpret_cast<Function::JitFunction>(
+    llvm_stuff_.execution_engine_->getPointerToFunction(llvm_function)))
+{   // NOTE(review): presumably fills in mci_, which ~Function() reads - confirm
+    llvm_stuff_.execution_engine_->runJITOnFunction(llvm_function_, &mci_);
+}
+
+Function::~Function()
+{
+    TRACE("Destructor for Function at address " << std::hex << 
+          std::setfill('0') << std::setw(4) << address_);
+    // Ask Valgrind to discard its translations of the machine code at mci_.
+    VALGRIND_DISCARD_TRANSLATIONS(mci_.address(), mci_.size());
+    llvm_function_->eraseFromParent(); // remove the IR from its LLVM module
+}
+
+void Function::handle_complex_result(FunctionBuilder::Result result) const
+{
+    Registers &registers = mpu_->internal->registers_;
+    // Dispatch on the result code handed back by the JIT function.
+    switch (result)
+    {
+        case FunctionBuilder::result_control_transfer_direct:
+            CANT_HAPPEN("Direct case reached handle_complex_result()");
+
+        case FunctionBuilder::result_control_transfer_indirect:
+            registers.pc = handle_call_callback(mpu_, registers.pc,
+                                                registers.data);
+            break;
+
+        case FunctionBuilder::result_brk:
+            registers.pc = handle_push_and_control_transfer_opcode(
+                mpu_, get_stacked_pc(mpu_, 1), opcode_brk, 3);
+            break;
+
+        case FunctionBuilder::result_jsr_complex:
+            registers.pc = handle_push_and_control_transfer_opcode(
+                mpu_, get_stacked_pc(mpu_, 0) + 1, opcode_jsr, 2);
+            break;
+
+        case FunctionBuilder::result_illegal_instruction:
+        {
+            registers.to_M6502_Registers(mpu_);
+            TRACE("Illegal instruction callback, mpu " << mpu_ <<
+                  ", address 0x" << std::hex << std::setfill('0') <<
+                  std::setw(4) << registers.addr << ", data 0x" <<
+                  std::setw(2) << static_cast<int>(registers.data));
+            uint16_t new_pc =
+                mpu_->callbacks->illegal_instruction[registers.data](
+                    mpu_, registers.addr, registers.data);
+            TRACE("Callback returned 0x" << std::hex << std::setfill('0') <<
+                  std::setw(4) << new_pc);
+            registers.from_M6502_Registers(mpu_);
+            mpu_->internal->function_manager_.update_memory_snapshot();
+            if (new_pc != 0)
+            {   // a zero result leaves the program counter untouched
+                registers.pc = new_pc;
+            }
+            break;
+        }
+
+        case FunctionBuilder::result_write_to_code:
+            TRACE("Code modified at 0x" << std::hex << std::setfill('0') <<
+                  std::setw(4) << registers.addr);
+            mpu_->internal->function_manager_.code_modified_at(registers.addr);
+            break;
+
+        case FunctionBuilder::result_write_callback:
+        {
+            TRACE("Write callback at 0x" << std::hex << std::setfill('0') <<
+                  std::setw(4) << registers.addr << " with data 0x" <<
+                  std::setw(2) << static_cast<int>(registers.data));
+            // We *don't* invoke Registers.{to,from}_M6502Registers() before
+            // and after the callback. We could do this, but lib6502 itself
+            // (and therefore the lib6502 code used for interpreting in
+            // lib6502-jit) doesn't do that, so this could be confusing
+            // for client code. (For example, a callback might be written
+            // to rely on this, it would work if called from compiled code
+            // but wouldn't work if called from interpreted mode. So its
+            // behaviour in hybrid mode would be random.)
+            (void) mpu_->callbacks->write[registers.addr](
+                mpu_, registers.addr, registers.data);
+            mpu_->internal->function_manager_.update_memory_snapshot();
+            break;
+        }
+
+        case FunctionBuilder::result_invalid_bounds:
+            CANT_HAPPEN("Invalid bounds inside Function for address 0x" <<
+                        std::hex << std::setfill('0') << std::setw(4) <<
+                        address_);
+
+        default:
+            CANT_HAPPEN("Unknown result " << result << " from JIT function");
+    }
+}
+
+#ifdef LOG
+
+namespace
+{
+    // Returns 's' run through apply_prefix() with a prefix of spaces(n).
+    std::string indent(int n, const std::string &s)
+    {
+        return apply_prefix(spaces(n), s);
+    }
+}
+
+std::string Function::dump_all() const
+{
+    const std::string sp = spaces(1); // indentation for each section heading
+    std::stringstream out;
+    out << "Function at 0x" << std::hex << std::setfill('0') << std::setw(4)
+        << address_ << ":\n"
+        << sp << "Code range:\n" << code_range_.dump(2) << "\n"
+        << sp << "Optimistic writes at:\n" << optimistic_writes_.dump(2) << "\n"
+        << sp << "6502 machine code:\n" << indent(2, disassembly_) << "\n"
+        << sp << "Unoptimised IR:\n" << indent(2, unoptimised_ir_) << "\n"
+        << sp << "Optimised IR:\n" << indent(2, optimised_ir_) << "\n"
+        << sp << "Host machine code:\n" << indent(2, dump_machine_code());
+    return out.str();
+}
+
+#endif
+
+namespace
+{
+    // RAII guard: runs close_fn at destruction unless close() was called.
+    template <class Handle, class CloseFnType, CloseFnType close_fn>
+    class AutoClose : boost::noncopyable
+    {
+    public:
+        explicit AutoClose(Handle h) : open_(true), h_(h) {}
+
+        // Close now, returning close_fn's result; repeat calls just return 0.
+        int close()
+        {
+            const bool was_open = open_;
+            open_ = false;
+            return was_open ? close_fn(h_) : 0;
+        }
+
+        ~AutoClose()
+        {
+            if (open_)
+            {
+                close_fn(h_); // ignore return code, nothing we can do if it fails
+            }
+        }
+
+    private:
+        bool open_;
+        Handle h_;
+    };
+
+    typedef int (*FdClose)(int);
+    typedef AutoClose<int, FdClose, ::close> FdAutoClose;
+    typedef int (*PopenClose)(FILE *);
+    typedef AutoClose<FILE *, PopenClose, ::pclose> PopenAutoClose;
+}
+
+#ifdef LOG
+
+std::string Function::dump_machine_code() const
+{
+    try
+    {
+        // Returns a disassembly of this function's host machine code (the
+        // mci_.size() bytes at mci_.address()). The basic idea of outputting
+        // .byte directives, assembling those and then disassembling the result
+        // is taken from libjit's dump_object_code(); the code is not copied.
+        char as_output_file[] = "/tmp/lib6502-jit-XXXXXX";
+
+        errno = 0; // lets fail_errno_or() tell whether a failure set errno
+
+        // mkstemp() creates a unique filename and opens it. We unlink the file
+        // immediately so it has no name; this minimises (but does not
+        // eliminate; we might be killed between mkstemp() and unlink()) the
+        // chance of the file being left lying around. Since we need a name for
+        // the 'as' and 'objdump' commands, we use /dev/fd/nn to refer to it
+        // afterwards.
+        int fd = mkstemp(as_output_file);
+        if (fd == -1)
+        {
+            fail_errno_or("mkstemp() failed");
+        }
+        FdAutoClose auto_close_fd(fd); // closes fd if we leave via an exception
+        if (unlink(as_output_file) == -1)
+        {
+            fail_errno_or("unlink() failed");
+        }
+        // Assemble: pipe one .byte directive per byte of machine code to 'as'.
+        {
+            std::stringstream as_command;
+            as_command << "as -o /dev/fd/" << fd << " 2>/dev/null";
+            FILE *f = popen(as_command.str().c_str(), "w");
+            if (f == 0)
+            {
+                fail_errno_or("popen() failed (for 'as')");
+            }
+            PopenAutoClose auto_close_f(f);
+            unsigned char *p = static_cast<unsigned char *>(mci_.address());
+            unsigned char *end = p + mci_.size();
+            for (; p < end; ++p)
+            {
+                if (fprintf(f, ".byte %d\n", *p) < 0)
+                {
+                    fail("Error writing to 'as' pipe");
+                }
+            }
+            if (auto_close_f.close() != 0) // pclose(); non-zero means failure
+            {
+                fail_errno_or("Error closing 'as' pipe");
+            }
+        }
+        // NOTE(review): presumably rewinds fd so 'objdump' starts at offset 0.
+        if (lseek(fd, 0, SEEK_SET) == static_cast<off_t>(-1))
+        {
+            fail_errno_or("Error seeking on temporary file");
+        }
+
+        std::stringstream objdump_command;
+        // As far as I can tell, there's no guarantee how mci_.address() [a
+        // pointer type] will be represented in the stringstream, but in
+        // practice this code is not very portable anyway and this is the least
+        // of our worries...
+        objdump_command << "objdump --adjust-vma=" << 
+                           mci_.address() << " -d /dev/fd/" << fd << " 2>&1";
+        FILE *g = popen(objdump_command.str().c_str(), "r");
+        if (g == 0)
+        {
+            fail_errno_or("popen() failed (for 'objdump')");
+        }
+        PopenAutoClose auto_close_g(g);
+        // Collect everything 'objdump' prints (stderr included, via 2>&1).
+        std::stringstream code;
+        char buffer[1024];
+        size_t bytes_read;
+        while ((bytes_read = fread(buffer, 1, sizeof(buffer), g)) > 0)
+        {
+            code << std::string(buffer, bytes_read);
+        }
+        if (ferror(g))
+        {
+            fail("Error reading from 'objdump' pipe");
+        }
+        if (auto_close_g.close() != 0)
+        {
+            fail_errno_or("Error closing 'objdump' pipe");
+        }
+        if (auto_close_fd.close() != 0)
+        {
+            fail_errno_or("Error closing temporary file");
+        }
+
+        return code.str();
+    }
+    catch (std::exception &e)
+    {
+        // Dumping out the generated machine code is decidedly not critical, so
+        // we don't allow the exception to propagate.
+        return std::string("Unable to dump machine code: ") + e.what();
+    }
+}
+
+void Function::fail(const std::string &error) const
+{
+    throw std::runtime_error(error); // dump_machine_code() catches this
+}
+
+namespace
+{
+    // strerror_r() exists in two versions: XSI (returns int and fills in the
+    // buffer) and GNU (returns a pointer to the message). Overloading on the
+    // result type means whichever one the platform declares will compile.
+    inline const char *strerror_text(int result, const char *buffer)
+    {
+        return (result == 0) ? buffer : 0;
+    }
+    inline const char *strerror_text(const char *result, const char *)
+    {
+        return result;
+    }
+}
+
+// Throws (via fail()) with 'error' as the message; if errno is set, the
+// system's description of it is appended so both pieces of context survive.
+void Function::fail_errno_or(const std::string &error) const
+{
+    if (errno == 0)
+    {
+        fail(error); // always throws
+    }
+    char buffer[1024];
+    const char *reason = 
+        strerror_text(strerror_r(errno, buffer, sizeof(buffer)), buffer);
+    fail((reason != 0) ? (error + ": " + reason) : error);
+}
+
+#endif
diff --git a/Function.h b/Function.h
new file mode 100644
index 0000000..63fd6e8
--- /dev/null
+++ b/Function.h
@@ -0,0 +1,112 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef FUNCTION_H
+#define FUNCTION_H
+
+#include <boost/shared_ptr.hpp>
+#include <boost/utility.hpp>
+#include "llvm/CodeGen/MachineCodeInfo.h"
+#include "llvm/IR/Value.h"
+
+#include "AddressSet.h"
+#include "FunctionBuilder.h"
+#include "lib6502.h"
+
+struct LLVMStuff;
+
+class Function : boost::noncopyable // product of FunctionBuilder::build()
+{
+public:
+    Function(M6502 *mpu, uint16_t address, const AddressSet &code_range, 
+             const AddressSet &optimistic_writes, 
+             llvm::Function *llvm_function);
+    ~Function();
+    // Read-only accessors for the function's address and its address sets.
+    uint16_t address() const
+    {
+        return address_;
+    }
+
+    const AddressSet &code_range() const
+    {
+        return code_range_;
+    }
+
+    const AddressSet &optimistic_writes() const
+    {
+        return optimistic_writes_;
+    }
+    // Run the compiled code; unusual results go to handle_complex_result().
+    void execute() const
+    {
+        FunctionBuilder::Result result = 
+            static_cast<FunctionBuilder::Result>((*jitted_function_)());
+        if (result != FunctionBuilder::result_control_transfer_direct)
+        {
+            handle_complex_result(result);
+        }
+    }
+    // Logging (LOG) builds only: text stored here is reported by dump_all().
+    #ifdef LOG
+        void set_disassembly(const std::string &s)
+        {
+            disassembly_ = s;
+        }
+
+        void set_unoptimised_ir(const std::string &s)
+        {
+            unoptimised_ir_ = s;
+        }
+
+        void set_optimised_ir(const std::string &s)
+        {
+            optimised_ir_ = s;
+        }
+
+        std::string dump_all() const;
+
+        std::string dump_machine_code() const;
+    #endif
+
+private:
+    void handle_complex_result(FunctionBuilder::Result result) const;
+
+    #ifdef LOG
+        void fail(const std::string &error) const;
+        void fail_errno_or(const std::string &error) const;
+    #endif
+
+    M6502 *mpu_;
+    LLVMStuff &llvm_stuff_;
+    uint16_t address_;
+    AddressSet code_range_;
+    AddressSet optimistic_writes_;
+    llvm::Function *llvm_function_;
+    llvm::MachineCodeInfo mci_; // address() and size() of the host code
+    // Compiled entry point; execute() casts its int result to a Result.
+    typedef int (*JitFunction)();
+    JitFunction jitted_function_;
+    
+    #ifdef LOG
+        std::string disassembly_;
+        std::string unoptimised_ir_;
+        std::string optimised_ir_;
+    #endif
+};
+
+#endif
diff --git a/FunctionBuilder.cpp b/FunctionBuilder.cpp
new file mode 100644
index 0000000..9d7efb8
--- /dev/null
+++ b/FunctionBuilder.cpp
@@ -0,0 +1,3571 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "FunctionBuilder.h"
+
+// Throughout this file we must be careful to avoid incorrect wrap-around
+// handling; for example, it's wrong to do memory[pc + 2] because if pc is
+// 0xffff this will access off the end of memory. We must always use uint16_t
+// intermediate values to get the right wrapping behaviour. Similar
+// considerations apply when using zero-page addressing; we must ensure we wrap
+// around at 0xff.
+
+#include "config.h"
+
+#include <algorithm>
+#include <assert.h>
+#include <iomanip>
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/JIT.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/TypeBuilder.h"
+
+#if defined HAVE_LLVM_ANALYSIS_VERIFIER_H
+    #include "llvm/Analysis/Verifier.h"
+#elif defined HAVE_LLVM_IR_VERIFIER_H
+    #include "llvm/IR/Verifier.h"
+#else
+    #error Need LLVM Verifier.h
+#endif
+
+#include "llvm/PassManager.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <sstream>
+
+#include "AddressRange.h"
+#include "const.h"
+#include "Function.h"
+#include "LLVMStuff.h"
+#include "M6502Internal.h"
+#include "Registers.h"
+#include "util.h"
+
+
+
+namespace llvm
+{
+    // M6502 is created here as an opaque (body-less) named struct.
+    template<bool xcompile> class TypeBuilder<M6502, xcompile>
+    {
+    public:
+        static StructType *get(LLVMContext &context)
+        {
+            static StructType *t = StructType::create(context, "M6502");
+            return t;
+        }
+    };
+
+    // LLVM's view of Registers; element order fixes the struct GEP indices.
+    template<bool xcompile> class TypeBuilder<Registers, xcompile>
+    {
+    public:
+        static StructType *get(LLVMContext &context)
+        {
+            static StructType *t = StructType::create("Registers",
+                TypeBuilder<types::i<8>, xcompile>::get(context), // a
+                TypeBuilder<types::i<8>, xcompile>::get(context), // x
+                TypeBuilder<types::i<8>, xcompile>::get(context), // y
+                TypeBuilder<types::i<8>, xcompile>::get(context), // s
+                TypeBuilder<JitBool    , xcompile>::get(context), // flag_n
+                TypeBuilder<JitBool    , xcompile>::get(context), // flag_v
+                TypeBuilder<JitBool    , xcompile>::get(context), // flag_d
+                TypeBuilder<JitBool    , xcompile>::get(context), // flag_i
+                TypeBuilder<JitBool    , xcompile>::get(context), // flag_z
+                TypeBuilder<JitBool    , xcompile>::get(context), // flag_c
+                TypeBuilder<types::i<16>, xcompile>::get(context), // pc
+                TypeBuilder<types::i<16>, xcompile>::get(context), // addr
+                TypeBuilder<types::i<8>, xcompile>::get(context), // data
+                static_cast<Type *>(0)); // sentinel must be a pointer
+            return t;
+        }
+    };
+}
+
+namespace
+{
+    const std::string hex_prefix("&");
+
+    // Returns true if at least one address within 'bounds' has a non-null
+    // entry in 'callbacks', i.e. some callback could be triggered by an access
+    // somewhere in that range.
+    bool callback_in_bounds(const M6502_Callback *callbacks, 
+                            const AddressRange &bounds)
+    {
+        AddressRange::const_iterator it = bounds.begin();
+        while ((it != bounds.end()) && (callbacks[*it] == 0))
+        {
+            ++it;
+        }
+        return it != bounds.end();
+    }
+}
+
+
+
+// BoundedAddress contains an llvm::Value of type i16 which refers to
+// an address in the emulated memory. It additionally contains a range of
+// possible addresses which the llvm::Value can evaluate to (derived from the
+// addressing mode which created it). This is used to optimise the generated
+// code.
+
+class FunctionBuilder::BoundedAddress
+{
+public:
+    // Wrap 'addr' with the widest possible bounds. This is always safe, but
+    // it reduces the potential for optimisation, so prefer the bounded
+    // constructor wherever the range is known.
+    BoundedAddress(FunctionBuilder &fb, llvm::Value *addr);
+
+    // Wrap 'addr' together with the range of addresses it can evaluate to.
+    BoundedAddress(FunctionBuilder &fb, llvm::Value *addr, 
+                   const AddressRange &bounds);
+
+    llvm::Value *addr() const
+    {
+        return addr_;
+    }
+
+    const AddressRange &bounds() const
+    {
+        return bounds_;
+    }
+
+    // Writes the bounds as a half-open hex interval, e.g. [0x0000, 0x0100).
+    friend
+    std::ostream &operator<<(std::ostream &s, const BoundedAddress &ba)
+    {
+        std::stringstream text;
+        text << "[0x" << std::hex << std::setfill('0') << std::setw(4) << 
+                ba.bounds_.range_begin() << ", 0x" << std::setw(4) << 
+                ba.bounds_.range_end() << ")";
+        return s << text.str();
+    }
+
+private:
+    llvm::Value *addr_;
+    AddressRange bounds_;
+};
+
+FunctionBuilder::BoundedAddress::BoundedAddress(FunctionBuilder &fb,
+                                                llvm::Value *addr)
+: addr_(addr), bounds_(0, memory_size) // the widest possible bounds
+{
+    assert(fb.i16_type_ == addr->getType()); // addresses must be i16
+}
+
+FunctionBuilder::BoundedAddress::BoundedAddress(
+    FunctionBuilder &fb, llvm::Value *addr, const AddressRange &bounds)
+: addr_(addr), bounds_(bounds)
+{
+    assert(addr->getType() == fb.i16_type_);
+    // In debug builds, also check that 'addr' really lies within 'bounds'.
+#ifndef NDEBUG
+    llvm::ConstantInt *addr_ci = llvm::dyn_cast<llvm::ConstantInt>(addr);
+    if (addr_ci != 0)
+    {
+        // Constant address: check now that 'bounds' is exactly that address.
+        uint16_t addr16 = addr_ci->getLimitedValue();
+        assert(addr16 == bounds.range_begin());
+        assert(addr16 == (bounds.range_end() - 1));
+    }
+    else
+    {
+        // We can't verify the claimed bounds at compile time, so generate code 
+        // to check at runtime.
+
+        llvm::BasicBlock *bounds_maybe_ok_block = 
+            llvm::BasicBlock::Create(fb.context_, "bounds_maybe_ok_block", 
+                                     fb.llvm_function_);
+        llvm::BasicBlock *bounds_not_ok_block = 
+            llvm::BasicBlock::Create(fb.context_, "bounds_not_ok");
+        llvm::BasicBlock *bounds_ok_block = 
+            llvm::BasicBlock::Create(fb.context_, "bounds_ok");
+        // A range_end() beyond memory_size marks a range which wraps around.
+        if (bounds.range_end() <= memory_size)
+        {
+            TRACE("Generating bounds check code for non-wrapped case");
+            llvm::Value *lower_bound_ok = 
+                fb.builder_.CreateICmpUGE(
+                    addr, fb.constant_u16(bounds.range_begin()));
+            fb.builder_.CreateCondBr(lower_bound_ok, bounds_maybe_ok_block, 
+                                     bounds_not_ok_block);
+            fb.builder_.SetInsertPoint(bounds_maybe_ok_block);
+            llvm::Value *upper_bound_ok = 
+                fb.builder_.CreateICmpULE(
+                    addr, fb.constant_u16(bounds.range_end() - 1));
+            fb.builder_.CreateCondBr(upper_bound_ok, bounds_ok_block, 
+                                     bounds_not_ok_block);
+        }
+        else
+        {
+            TRACE("Generating bounds check code for wrapped case");
+            llvm::Value *in_upper_range = 
+                fb.builder_.CreateICmpUGE(
+                    addr, fb.constant_u16(bounds.range_begin()));
+            fb.builder_.CreateCondBr(in_upper_range, bounds_ok_block, 
+                                     bounds_maybe_ok_block);
+            fb.builder_.SetInsertPoint(bounds_maybe_ok_block);
+            // We want to truncate bounds.range_end() - 1 to 16 bits here.
+            llvm::Value *in_lower_range = 
+                fb.builder_.CreateICmpULE(
+                    addr, fb.constant_u16(bounds.range_end() - 1));
+            fb.builder_.CreateCondBr(in_lower_range, bounds_ok_block, 
+                                     bounds_not_ok_block);
+        }
+        // Failure path; presumably returns result_invalid_bounds to the caller.
+        fb.llvm_function_->getBasicBlockList().push_back(bounds_not_ok_block);
+        fb.builder_.SetInsertPoint(bounds_not_ok_block);
+        fb.return_invalid_bounds();
+        // Code emitted after this constructor continues in bounds_ok_block.
+        fb.llvm_function_->getBasicBlockList().push_back(bounds_ok_block);
+        fb.builder_.SetInsertPoint(bounds_ok_block);
+    }
+#endif
+}
+
+
+
+FunctionBuilder::FunctionBuilder(
+    M6502 *mpu, const uint8_t *ct_memory, JitBool *code_at_address, 
+    uint16_t address)
+: built_(false),
+  mpu_(mpu),
+  code_at_address_(code_at_address),
+  address_(address),
+  ct_memory_(ct_memory),
+  callbacks_(*(mpu->callbacks)),
+  instructions_(0),
+  max_instructions_(std::max(1, mpu->internal->max_instructions_)), // >= 1
+  context_(llvm::getGlobalContext()),
+  native_int_type_(llvm::TypeBuilder<int, false>::get(context_)),
+  callback_type_(llvm::TypeBuilder<M6502_Callback, false>::get(context_)),
+  i1_type_(llvm::TypeBuilder<llvm::types::i<1>, false>::get(context_)),
+  i8_type_(llvm::TypeBuilder<llvm::types::i<8>, false>::get(context_)),
+  i16_type_(llvm::TypeBuilder<llvm::types::i<16>, false>::get(context_)),
+  i32_type_(llvm::TypeBuilder<llvm::types::i<32>, false>::get(context_)),
+  i64_type_(llvm::TypeBuilder<llvm::types::i<64>, false>::get(context_)),
+  jit_bool_type_(llvm::TypeBuilder<JitBool, false>::get(context_)),
+  builder_(mpu_->internal->llvm_stuff_.builder_),
+  address_block_(),
+  code_generated_for_address_()
+{
+    llvm::FunctionType *ft = llvm::TypeBuilder<int(), false>::get(context_);
+    std::stringstream name; // the LLVM function is named "x" + hex address
+    name << "x" << std::hex << std::setw(4) << std::setfill('0') << address_;
+    llvm_function_ = llvm::Function::Create(
+        ft, llvm::Function::PrivateLinkage, name.str(), 
+        mpu_->internal->llvm_stuff_.module_.get());
+    // Everything emitted by this constructor goes into a "prologue" block.
+    llvm::BasicBlock *BB = 
+        llvm::BasicBlock::Create(context_, "prologue", llvm_function_);
+    builder_.SetInsertPoint(BB);
+    // NOTE(review): constant_ptr() presumably bakes host pointers into the IR.
+    mpu_llvm_ = constant_ptr(mpu, "mpu");
+    code_at_address_llvm_ = constant_ptr(code_at_address, "code_at_address");
+    registers_ = constant_ptr(&(mpu->internal->registers_), "registers");
+    read_callbacks_ = constant_ptr(callbacks_.read, "read_callbacks");
+    write_callbacks_ = constant_ptr(callbacks_.write, "write_callbacks");
+    call_callbacks_ = constant_ptr(callbacks_.call, "call_callbacks");
+    memory_base_ = constant_ptr(mpu->memory, "memory");
+    // Stack slot for the value which build()'s epilogue loads and returns.
+    function_result_ = 
+        builder_.CreateAlloca(native_int_type_, 0, "function_result");
+
+    // Function prologue: Copy the registers from Registers into local
+    // variables for use. The epilogue will reverse this process before the
+    // function returns for registers which actually get modified. (The
+    // LLVM optimiser is then able to remove loads which would just load
+    // unused values.)
+    initialise_i8_reg(a_     , 0, "a");
+    initialise_i8_reg(x_     , 1, "x");
+    initialise_i8_reg(y_     , 2, "y");
+    initialise_i8_reg(s_     , 3, "s");
+    initialise_jb_reg(flag_n_, 4, "flag_n");
+    initialise_jb_reg(flag_v_, 5, "flag_v");
+    initialise_jb_reg(flag_d_, 6, "flag_d");
+    initialise_jb_reg(flag_i_, 7, "flag_i");
+    initialise_jb_reg(flag_z_, 8, "flag_z");
+    initialise_jb_reg(flag_c_, 9, "flag_c");
+    // pc (structure index 10) has no Register; build() always stores it back.
+    pc_     = builder_.CreateAlloca(i16_type_, 0, "pc");
+    builder_.CreateStore(
+        builder_.CreateLoad(
+            builder_.CreateStructGEP(registers_, 10), false, "pc"), 
+        pc_);
+
+    // Temporary variable used when invoking read callbacks; no need to
+    // initialise.
+    read_callback_result_ = 
+        builder_.CreateAlloca(i8_type_, 0, "read_callback_result");
+
+    // Temporary variables for ADC/SBC implementation; no need to initialise.
+    p_tmp_ = builder_.CreateAlloca(i8_type_, 0, "p_tmp");
+    l_tmp_ = builder_.CreateAlloca(i8_type_, 0, "l_tmp");
+    s_tmp_ = builder_.CreateAlloca(i16_type_, 0, "s_tmp");
+    t_tmp_ = builder_.CreateAlloca(i16_type_, 0, "t_tmp");
+    // Created detached; build() appends it to the function at the very end.
+    epilogue_ = llvm::BasicBlock::Create(context_, "epilogue");
+}
+
+// The Register objects are initialised using these functions instead of
+// constructors mainly because we need a builder_ with an associated BasicBlock
+// to initialise a Register, and we don't have that when the FunctionBuilder
+// object is first constructed.
+
+void FunctionBuilder::initialise_i8_reg(
+    Register &r, int structure_index, const std::string &name)
+{
+    // Give the register a stack slot seeded with its value from Registers.
+    llvm::Value *slot = builder_.CreateAlloca(i8_type_, 0, name);
+    llvm::Value *field = builder_.CreateStructGEP(registers_, structure_index);
+    llvm::Value *initial = builder_.CreateLoad(field, false, name);
+    builder_.CreateStore(initial, slot);
+    r.v_ = slot;
+    r.modified_ = false;
+}
+
+void FunctionBuilder::initialise_jb_reg(
+    Register &r, int structure_index, const std::string &name)
+{
+    // Give the flag a JitBool stack slot seeded with its value from Registers.
+    llvm::Value *slot = builder_.CreateAlloca(jit_bool_type_, 0, name);
+    llvm::Value *field = builder_.CreateStructGEP(registers_, structure_index);
+    llvm::Value *initial = builder_.CreateLoad(field, false, name);
+    builder_.CreateStore(initial, slot);
+    r.v_ = slot;
+    r.modified_ = false;
+}
+
+// Creates the basic block for 'addr' (named "l" plus the address as four hex
+// digits) unless it already exists.
+void FunctionBuilder::ensure_address_block_created(uint16_t addr)
+{
+    if (address_block_[addr] != 0) { return; }
+    std::stringstream label;
+    label << "l" << std::hex << std::setw(4) << std::setfill('0') << addr;
+    address_block_[addr] = 
+        llvm::BasicBlock::Create(context_, label.str(), llvm_function_);
+}
+
+boost::shared_ptr<Function> FunctionBuilder::build()
+{
+    // Translates the 6502 code starting at address_ into llvm_function_,
+    // optimises it and wraps the result in a Function object. This can't be
+    // invoked twice on the same FunctionBuilder object (attempts to insert
+    // into 'epilogue_' crash, presumably because it's been used to generate
+    // code already); there is no reason to do that, but we assert that it
+    // doesn't happen anyway.
+    assert(!built_);
+
+    // While it doesn't strictly matter, the fact that pending_ is a std::set
+    // means it will internally sort the addresses. This makes it more likely
+    // that multiple backward jumps will only result in one stretch of code
+    // being produced, since the furthest jump backwards will be JITted first.
+    pending_.insert(address_);
+    while (!pending_.empty())
+    {
+        // We take addresses to JIT at from pending_ to start with, and when
+        // there's no "better" address...
+        uint16_t ct_pc = *(pending_.begin());
+
+        // ... but if we can continue JITting where we left off, we prefer
+        // to do that. Since each block of code emitted by build_at() is
+        // independent, this doesn't alter the behaviour of the generated
+        // code, but it avoids gratuitous discontinuities in the generated
+        // code compared with the source machine code.
+        do
+        {
+            pending_.erase(ct_pc);
+            uint16_t new_ct_pc = build_at(ct_pc);
+            if (new_ct_pc == ct_pc)
+            {
+                // build_at() did no work.
+            }
+            else if (new_ct_pc > ct_pc)
+            {
+                code_range_.insert(AddressRange(ct_pc, new_ct_pc));
+            }
+            else
+            {
+                // PC wrapped around during the translation.
+                uint32_t range_end = new_ct_pc;
+                range_end += memory_size;
+                code_range_.insert(AddressRange(ct_pc, range_end));
+            }
+            ct_pc = new_ct_pc;
+        }
+        while (pending_.find(ct_pc) != pending_.end());
+    }
+
+    LLVMStuff &llvm_stuff = mpu_->internal->llvm_stuff_;
+    llvm::FunctionPassManager fpm(llvm_stuff.module_.get());
+    // How the data layout is supplied depends on the LLVM version configured.
+#ifdef HAVE_LLVM_DATA_LAYOUT_PASS
+    fpm.add(new llvm::DataLayoutPass(llvm_stuff.module_.get()));
+#else
+    fpm.add(
+        new llvm::DataLayout(*llvm_stuff.execution_engine_->getDataLayout()));
+#endif
+    fpm.add(llvm::createBasicAliasAnalysisPass());
+    fpm.add(llvm::createPromoteMemoryToRegisterPass());
+    fpm.add(llvm::createInstructionCombiningPass());
+    fpm.add(llvm::createReassociatePass());
+    fpm.add(llvm::createGVNPass());
+    fpm.add(llvm::createCFGSimplificationPass());
+    fpm.doInitialization();
+
+    // We could have passed llvm_function_ to BasicBlock::Create() earlier
+    // and then we wouldn't need to do this push_back() here, but doing
+    // this means the epilogue appears at the end of the IR. It makes no
+    // functional difference but it seems slightly more logical to read.
+    llvm_function_->getBasicBlockList().push_back(epilogue_);
+    // Epilogue: store back modified registers (pc always), return the result.
+    builder_.SetInsertPoint(epilogue_);
+    if (a_.modified_)
+    {
+        builder_.CreateStore(
+            builder_.CreateLoad(a_.v_), 
+            builder_.CreateStructGEP(registers_, 0));
+    }
+    if (x_.modified_)
+    {
+        builder_.CreateStore(
+            builder_.CreateLoad(x_.v_), 
+            builder_.CreateStructGEP(registers_, 1));
+    }
+    if (y_.modified_)
+    {
+        builder_.CreateStore(
+            builder_.CreateLoad(y_.v_), 
+            builder_.CreateStructGEP(registers_, 2));
+    }
+    if (s_.modified_)
+    {
+        builder_.CreateStore(
+            builder_.CreateLoad(s_.v_), 
+            builder_.CreateStructGEP(registers_, 3));
+    }
+    if (flag_n_.modified_)
+    {
+        builder_.CreateStore(
+            register_load(flag_n_), 
+            builder_.CreateStructGEP(registers_, 4));
+    }
+    if (flag_v_.modified_)
+    {
+        builder_.CreateStore(
+            register_load(flag_v_), 
+            builder_.CreateStructGEP(registers_, 5));
+    }
+    if (flag_d_.modified_)
+    {
+        builder_.CreateStore(
+            register_load(flag_d_), 
+            builder_.CreateStructGEP(registers_, 6));
+    }
+    if (flag_i_.modified_)
+    {
+        builder_.CreateStore(
+            register_load(flag_i_), 
+            builder_.CreateStructGEP(registers_, 7));
+    }
+    if (flag_z_.modified_)
+    {
+        builder_.CreateStore(
+            register_load(flag_z_), 
+            builder_.CreateStructGEP(registers_, 8));
+    }
+    if (flag_c_.modified_)
+    {
+        builder_.CreateStore(
+            register_load(flag_c_), 
+            builder_.CreateStructGEP(registers_, 9));
+    }
+    builder_.CreateStore(
+        builder_.CreateLoad(pc_), 
+        builder_.CreateStructGEP(registers_, 10));
+
+    builder_.CreateRet(builder_.CreateLoad(function_result_));
+
+    #ifdef LOG
+        std::string unoptimised_ir;
+        {
+            llvm::raw_string_ostream s(unoptimised_ir);
+            llvm_function_->print(s);
+            s.str(); // flushes the stream into unoptimised_ir
+        }
+    #endif
+    llvm::verifyFunction(*llvm_function_);
+    // NOTE(review): the result of verifyFunction() above is ignored - confirm.
+    fpm.run(*llvm_function_);
+    #ifdef LOG
+        std::string optimised_ir;
+        {
+            llvm::raw_string_ostream s(optimised_ir);
+            llvm_function_->print(s);
+            s.str(); // flushes the stream into optimised_ir
+        }
+    #endif
+
+    boost::shared_ptr<Function> f(
+        new Function(mpu_, address_, code_range_, optimistic_writes_, 
+                     llvm_function_));
+    #ifdef LOG
+        f->set_disassembly(disassembly_.str());
+        f->set_unoptimised_ir(unoptimised_ir);
+        f->set_optimised_ir(optimised_ir);
+    #endif
+
+    built_ = true;
+    return f;
+}
+
+// This translates a linear stream of 6502 instructions into LLVM IR. The
+// generation stops either when we've translated enough 6502 instructions
+// or when we hit an instruction which unconditionally transfers control
+// elsewhere. Branch targets found during the translation are added to pending_
+// for further consideration; at a minimum, address_block_[] entries with
+// associated code to transfer control to those addresses must be generated
+// for each of these before terminating the build process for the function.
+//
+// The address of the first byte not translated is returned.
+uint16_t FunctionBuilder::build_at(uint16_t ct_pc)
+{
+    TRACE("Translating linear stream of instructions at 0x" << std::hex <<
+          std::setfill('0') << std::setw(4) << ct_pc);
+
+    const uint16_t original_ct_pc = ct_pc;
+    // If we already translated this stretch of code, we don't need to do
+    // anything at all.
+    if (code_generated_for_address_[ct_pc])
+    {
+        TRACE("Already translated this linear stream");
+        return ct_pc;
+    }
+
+    while (true)
+    {
+        TRACE("Translating at 0x" << std::hex << std::setfill('0') << 
+              std::setw(4) << ct_pc << ", opcode 0x" << std::setw(2) <<
+              static_cast<int>(ct_memory_[ct_pc]));
+
+        const uint16_t this_opcode_at = ct_pc;
+
+        if (code_generated_for_address_[ct_pc])
+        {
+            // We already translated this instruction, so we can stop
+            // translating and just jump there. Since this is just linear
+            // flow of control from the perspective of the 6502 code, this
+            // cannot trigger a call callback.
+            TRACE("Already translated this instruction");
+            if (builder_.GetInsertBlock()->getTerminator() == 0)
+            {
+                control_transfer_to(constant_u16(ct_pc), opcode_implicit);
+            }
+            break;
+        }
+
+        // Each instruction forms its own basic block (since we build up the
+        // IR as we go, we can't know where we might want to branch into,
+        // so we cannot merge multiple instructions into a single basic
+        // block). Basic blocks must end with a terminator, so if there isn't
+        // already a terminator at the end of the previous instruction's basic
+        // block, we insert an unconditional branch to this instruction's
+        // basic block. If there is already a terminator, we stop translating
+        // this stream of instructions unless this is the first instruction
+        // in this linear sequence; this way we avoid generating unreachable
+        // code if the previous instruction (for example) returned some kind
+        // of status code to our caller. (If the following instruction is
+        // reachable in some other way, it will be translated separately -
+        // as the first instruction in a linear sequence - because it will
+        // be present in pending.)
+        bool insert_block_has_terminator = 
+            (builder_.GetInsertBlock()->getTerminator() != 0);
+        if (insert_block_has_terminator && (ct_pc != original_ct_pc))
+        {
+            TRACE("Not translating as not first instruction in linear stream "
+                  "and previous instruction's basic block has a terminator");
+            break;
+        }
+        ensure_address_block_created(ct_pc);
+        if (!insert_block_has_terminator)
+        {
+            builder_.CreateBr(address_block_[ct_pc]);
+        }
+        builder_.SetInsertPoint(address_block_[ct_pc]);
+
+        // Note that we only set this flag for the opcode byte, not the
+        // whole length of the instruction. Apart from being easiest,
+        // this is actually correct: instructions can be interleaved, so
+        // an operand byte may also be entered as an opcode. For example,
+        // someone might write LDA #<opcode for LDA #>:STA <opcode for RTS>
+        // or something weird like that.
+        code_generated_for_address_[ct_pc] = true;
+
+        if (instructions_ >= max_instructions_)
+        {
+            TRACE("Translated maximum number of instructions");
+            // We must *not* use control_transfer_to() here; it would see
+            // that we have set code_generated_for_address_ and generate a
+            // branch to here, i.e. an infinite loop. It is correct that we
+            // have set code_generated_for_address_ since we must set that
+            // if we generate a corresponding address_block entry and we must
+            // do that so that any branches to this address can be resolved.
+            return_control_transfer_direct(constant_u16(ct_pc));
+            break;
+        }
+        ++instructions_;
+
+        uint8_t opcode = ct_memory_[ct_pc];
+        if (opcode == opcode_brk)
+        {
+            disassemble1(ct_pc, "BRK");
+
+            llvm::Value *new_pc_low = memory_read(abs(0xfffe));
+            llvm::Value *new_pc_high = memory_read(abs(0xffff));
+            llvm::Value *new_pc = create_u16(new_pc_low, new_pc_high);
+
+            // Because BRK pushes three bytes onto the stack, we leave it to
+            // our caller (by returning result_brk) to check whether any code
+            // living on the stack has been modified, so we use push*raw()
+            // here. (We don't support optimistic writes; BRK isn't
+            // performance critical so there's no payoff for the extra
+            // complexity.)
+ 
+            uint16_t pc_to_stack = this_opcode_at + 2;
+            push_u16_raw(pc_to_stack);
+
+            llvm::Value *p = flag_byte();
+            p = builder_.CreateOr(p, constant_u8(flagB | flagX));
+            push_u8_raw(p);
+
+            register_store(constant_jb(jit_bool_true), flag_i_);
+            register_store(constant_jb(jit_bool_false), flag_d_);
+
+            return_brk(new_pc);
+        }
+        else if (opcode == 0x01)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ORA (", operand, ",X)");
+            ora(memory_read(
+                zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x02)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x03)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x04)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "TSB ", operand);
+            memory_op(&FunctionBuilder::tsb, zp(operand), ct_pc);
+        }
+        else if (opcode == 0x05)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ORA ", operand);
+            ora(memory_read(zp(operand)));
+        }
+        else if (opcode == 0x06)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ASL ", operand);
+            memory_op(&FunctionBuilder::asl, zp(operand), ct_pc);
+        }
+        else if (opcode == 0x07)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x08)
+        {
+            disassemble1(ct_pc, "PHP");
+
+            llvm::Value *p = flag_byte();
+            p = builder_.CreateOr(p, constant_u8(flagB | flagX));
+            push_u8(p, ct_pc);
+        }
+        else if (opcode == 0x09)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ORA #", operand);
+            ora(constant_u8(operand));
+        }
+        else if (opcode == 0x0a)
+        {
+            disassemble1(ct_pc, "ASL A");
+            register_op(&FunctionBuilder::asl, a_);
+        }
+        else if (opcode == 0x0b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x0c)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "TSB ", operand);
+            memory_op(&FunctionBuilder::tsb, abs(operand), ct_pc);
+        }
+        else if (opcode == 0x0d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ORA ", operand);
+            ora(memory_read(abs(operand)));
+        }
+        else if (opcode == 0x0e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ASL ", operand);
+            memory_op(&FunctionBuilder::asl, abs(operand), ct_pc);
+        }
+        else if (opcode == 0x0f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bpl)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BPL ", target);
+            pending_.insert(target);
+            branch(flag_n_, false, target);
+        }
+        else if (opcode == 0x11)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ORA (", operand, "),Y");
+            ora(memory_read(
+                zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0x12)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ORA (", operand, ")");
+            ora(memory_read(
+                zp_post_index(constant_u8(operand), constant_u8(0))));
+        }
+        else if (opcode == 0x13)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x14)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "TRB ", operand);
+            memory_op(&FunctionBuilder::trb, zp(operand), ct_pc);
+        }
+        else if (opcode == 0x15)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ORA ", operand, ",X");
+            ora(memory_read(zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x16)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ASL ", operand, ",X");
+            memory_op(&FunctionBuilder::asl, 
+                      zp_index(constant_u8(operand), register_load(x_)), ct_pc);
+        }
+        else if (opcode == 0x17)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x18)
+        {
+            disassemble1(ct_pc, "CLC");
+            register_store(constant_jb(jit_bool_false), flag_c_);
+        }
+        else if (opcode == 0x19)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ORA ", operand, ",Y");
+            ora(memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0x1a)
+        {
+            disassemble1(ct_pc, "INC A");
+            register_op(&FunctionBuilder::inc, a_);
+        }
+        else if (opcode == 0x1b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x1c)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "TRB ", operand);
+            memory_op(&FunctionBuilder::trb, abs(operand), ct_pc);
+        }
+        else if (opcode == 0x1d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ORA ", operand, ",X");
+            ora(memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0x1e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ASL ", operand, ",X");
+            memory_op(
+                &FunctionBuilder::asl, 
+                abs_index(constant_u16(operand), register_load(x_)), 
+                ct_pc);
+        }
+        else if (opcode == 0x1f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_jsr)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "JSR ", operand);
+            uint16_t mangled_return_addr = ct_pc - 1;
+
+            // We are pushing two bytes onto the stack here and possibly
+            // requiring our caller to handle the control transfer, so the
+            // standard mechanisms for handling writes to code and control
+            // transfer aren't enough. control_transfer_to() contains special
+            // logic for JSR and we just use push_u16_raw() here.
+            push_u16_raw(mangled_return_addr);
+
+            // We generally want to translate the subroutine code into
+            // this function, so control_transfer_to() can perform the
+            // control transfer with a simple branch. However, if there is
+            // a call callback, control_transfer_to() will have to arrange
+            // a control transfer via the generated function's caller. It
+            // would be strictly harmless for us to translate the subroutine
+            // code anyway, as it will just never be executed, but it is
+            // both pointless and makes the generated IR less readable (it
+            // has a superficially buggy appearance, since it will show a
+            // translation of possibly junk code at the callback address
+            // which may never actually execute).
+            bool is_call_callback = (callbacks_.call[operand] != 0);
+            if (!is_call_callback)
+            {
+                pending_.insert(operand);
+
+                // We can predict that the RTS in the subroutine we are
+                // about to call will return to the immediately following
+                // instruction.  (This is not guaranteed; the subroutine
+                // might fiddle with the stack. If that happens the "code"
+                // at ct_pc might be junk, but that's an acceptable risk;
+                // we will translate it but it will never be executed, and
+                // any stream of bytes can be translated even if the code
+                // is nonsense.)
+                pending_.insert(ct_pc);
+                predicted_rts_targets_[operand].insert(ct_pc);
+            }
+
+            control_transfer_to(constant_u16(operand), opcode);
+        }
+        else if (opcode == 0x21)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "AND (", operand, ",X)");
+            And(memory_read(
+                zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x22)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x23)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x24)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "BIT ", operand);
+            bit(memory_read(zp(operand)));
+        }
+        else if (opcode == 0x25)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "AND ", operand);
+            And(memory_read(zp(operand)));
+        }
+        else if (opcode == 0x26)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ROL ", operand);
+            memory_op(&FunctionBuilder::rol, zp(operand), ct_pc);
+        }
+        else if (opcode == 0x27)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x28)
+        {
+            disassemble1(ct_pc, "PLP");
+            pop_flags();
+        }
+        else if (opcode == 0x29)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "AND #", operand);
+            And(constant_u8(operand));
+        }
+        else if (opcode == 0x2a)
+        {
+            disassemble1(ct_pc, "ROL A");
+            register_op(&FunctionBuilder::rol, a_);
+        }
+        else if (opcode == 0x2b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x2c)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "BIT ", operand);
+            bit(memory_read(abs(operand)));
+        }
+        else if (opcode == 0x2d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "AND ", operand);
+            And(memory_read(abs(operand)));
+        }
+        else if (opcode == 0x2e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ROL ", operand);
+            memory_op(&FunctionBuilder::rol, abs(operand), ct_pc);
+        }
+        else if (opcode == 0x2f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bmi)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BMI ", target);
+            pending_.insert(target);
+            branch(flag_n_, true, target);
+        }
+        else if (opcode == 0x31)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "AND (", operand, "),Y");
+            And(memory_read(
+                zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0x32)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "AND (", operand, ")");
+            And(memory_read(
+                zp_post_index(constant_u8(operand), constant_u8(0))));
+        }
+        else if (opcode == 0x33)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x34)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "BIT ", operand, ",X");
+            bit(memory_read(zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x35)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "AND ", operand, ",X");
+            And(memory_read(zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x36)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ROL ", operand, ",X");
+            memory_op(&FunctionBuilder::rol, 
+                      zp_index(constant_u8(operand), register_load(x_)), ct_pc);
+        }
+        else if (opcode == 0x37)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x38)
+        {
+            disassemble1(ct_pc, "SEC");
+            register_store(constant_jb(jit_bool_true), flag_c_);
+        }
+        else if (opcode == 0x39)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "AND ", operand, ",Y");
+            And(memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0x3a)
+        {
+            disassemble1(ct_pc, "DEC A");
+            register_op(&FunctionBuilder::dec, a_);
+        }
+        else if (opcode == 0x3b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x3c)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "BIT ", operand, ",X");
+            bit(memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0x3d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "AND ", operand, ",X");
+            And(memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0x3e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ROL ", operand, ",X");
+            memory_op(
+                &FunctionBuilder::rol, 
+                abs_index(constant_u16(operand), register_load(x_)), 
+                ct_pc);
+        }
+        else if (opcode == 0x3f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_rti)
+        {
+            disassemble1(ct_pc, "RTI");
+            pop_flags();
+            llvm::Value *new_pc = pop_u16();
+            control_transfer_to(new_pc, opcode);
+        }
+        else if (opcode == 0x41)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "EOR (", operand, ",X)");
+            eor(memory_read(
+                zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x42)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x43)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x44)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x45)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "EOR ", operand);
+            eor(memory_read(zp(operand)));
+        }
+        else if (opcode == 0x46)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LSR ", operand);
+            memory_op(&FunctionBuilder::lsr, zp(operand), ct_pc);
+        }
+        else if (opcode == 0x47)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x48)
+        {
+            disassemble1(ct_pc, "PHA");
+            push_u8(register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x49)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "EOR #", operand);
+            eor(constant_u8(operand));
+        }
+        else if (opcode == 0x4a)
+        {
+            disassemble1(ct_pc, "LSR A");
+            register_op(&FunctionBuilder::lsr, a_);
+        }
+        else if (opcode == 0x4b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_jmp_abs)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "JMP ", operand);
+            pending_.insert(operand);
+            control_transfer_to(constant_u16(operand), opcode);
+        }
+        else if (opcode == 0x4d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "EOR ", operand);
+            eor(memory_read(abs(operand)));
+        }
+        else if (opcode == 0x4e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LSR ", operand);
+            memory_op(&FunctionBuilder::lsr, abs(operand), ct_pc);
+        }
+        else if (opcode == 0x4f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bvc)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BVC ", target);
+            pending_.insert(target);
+            branch(flag_v_, false, target);
+        }
+        else if (opcode == 0x51)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "EOR (", operand, "),Y");
+            eor(memory_read(
+                zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0x52)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "EOR (", operand, ")");
+            eor(memory_read(
+                zp_post_index(constant_u8(operand), constant_u8(0))));
+        }
+        else if (opcode == 0x53)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x54)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x55)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "EOR ", operand, ",X");
+            eor(memory_read(zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x56)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LSR ", operand, ",X");
+            memory_op(&FunctionBuilder::lsr, 
+                      zp_index(constant_u8(operand), register_load(x_)), ct_pc);
+        }
+        else if (opcode == 0x57)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x58)
+        {
+            disassemble1(ct_pc, "CLI");
+            register_store(constant_jb(jit_bool_false), flag_i_);
+        }
+        else if (opcode == 0x59)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "EOR ", operand, ",Y");
+            eor(memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0x5a)
+        {
+            disassemble1(ct_pc, "PHY");
+            push_u8(register_load(y_), ct_pc);
+        }
+        else if (opcode == 0x5b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x5c)
+        {
+            illegal_instruction(ct_pc, 3);
+        }
+        else if (opcode == 0x5d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "EOR ", operand, ",X");
+            eor(memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0x5e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LSR ", operand, ",X");
+            memory_op(
+                &FunctionBuilder::lsr, 
+                abs_index(constant_u16(operand), register_load(x_)), 
+                ct_pc);
+        }
+        else if (opcode == 0x5f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_rts)
+        {
+            disassemble1(ct_pc, "RTS");
+            llvm::Value *new_pc = check_predicted_rts(original_ct_pc);
+            control_transfer_to(new_pc, opcode);
+        }
+        else if (opcode == 0x61)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ADC (", operand, ",X)");
+            adc(memory_read(
+                zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x62)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x63)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x64)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STZ ", operand);
+            memory_write(zp(operand), constant_u8(0), ct_pc);
+        }
+        else if (opcode == 0x65)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ADC ", operand);
+            adc(memory_read(zp(operand)));
+        }
+        else if (opcode == 0x66)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ROR ", operand);
+            memory_op(&FunctionBuilder::ror, zp(operand), ct_pc);
+        }
+        else if (opcode == 0x67)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x68)
+        {
+            disassemble1(ct_pc, "PLA");
+            llvm::Value *data = pop_u8();
+            register_store(data, a_);
+            set_nz(data);
+        }
+        else if (opcode == 0x69)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ADC #", operand);
+            adc(constant_u8(operand));
+        }
+        else if (opcode == 0x6a)
+        {
+            disassemble1(ct_pc, "ROR A");
+            register_op(&FunctionBuilder::ror, a_);
+        }
+        else if (opcode == 0x6b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_jmp_ind_abs)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "JMP (", operand, ")");
+            llvm::Value *low_byte = memory_read_untrapped(abs(operand));
+            // We're emulating the 65C02 here so we don't wrap if operand
+            // is of the form &xxFF. (Unless xx is FF, of course.)
+            uint16_t high_byte_at = operand + 1;
+            llvm::Value *high_byte = memory_read_untrapped(abs(high_byte_at));
+            llvm::Value *new_pc = create_u16(low_byte, high_byte);
+            control_transfer_to(new_pc, opcode);
+        }
+        else if (opcode == 0x6d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ADC ", operand);
+            adc(memory_read(abs(operand)));
+        }
+        else if (opcode == 0x6e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ROR ", operand);
+            memory_op(&FunctionBuilder::ror, abs(operand), ct_pc);
+        }
+        else if (opcode == 0x6f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bvs)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BVS ", target);
+            pending_.insert(target);
+            branch(flag_v_, true, target);
+        }
+        else if (opcode == 0x71)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ADC (", operand, "),Y");
+            adc(memory_read(
+                zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0x72)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ADC (", operand, ")");
+            adc(memory_read(
+                zp_post_index(constant_u8(operand), constant_u8(0))));
+        }
+        else if (opcode == 0x73)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x74)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STZ ", operand, ",X");
+            memory_write(zp_index(constant_u8(operand), register_load(x_)), 
+                         constant_u8(0), ct_pc);
+        }
+        else if (opcode == 0x75)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ADC ", operand, ",X");
+            adc(memory_read(zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0x76)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "ROR ", operand, ",X");
+            memory_op(&FunctionBuilder::ror, 
+                      zp_index(constant_u8(operand), register_load(x_)), ct_pc);
+        }
+        else if (opcode == 0x77)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x78)
+        {
+            disassemble1(ct_pc, "SEI");
+            register_store(constant_jb(jit_bool_true), flag_i_);
+        }
+        else if (opcode == 0x79)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ADC ", operand, ",Y");
+            adc(memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0x7a)
+        {
+            disassemble1(ct_pc, "PLY");
+            llvm::Value *data = pop_u8();
+            register_store(data, y_);
+            set_nz(data);
+        }
+        else if (opcode == 0x7b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_jmp_indx_abs)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "JMP (", operand, ",X)");
+            llvm::Value *low_byte_at = 
+                builder_.CreateAdd(
+                    constant_u16(operand), 
+                    zext_i16(register_load(x_)));
+            llvm::Value *high_byte_at = 
+                builder_.CreateAdd(low_byte_at, constant_u16(1));
+            llvm::Value *low_byte = 
+                memory_read_untrapped(BoundedAddress(*this, low_byte_at));
+            llvm::Value *high_byte = 
+                memory_read_untrapped(BoundedAddress(*this, high_byte_at));
+            llvm::Value *new_pc = create_u16(low_byte, high_byte);
+            control_transfer_to(new_pc, opcode);
+        }
+        else if (opcode == 0x7d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ADC ", operand, ",X");
+            adc(memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0x7e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "ROR ", operand, ",X");
+            memory_op(
+                &FunctionBuilder::ror, 
+                abs_index(constant_u16(operand), register_load(x_)), 
+                ct_pc);
+        }
+        else if (opcode == 0x7f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bra)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BRA ", target);
+            pending_.insert(target);
+            control_transfer_to(constant_u16(target), opcode);
+        }
+        else if (opcode == 0x81)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STA (", operand, ",X)");
+            memory_write(zp_pre_index(constant_u8(operand), register_load(x_)), 
+                         register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x82)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0x83)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x84)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STY ", operand);
+            memory_write(zp(operand), register_load(y_), ct_pc);
+        }
+        else if (opcode == 0x85)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STA ", operand);
+            memory_write(zp(operand), register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x86)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STX ", operand);
+            memory_write(zp(operand), register_load(x_), ct_pc);
+        }
+        else if (opcode == 0x87)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x88)
+        {
+            disassemble1(ct_pc, "DEY");
+            register_op(&FunctionBuilder::dec, y_);
+        }
+        else if (opcode == 0x89)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "BIT #", operand);
+            // Note that unlike other BIT opcodes, this one only affects
+            // the Z flag.
+            llvm::Value *tmp = 
+                builder_.CreateAnd(register_load(a_), constant_u8(operand));
+            set_z(tmp);
+        }
+        else if (opcode == 0x8a)
+        {
+            disassemble1(ct_pc, "TXA");
+            transfer(x_, a_);
+        }
+        else if (opcode == 0x8b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x8c)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STY ", operand);
+            memory_write(abs(operand), register_load(y_), ct_pc);
+        }
+        else if (opcode == 0x8d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STA ", operand);
+            memory_write(abs(operand), register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x8e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STX ", operand);
+            memory_write(abs(operand), register_load(x_), ct_pc);
+        }
+        else if (opcode == 0x8f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bcc)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BCC ", target);
+            pending_.insert(target);
+            branch(flag_c_, false, target);
+        }
+        else if (opcode == 0x91)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STA (", operand, "),Y");
+            memory_write(zp_post_index(constant_u8(operand), register_load(y_)), 
+                         register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x92)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STA (", operand, ")");
+            memory_write(zp_post_index(constant_u8(operand), constant_u8(0)), 
+                         register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x93)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x94)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STY ", operand, ",X");
+            memory_write(zp_index(constant_u8(operand), register_load(x_)), 
+                         register_load(y_), ct_pc);
+        }
+        else if (opcode == 0x95)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STA ", operand, ",X");
+            memory_write(zp_index(constant_u8(operand), register_load(x_)), 
+                         register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x96)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "STX ", operand, ",Y");
+            memory_write(zp_index(constant_u8(operand), register_load(y_)), 
+                         register_load(x_), ct_pc);
+        }
+        else if (opcode == 0x97)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x98)
+        {
+            disassemble1(ct_pc, "TYA");
+            transfer(y_, a_);
+        }
+        else if (opcode == 0x99)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STA ", operand, ",Y");
+            memory_write(abs_index(constant_u16(operand), register_load(y_)), 
+                         register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x9a)
+        {
+            disassemble1(ct_pc, "TXS");
+            // We don't use transfer() even though we do for TSX; TXS doesn't
+            // set any flags.
+            register_store(register_load(x_), s_);
+        }
+        else if (opcode == 0x9b)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0x9c)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STZ ", operand);
+            memory_write(abs(operand), constant_u8(0), ct_pc);
+        }
+        else if (opcode == 0x9d)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STA ", operand, ",X");
+            memory_write(abs_index(constant_u16(operand), register_load(x_)), 
+                         register_load(a_), ct_pc);
+        }
+        else if (opcode == 0x9e)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "STZ ", operand, ",X");
+            memory_write(abs_index(constant_u16(operand), register_load(x_)), 
+                         constant_u8(0), ct_pc);
+        }
+        else if (opcode == 0x9f)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xa0)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDY #", operand);
+            ld(y_, constant_u8(operand));
+        }
+        else if (opcode == 0xa1)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDA (", operand, ",X)");
+            ld(a_, memory_read(
+                zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xa2)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDX #", operand);
+            ld(x_, constant_u8(operand));
+        }
+        else if (opcode == 0xa3)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xa4)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDY ", operand);
+            ld(y_, memory_read(zp(operand)));
+        }
+        else if (opcode == 0xa5)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDA ", operand);
+            ld(a_, memory_read(zp(operand)));
+        }
+        else if (opcode == 0xa6)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDX ", operand);
+            ld(x_, memory_read(zp(operand)));
+        }
+        else if (opcode == 0xa7)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xa8)
+        {
+            disassemble1(ct_pc, "TAY");
+            transfer(a_, y_);
+        }
+        else if (opcode == 0xa9)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDA #", operand);
+            ld(a_, constant_u8(operand));
+        }
+        else if (opcode == 0xaa)
+        {
+            disassemble1(ct_pc, "TAX");
+            transfer(a_, x_);
+        }
+        else if (opcode == 0xab)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xac)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDY ", operand);
+            ld(y_, memory_read(abs(operand)));
+        }
+        else if (opcode == 0xad)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDA ", operand);
+            ld(a_, memory_read(abs(operand)));
+        }
+        else if (opcode == 0xae)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDX ", operand);
+            ld(x_, memory_read(abs(operand)));
+        }
+        else if (opcode == 0xaf)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bcs)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BCS ", target);
+            pending_.insert(target);
+            branch(flag_c_, true, target);
+        }
+        else if (opcode == 0xb1)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDA (", operand, "),Y");
+            ld(a_, memory_read(
+                zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0xb2)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDA (", operand, ")");
+            ld(a_, memory_read(
+                zp_post_index(constant_u8(operand), constant_u8(0))));
+        }
+        else if (opcode == 0xb3)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xb4)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDY ", operand, ",X");
+            ld(y_, memory_read(
+                zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xb5)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDA ", operand, ",X");
+            ld(a_, memory_read(
+                zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xb6)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "LDX ", operand, ",Y");
+            ld(x_, memory_read(
+                zp_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0xb7)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xb8)
+        {
+            disassemble1(ct_pc, "CLV");
+            register_store(constant_jb(jit_bool_false), flag_v_);
+        }
+        else if (opcode == 0xb9)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDA ", operand, ",Y");
+            ld(a_, memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0xba)
+        {
+            disassemble1(ct_pc, "TSX");
+            transfer(s_, x_);
+        }
+        else if (opcode == 0xbb)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xbc)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDY ", operand, ",X");
+            ld(y_, memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0xbd)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDA ", operand, ",X");
+            ld(a_, memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0xbe)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "LDX ", operand, ",Y");
+            ld(x_, memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0xbf)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xc0)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CPY #", operand);
+            cmp(register_load(y_), constant_u8(operand));
+        }
+        else if (opcode == 0xc1)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CMP (", operand, ",X)");
+            cmp(register_load(a_), 
+                memory_read(
+                    zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xc2)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0xc3)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xc4)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CPY ", operand);
+            cmp(register_load(y_), memory_read(zp(operand)));
+        }
+        else if (opcode == 0xc5)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CMP ", operand);
+            cmp(register_load(a_), memory_read(zp(operand)));
+        }
+        else if (opcode == 0xc6)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "DEC ", operand);
+            memory_op(&FunctionBuilder::dec, zp(operand), ct_pc);
+        }
+        else if (opcode == 0xc7)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xc8)
+        {
+            disassemble1(ct_pc, "INY");
+            register_op(&FunctionBuilder::inc, y_);
+        }
+        else if (opcode == 0xc9)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CMP #", operand);
+            cmp(register_load(a_), constant_u8(operand));
+        }
+        else if (opcode == 0xca)
+        {
+            disassemble1(ct_pc, "DEX");
+            register_op(&FunctionBuilder::dec, x_);
+        }
+        else if (opcode == 0xcb)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xcc)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "CPY ", operand);
+            cmp(register_load(y_), memory_read(abs(operand)));
+        }
+        else if (opcode == 0xcd)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "CMP ", operand);
+            cmp(register_load(a_), memory_read(abs(operand)));
+        }
+        else if (opcode == 0xce)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "DEC ", operand);
+            memory_op(&FunctionBuilder::dec, abs(operand), ct_pc);
+        }
+        else if (opcode == 0xcf)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_bne)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BNE ", target);
+            pending_.insert(target);
+            branch(flag_z_, false, target);
+        }
+        else if (opcode == 0xd1)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CMP (", operand, "),Y");
+            cmp(register_load(a_), 
+                memory_read(
+                    zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0xd2)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CMP (", operand, ")");
+            cmp(register_load(a_), 
+                memory_read(
+                    zp_post_index(constant_u8(operand), constant_u8(0))));
+        } 
+        else if (opcode == 0xd3)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xd4)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0xd5)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CMP ", operand, ",X");
+            cmp(register_load(a_), 
+                memory_read(
+                    zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xd6)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "DEC ", operand, ",X");
+            memory_op(&FunctionBuilder::dec, 
+                      zp_index(constant_u8(operand), register_load(x_)), ct_pc);
+        }
+        else if (opcode == 0xd7)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xd8)
+        {
+            disassemble1(ct_pc, "CLD");
+            register_store(constant_jb(jit_bool_false), flag_d_);
+        }
+        else if (opcode == 0xd9)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "CMP ", operand, ",Y");
+            cmp(register_load(a_), 
+                memory_read(
+                    abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0xda)
+        {
+            disassemble1(ct_pc, "PHX");
+            push_u8(register_load(x_), ct_pc);
+        }
+        else if (opcode == 0xdb)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xdc)
+        {
+            illegal_instruction(ct_pc, 3);
+        }
+        else if (opcode == 0xdd)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "CMP ", operand, ",X");
+            cmp(register_load(a_), 
+                memory_read(
+                    abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0xde)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "DEC ", operand, ",X");
+            memory_op(
+                &FunctionBuilder::dec, 
+                abs_index(constant_u16(operand), register_load(x_)), 
+                ct_pc);
+        }
+        else if (opcode == 0xdf)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xe0)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CPX #", operand);
+            cmp(register_load(x_), constant_u8(operand));
+        }
+        else if (opcode == 0xe1)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "SBC (", operand, ",X)");
+            sbc(memory_read(
+                zp_pre_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xe2)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0xe3)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xe4)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "CPX ", operand);
+            cmp(register_load(x_), memory_read(zp(operand)));
+        }
+        else if (opcode == 0xe5)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "SBC ", operand);
+            sbc(memory_read(zp(operand)));
+        }
+        else if (opcode == 0xe6)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "INC ", operand);
+            memory_op(&FunctionBuilder::inc, zp(operand), ct_pc);
+        }
+        else if (opcode == 0xe7)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xe8)
+        {
+            disassemble1(ct_pc, "INX");
+            register_op(&FunctionBuilder::inc, x_);
+        }
+        else if (opcode == 0xe9)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "SBC #", operand);
+            sbc(constant_u8(operand));
+        }
+        else if (opcode == 0xea)
+        {
+            disassemble1(ct_pc, "NOP");
+        }
+        else if (opcode == 0xeb)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xec)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "CPX ", operand);
+            cmp(register_load(x_), memory_read(abs(operand)));
+        }
+        else if (opcode == 0xed)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "SBC ", operand);
+            sbc(memory_read(abs(operand)));
+        }
+        else if (opcode == 0xee)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "INC ", operand);
+            memory_op(&FunctionBuilder::inc, abs(operand), ct_pc);
+        }
+        else if (opcode == 0xef)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == opcode_beq)
+        {
+            uint16_t target;
+            disassemble_branch(ct_pc, "BEQ ", target);
+            pending_.insert(target);
+            branch(flag_z_, true, target);
+        }
+        else if (opcode == 0xf1)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "SBC (", operand, "),Y");
+            sbc(memory_read(
+                zp_post_index(constant_u8(operand), register_load(y_))));
+        }
+        else if (opcode == 0xf2)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "SBC (", operand, ")");
+            sbc(memory_read(
+                zp_post_index(constant_u8(operand), constant_u8(0))));
+        }
+        else if (opcode == 0xf3)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xf4)
+        {
+            illegal_instruction(ct_pc, 2);
+        }
+        else if (opcode == 0xf5)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "SBC ", operand, ",X");
+            sbc(memory_read(zp_index(constant_u8(operand), register_load(x_))));
+        }
+        else if (opcode == 0xf6)
+        {
+            uint8_t operand;
+            disassemble2(ct_pc, "INC ", operand, ",X");
+            memory_op(&FunctionBuilder::inc, 
+                      zp_index(constant_u8(operand), register_load(x_)), ct_pc);
+        }
+        else if (opcode == 0xf7)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xf8)
+        {
+            disassemble1(ct_pc, "SED");
+            register_store(constant_jb(jit_bool_true), flag_d_);
+        }
+        else if (opcode == 0xf9)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "SBC ", operand, ",Y");
+            sbc(memory_read(
+                abs_index(constant_u16(operand), register_load(y_))));
+        }
+        else if (opcode == 0xfa)
+        {
+            disassemble1(ct_pc, "PLX");
+            llvm::Value *data = pop_u8();
+            register_store(data, x_);
+            set_nz(data);
+        }
+        else if (opcode == 0xfb)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else if (opcode == 0xfc)
+        {
+            illegal_instruction(ct_pc, 3);
+        }
+        else if (opcode == 0xfd)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "SBC ", operand, ",X");
+            sbc(memory_read(
+                abs_index(constant_u16(operand), register_load(x_))));
+        }
+        else if (opcode == 0xfe)
+        {
+            uint16_t operand;
+            disassemble3(ct_pc, "INC ", operand, ",X");
+            memory_op(
+                &FunctionBuilder::inc, 
+                abs_index(constant_u16(operand), register_load(x_)), 
+                ct_pc);
+        }
+        else if (opcode == 0xff)
+        {
+            illegal_instruction(ct_pc, 1);
+        }
+        else
+        {
+            CANT_HAPPEN("Unknown opcode 0x" << std::hex << opcode);
+        }
+    }
+
+    return ct_pc;
+}
+
+// Fetch the one-byte operand stored directly after the opcode found at
+// opcode_at; the operand address is computed with 16-bit wraparound.
+uint8_t FunctionBuilder::operand8(uint16_t opcode_at)
+{
+    const uint16_t operand_at = static_cast<uint16_t>(opcode_at + 1);
+    return ct_memory_[operand_at];
+}
+
+// Assemble the 16-bit little-endian operand stored in the two bytes after
+// the opcode found at opcode_at; each address wraps within 16 bits.
+uint16_t FunctionBuilder::operand16(uint16_t opcode_at)
+{
+    const uint8_t low = ct_memory_[static_cast<uint16_t>(opcode_at + 1)];
+    const uint8_t high = ct_memory_[static_cast<uint16_t>(opcode_at + 2)];
+    // high is promoted to int before the shift, so no bits are lost.
+    return static_cast<uint16_t>((high << 8) | low);
+}
+
+llvm::Value *FunctionBuilder::constant_i1(bool c)
+{   // Constant of i1_type_ with the value c.
+    return llvm::ConstantInt::get(i1_type_, c);
+}
+
+llvm::Value *FunctionBuilder::constant_u8(uint8_t c)
+{   // Constant of i8_type_ with the value c.
+    return llvm::ConstantInt::get(i8_type_, c);
+}
+
+llvm::Value *FunctionBuilder::constant_u16(uint16_t c)
+{   // Constant of i16_type_ with the value c.
+    return llvm::ConstantInt::get(i16_type_, c);
+}
+
+llvm::Value *FunctionBuilder::constant_u32(uint32_t c)
+{   // Constant of i32_type_ with the value c.
+    return llvm::ConstantInt::get(i32_type_, c);
+}
+
+llvm::Value *FunctionBuilder::constant_u64(uint64_t c)
+{   // Constant of i64_type_ with the value c.
+    return llvm::ConstantInt::get(i64_type_, c);
+}
+
+llvm::Value *FunctionBuilder::constant_i(int c)
+{   // Constant of native_int_type_ with the value c.
+    return llvm::ConstantInt::get(native_int_type_, c);
+}
+
+llvm::Value *FunctionBuilder::constant_jb(JitBool c)
+{   // Constant of jit_bool_type_ with the value c.
+    return llvm::ConstantInt::get(jit_bool_type_, c);
+}
+
+llvm::Value *FunctionBuilder::convert_i1_to_jb(llvm::Value *v)
+{   // Zero-extend an i1 to jit_bool_type_ (false -> 0, true -> 1).
+    assert(v->getType() == i1_type_);
+    return builder_.CreateZExt(v, jit_bool_type_);
+}
+
+llvm::Value *FunctionBuilder::convert_i8_to_jb(llvm::Value *v)
+{   // v is returned unchanged: any non-zero byte already reads as true.
+    assert(v->getType() == i8_type_);
+    return v;
+}
+
+llvm::Value *FunctionBuilder::convert_i16_to_jb(llvm::Value *v)
+{   // The JitBool is true exactly when the 16-bit value v is non-zero.
+    assert(v->getType() == i16_type_);
+    return convert_i1_to_jb(builder_.CreateICmpNE(v, constant_u16(0)));
+}
+
+// JitBool values should be tested via jit_bool_is_*() and not directly;
+// this is because they use a 0=false, non-0=true representation. It's not
+// correct to assume they are either 0 or 1.
+
+llvm::Value *FunctionBuilder::jit_bool_is_true(llvm::Value *v)
+{   // Yields an i1 that is true when the JitBool v is non-zero.
+    assert(v->getType() == jit_bool_type_);
+    return builder_.CreateICmpNE(v, constant_u8(0));
+}
+
+llvm::Value *FunctionBuilder::jit_bool_is_false(llvm::Value *v)
+{   // Yields an i1 that is true when the JitBool v is zero.
+    assert(v->getType() == jit_bool_type_);
+    return builder_.CreateICmpEQ(v, constant_u8(0));
+}
+
+llvm::Value *FunctionBuilder::convert_i1_to_i8(llvm::Value *v)
+{   // Zero-extend an i1 to i8_type_ (false -> 0, true -> 1).
+    assert(v->getType() == i1_type_);
+    return builder_.CreateZExt(v, i8_type_);
+}
+
+llvm::Value *FunctionBuilder::zext_i16(llvm::Value *v)
+{   // Zero-extend v to i16_type_.
+    return builder_.CreateZExt(v, i16_type_);
+}
+
+llvm::Value *FunctionBuilder::zext_i32(llvm::Value *v)
+{   // Zero-extend v to i32_type_.
+    return builder_.CreateZExt(v, i32_type_);
+}
+
+llvm::Value *FunctionBuilder::sext_i16(llvm::Value *v)
+{   // Sign-extend v to i16_type_.
+    return builder_.CreateSExt(v, i16_type_);
+}
+
+llvm::Value *FunctionBuilder::trunc_i8(llvm::Value *v)
+{   // Truncate v to i8_type_.
+    return builder_.CreateTrunc(v, i8_type_);
+}
+
+llvm::Value *FunctionBuilder::create_u16(
+    llvm::Value *low_byte, llvm::Value *high_byte)
+{   // Combine two bytes into the 16-bit value (high_byte << 8) | low_byte.
+    llvm::Value *low_u16 = zext_i16(low_byte);
+    llvm::Value *high_u16 = builder_.CreateShl(zext_i16(high_byte), 8);
+    return builder_.CreateOr(low_u16, high_u16);
+}
+
+llvm::Value *FunctionBuilder::register_load(const Register &r)
+{   // Emit a load through r.v_ and return the loaded value.
+    return builder_.CreateLoad(r.v_);
+}
+
+void FunctionBuilder::register_store(llvm::Value *v, Register &r)
+{   // Emit a store of v through r.v_ and flag the register as modified.
+    r.modified_ = true;
+    builder_.CreateStore(v, r.v_);
+}
+
+void FunctionBuilder::register_op(OpFn op, Register &r)
+{
+    // Read-modify-write: load r, run it through op, store the result back.
+    llvm::Value *const before = register_load(r);
+    register_store((this->*op)(before), r);
+}
+
+void FunctionBuilder::memory_op(
+    OpFn op, const BoundedAddress &ba, uint16_t next_opcode_at)
+{
+    // Read-modify-write on memory: read ba, apply op, write the result back.
+    llvm::Value *const before = memory_read(ba);
+    memory_write(ba, (this->*op)(before), next_opcode_at);
+}
+
+void FunctionBuilder::adc(llvm::Value *data)
+{
+    // Pick the binary or decimal ADC sequence at run time from the D flag.
+    llvm::BasicBlock *binary_path =
+        llvm::BasicBlock::Create(context_, "adc_binary", llvm_function_);
+    llvm::BasicBlock *decimal_path =
+        llvm::BasicBlock::Create(context_, "adc_decimal", llvm_function_);
+    llvm::BasicBlock *join =
+        llvm::BasicBlock::Create(context_, "done_adc", llvm_function_);
+    llvm::Value *binary_mode = jit_bool_is_false(register_load(flag_d_));
+    builder_.CreateCondBr(binary_mode, binary_path, decimal_path);
+    builder_.SetInsertPoint(binary_path);
+    adc_binary(data);
+    builder_.CreateBr(join);
+    builder_.SetInsertPoint(decimal_path);
+    adc_decimal(data);
+    builder_.CreateBr(join);
+    builder_.SetInsertPoint(join);
+}
+
+void FunctionBuilder::adc_binary(llvm::Value *data)
+{
+    // Binary-mode ADC: the sum is formed in 16 bits twice, zero-extended for
+    // the new A and carry, and sign-extended for the overflow test.
+    llvm::Value *carry_in = zext_i16(jit_bool_is_true(register_load(flag_c_)));
+
+    llvm::Value *acc_unsigned = zext_i16(register_load(a_));
+    llvm::Value *data_unsigned = zext_i16(data);
+    llvm::Value *unsigned_sum = builder_.CreateAdd(
+        builder_.CreateAdd(acc_unsigned, data_unsigned), carry_in);
+
+    llvm::Value *acc_signed = sext_i16(register_load(a_));
+    llvm::Value *data_signed = sext_i16(data);
+    llvm::Value *signed_sum = builder_.CreateAdd(
+        builder_.CreateAdd(acc_signed, data_signed), carry_in);
+
+    llvm::Value *result = trunc_i8(unsigned_sum);
+    register_store(result, a_);
+    set_nz(result);
+
+    // Carry out is bit 8 of the zero-extended sum.
+    llvm::Value *bit8 = builder_.CreateAnd(unsigned_sum, constant_u16(0x100));
+    register_store(convert_i16_to_jb(bit8), flag_c_);
+
+    // V = N (read back after set_nz) xor the sign of the signed sum.
+    llvm::Value *n_flag = jit_bool_is_true(register_load(flag_n_));
+    llvm::Value *truly_negative =
+        builder_.CreateICmpSLT(signed_sum, constant_u16(0));
+    llvm::Value *overflow = builder_.CreateXor(n_flag, truly_negative);
+    register_store(convert_i1_to_jb(overflow), flag_v_);
+}
+
+void FunctionBuilder::adc_decimal(llvm::Value *data)
+{
+    // Decimal-mode ADC. The algorithm is taken from
+    // http://www.6502.org/tutorials/decimal_mode.html
+    //
+    // Three temporaries are used while the sequence is emitted:
+    //   l_tmp_  the low-nibble sum, after its decimal adjustment
+    //   s_tmp_  the 16-bit sum built from zero-extended high nibbles; it
+    //           supplies the new A and the carry out
+    //   t_tmp_  the 16-bit sum built from sign-extended high nibbles; it
+    //           is only used to decide V
+    // l_tmp_ and s_tmp_ are rewritten inside conditionally executed blocks,
+    // so each use below reloads them.
+
+    // The incoming carry is read before A or any flag is modified.
+    llvm::Value *carry_in = jit_bool_is_true(register_load(flag_c_));
+
+    // l_tmp_ = (A & 0x0f) + (data & 0x0f) + carry
+    llvm::Value *a_low_nibble =
+        builder_.CreateAnd(register_load(a_), constant_u8(0x0f));
+    llvm::Value *data_low_nibble =
+        builder_.CreateAnd(data, constant_u8(0x0f));
+    llvm::Value *nibble_sum =
+        builder_.CreateAdd(a_low_nibble, data_low_nibble);
+    llvm::Value *carry_in_u8 = convert_i1_to_i8(carry_in);
+    builder_.CreateStore(
+        builder_.CreateAdd(nibble_sum, carry_in_u8), l_tmp_);
+
+    // if (l_tmp_ >= 0x0a) l_tmp_ = ((l_tmp_ + 0x06) & 0x0f) + 0x10
+    llvm::BasicBlock *fix_low_block =
+        llvm::BasicBlock::Create(context_, "adjust_l", llvm_function_);
+    llvm::BasicBlock *low_ready_block =
+        llvm::BasicBlock::Create(context_, "l_done", llvm_function_);
+    llvm::Value *low_needs_fix = builder_.CreateICmpUGE(
+        builder_.CreateLoad(l_tmp_), constant_u8(0x0a));
+    builder_.CreateCondBr(low_needs_fix, fix_low_block, low_ready_block);
+
+    builder_.SetInsertPoint(fix_low_block);
+    llvm::Value *low_plus_6 =
+        builder_.CreateAdd(builder_.CreateLoad(l_tmp_), constant_u8(0x06));
+    llvm::Value *low_wrapped =
+        builder_.CreateAnd(low_plus_6, constant_u8(0x0f));
+    llvm::Value *low_adjusted =
+        builder_.CreateAdd(low_wrapped, constant_u8(0x10));
+    builder_.CreateStore(low_adjusted, l_tmp_);
+    builder_.CreateBr(low_ready_block);
+
+    builder_.SetInsertPoint(low_ready_block);
+
+    // s_tmp_ = (A & 0xf0) + (data & 0xf0) + l_tmp_, with every term
+    // zero-extended to 16 bits so the sum can reach 0x100 and beyond.
+    llvm::Value *a_high_nibble =
+        builder_.CreateAnd(register_load(a_), constant_u8(0xf0));
+    llvm::Value *data_high_nibble =
+        builder_.CreateAnd(data, constant_u8(0xf0));
+    llvm::Value *high_sum_unsigned = builder_.CreateAdd(
+        zext_i16(a_high_nibble), zext_i16(data_high_nibble));
+    llvm::Value *low_for_s = zext_i16(builder_.CreateLoad(l_tmp_));
+    builder_.CreateStore(
+        builder_.CreateAdd(high_sum_unsigned, low_for_s), s_tmp_);
+
+    // if (s_tmp_ >= 0xa0) s_tmp_ += 0x60
+    llvm::BasicBlock *fix_sum_block =
+        llvm::BasicBlock::Create(context_, "adjust_s", llvm_function_);
+    llvm::BasicBlock *sum_ready_block =
+        llvm::BasicBlock::Create(context_, "s_done", llvm_function_);
+    llvm::Value *sum_needs_fix = builder_.CreateICmpUGE(
+        builder_.CreateLoad(s_tmp_), constant_u16(0xa0));
+    builder_.CreateCondBr(sum_needs_fix, fix_sum_block, sum_ready_block);
+
+    builder_.SetInsertPoint(fix_sum_block);
+    llvm::Value *sum_adjusted =
+        builder_.CreateAdd(builder_.CreateLoad(s_tmp_), constant_u16(0x60));
+    builder_.CreateStore(sum_adjusted, s_tmp_);
+    builder_.CreateBr(sum_ready_block);
+
+    builder_.SetInsertPoint(sum_ready_block);
+
+    // t_tmp_ = (A & 0xf0) + (data & 0xf0) + l_tmp_ again, but with the high
+    // nibbles sign-extended so the sum can be range-checked as signed.
+    llvm::Value *high_sum_signed = builder_.CreateAdd(
+        sext_i16(a_high_nibble), sext_i16(data_high_nibble));
+    llvm::Value *low_for_t = zext_i16(builder_.CreateLoad(l_tmp_));
+    builder_.CreateStore(
+        builder_.CreateAdd(high_sum_signed, low_for_t), t_tmp_);
+
+    // V is stored as true up front and cleared again only when t_tmp_ lies
+    // within -128..127.
+    llvm::BasicBlock *check_upper_block =
+        llvm::BasicBlock::Create(context_, "v_not_done", llvm_function_);
+    llvm::BasicBlock *clear_v_block =
+        llvm::BasicBlock::Create(context_, "v_false", llvm_function_);
+    llvm::BasicBlock *v_ready_block =
+        llvm::BasicBlock::Create(context_, "v_done", llvm_function_);
+    register_store(constant_jb(jit_bool_true), flag_v_);
+    llvm::Value *below_min = builder_.CreateICmpSLT(
+        builder_.CreateLoad(t_tmp_), constant_u16(-128));
+    builder_.CreateCondBr(below_min, v_ready_block, check_upper_block);
+
+    builder_.SetInsertPoint(check_upper_block);
+    llvm::Value *above_max = builder_.CreateICmpSGT(
+        builder_.CreateLoad(t_tmp_), constant_u16(127));
+    builder_.CreateCondBr(above_max, v_ready_block, clear_v_block);
+
+    builder_.SetInsertPoint(clear_v_block);
+    register_store(constant_jb(jit_bool_false), flag_v_);
+    builder_.CreateBr(v_ready_block);
+
+    builder_.SetInsertPoint(v_ready_block);
+
+    // The low byte of s_tmp_ becomes the new A, which is then handed to
+    // set_nz(); C is set when s_tmp_ is 0x100 or more.
+    register_store(trunc_i8(builder_.CreateLoad(s_tmp_)), a_);
+    set_nz(register_load(a_));
+    llvm::Value *carry_out = builder_.CreateICmpUGE(
+        builder_.CreateLoad(s_tmp_), constant_u16(0x100));
+    register_store(convert_i1_to_jb(carry_out), flag_c_);
+}
+
+void FunctionBuilder::And(llvm::Value *data)
+{   // A = A & data; the result is also passed to set_nz().
+    llvm::Value *const anded = builder_.CreateAnd(register_load(a_), data);
+    register_store(anded, a_);
+    set_nz(anded);
+}
+
+llvm::Value *FunctionBuilder::asl(llvm::Value *data)
+{   // Shift left by one; C is set from bit 7 of data before the shift.
+    llvm::Value *bit7 = builder_.CreateAnd(data, constant_u8(0x80));
+    register_store(convert_i8_to_jb(bit7), flag_c_);
+    llvm::Value *shifted = builder_.CreateShl(data, 1);
+    set_nz(shifted);
+    return shifted;
+}
+
+void FunctionBuilder::bit(llvm::Value *data)
+{   // N and V are set from bits 7 and 6 of data; set_z() sees A & data.
+    llvm::Value *bit7 = builder_.CreateAnd(data, constant_u8(0x80));
+    register_store(convert_i8_to_jb(bit7), flag_n_);
+    llvm::Value *bit6 = builder_.CreateAnd(data, constant_u8(0x40));
+    register_store(convert_i8_to_jb(bit6), flag_v_);
+    llvm::Value *a_and_data = builder_.CreateAnd(register_load(a_), data);
+    set_z(a_and_data);
+}
+
+void FunctionBuilder::branch(Register &flag, bool branch_if, uint16_t target)
+{
+    // Emit a conditional branch to the block for 'target', taken when the
+    // given flag's truth value equals branch_if; subsequent code is emitted
+    // into the fall-through ("branch_not_taken") block.
+    llvm::BasicBlock *fall_through =
+        llvm::BasicBlock::Create(context_, "branch_not_taken", llvm_function_);
+    ensure_address_block_created(target);
+    llvm::Value *flag_is_set = jit_bool_is_true(register_load(flag));
+
+    // The comparison always tests "flag set"; branch_if only decides which
+    // successor that outcome selects.
+    llvm::BasicBlock *taken = address_block_[target];
+    llvm::BasicBlock *when_set = branch_if ? taken : fall_through;
+    llvm::BasicBlock *when_clear = branch_if ? fall_through : taken;
+    builder_.CreateCondBr(flag_is_set, when_set, when_clear);
+    builder_.SetInsertPoint(fall_through);
+}
+
+// Generate code which compares 'r' with 'data'.
+//
+// N and Z are set from (r - data); flag_c_ is set when r >= data as an
+// unsigned comparison. Neither operand is modified.
+void FunctionBuilder::cmp(llvm::Value *r, llvm::Value *data)
+{
+    llvm::Value *difference = builder_.CreateSub(r, data);
+    set_nz(difference);
+
+    llvm::Value *no_borrow = builder_.CreateICmpUGE(r, data);
+    register_store(convert_i1_to_jb(no_borrow), flag_c_);
+}
+
+// Generate code which subtracts one from 'data', updates N and Z from the
+// result and returns the result for the caller to store.
+llvm::Value *FunctionBuilder::dec(llvm::Value *data)
+{
+    llvm::Value *one = constant_u8(1);
+    llvm::Value *decremented = builder_.CreateSub(data, one);
+    set_nz(decremented);
+    return decremented;
+}
+
+// Generate code which exclusive-ORs 'data' into the a_ register and then
+// updates the N and Z flags from the value stored.
+void FunctionBuilder::eor(llvm::Value *data)
+{
+    llvm::Value *accumulator = register_load(a_);
+    llvm::Value *xored = builder_.CreateXor(accumulator, data);
+    register_store(xored, a_);
+    set_nz(xored);
+}
+
+// Generate code which adds one to 'data', updates N and Z from the result
+// and returns the result for the caller to store.
+llvm::Value *FunctionBuilder::inc(llvm::Value *data)
+{
+    llvm::Value *one = constant_u8(1);
+    llvm::Value *incremented = builder_.CreateAdd(data, one);
+    set_nz(incremented);
+    return incremented;
+}
+
+// Generate code which stores 'data' into register 'r' and updates the N and
+// Z flags (via set_nz()) from 'data'.
+void FunctionBuilder::ld(Register &r, llvm::Value *data)
+{
+    register_store(data, r);
+    set_nz(data);
+}
+
+// Generate code for a one-bit logical right shift of 'data'.
+//
+// Bit 0 of the unshifted value is recorded in flag_c_; N and Z are updated
+// from the shifted value, which is returned so the caller can store it.
+llvm::Value *FunctionBuilder::lsr(llvm::Value *data)
+{
+    llvm::Value *bottom_bit = builder_.CreateAnd(data, constant_u8(0x1));
+    register_store(convert_i8_to_jb(bottom_bit), flag_c_);
+
+    llvm::Value *shifted = builder_.CreateLShr(data, 1);
+    set_nz(shifted);
+    return shifted;
+}
+
+// Generate code which bitwise-ORs 'data' into the a_ register and then
+// updates the N and Z flags from the value stored.
+void FunctionBuilder::ora(llvm::Value *data)
+{
+    llvm::Value *accumulator = register_load(a_);
+    llvm::Value *ored = builder_.CreateOr(accumulator, data);
+    register_store(ored, a_);
+    set_nz(ored);
+}
+
+// Generate code which pops one byte from the stack (via pop_u8()) and
+// distributes its bits into the individual flag registers: the flagN, flagV,
+// flagD, flagI, flagZ and flagC bits go to flag_n_, flag_v_, flag_d_,
+// flag_i_, flag_z_ and flag_c_ respectively. Other bits of the popped byte
+// are ignored.
+void FunctionBuilder::pop_flags()
+{
+    llvm::Value *p = pop_u8();
+
+    llvm::Value *n_bit = builder_.CreateAnd(p, constant_u8(flagN));
+    register_store(convert_i8_to_jb(n_bit), flag_n_);
+
+    llvm::Value *v_bit = builder_.CreateAnd(p, constant_u8(flagV));
+    register_store(convert_i8_to_jb(v_bit), flag_v_);
+
+    llvm::Value *d_bit = builder_.CreateAnd(p, constant_u8(flagD));
+    register_store(convert_i8_to_jb(d_bit), flag_d_);
+
+    llvm::Value *i_bit = builder_.CreateAnd(p, constant_u8(flagI));
+    register_store(convert_i8_to_jb(i_bit), flag_i_);
+
+    llvm::Value *z_bit = builder_.CreateAnd(p, constant_u8(flagZ));
+    register_store(convert_i8_to_jb(z_bit), flag_z_);
+
+    llvm::Value *c_bit = builder_.CreateAnd(p, constant_u8(flagC));
+    register_store(convert_i8_to_jb(c_bit), flag_c_);
+}
+
+// Generate code which pops one byte from the stack: s_ is incremented (as an
+// 8-bit value) and the byte at 'stack' + the new s_ is read without any read
+// callback check. Returns the byte read.
+llvm::Value *FunctionBuilder::pop_u8()
+{
+    llvm::Value *incremented_s = 
+        builder_.CreateAdd(register_load(s_), constant_u8(1));
+    register_store(incremented_s, s_);
+
+    const BoundedAddress &slot = abs_index(constant_u16(stack), incremented_s);
+    return memory_read_untrapped(slot);
+}
+
+
+// Generate code which pops a 16-bit value from the stack as two pop_u8()
+// calls: the first byte popped is the low byte and the second is the high
+// byte. Returns the combined value built by create_u16().
+llvm::Value *FunctionBuilder::pop_u16()
+{
+    llvm::Value *lo = pop_u8();
+    llvm::Value *hi = pop_u8();
+    return create_u16(lo, hi);
+}
+
+// Generate code which pushes 'data' onto the stack using memory_write_raw(),
+// i.e. with no check for write callbacks or for modification of JITted code.
+// The byte is written at 'stack' + s_ and s_ is then decremented.
+void FunctionBuilder::push_u8_raw(llvm::Value *data)
+{
+    const BoundedAddress &slot = 
+        abs_index(constant_u16(stack), register_load(s_));
+    memory_write_raw(slot, data);
+
+    llvm::Value *decremented_s = 
+        builder_.CreateSub(register_load(s_), constant_u8(1));
+    register_store(decremented_s, s_);
+}
+
+// Generate code which pushes the compile-time constant 'u' onto the stack
+// using push_u8_raw(): the high byte is pushed first, then the low byte.
+void FunctionBuilder::push_u16_raw(uint16_t u)
+{
+    push_u8_raw(constant_u8(static_cast<uint8_t>(u >> 8)));
+    push_u8_raw(constant_u8(static_cast<uint8_t>(u & 0xff)));
+}
+
+// Push the given value onto the stack: the byte is written at 'stack' + the
+// current s_, and s_ is decremented.
+//
+// Note that because the push may invalidate code living on the stack,
+// this may generate instructions which return control to the caller to
+// deal with that, so within a given opcode being translated, no further
+// code-generating functions should be called after this.
+void FunctionBuilder::push_u8(llvm::Value *data, uint16_t next_opcode_at)
+{
+    llvm::Value *s_before = register_load(s_);
+    const BoundedAddress &slot = abs_index(constant_u16(stack), s_before);
+
+    // Update s_ before the write, since the write may generate a return
+    // to the caller.
+    llvm::Value *s_after = builder_.CreateSub(s_before, constant_u8(1));
+    register_store(s_after, s_);
+
+    memory_write_untrapped(slot, data, next_opcode_at);
+}
+
+// Generate code which rotates 'data' left by one bit through the carry flag:
+// the old flag_c_ becomes bit 0 of the result and bit 7 of 'data' becomes the
+// new flag_c_. N and Z are updated from the result, which is returned.
+llvm::Value *FunctionBuilder::rol(llvm::Value *data)
+{
+    // Capture the incoming carry before flag_c_ is overwritten below.
+    llvm::Value *carry_was_set = jit_bool_is_true(register_load(flag_c_));
+    llvm::Value *carry_in = convert_i1_to_i8(carry_was_set);
+
+    llvm::Value *carry_out = builder_.CreateAnd(data, constant_u8(0x80));
+    register_store(convert_i8_to_jb(carry_out), flag_c_);
+
+    llvm::Value *shifted = builder_.CreateShl(data, 1);
+    llvm::Value *rotated = builder_.CreateOr(shifted, carry_in);
+    set_nz(rotated);
+    return rotated;
+}
+
+// Generate code which rotates 'data' right by one bit through the carry
+// flag: the old flag_c_ becomes bit 7 of the result and bit 0 of 'data'
+// becomes the new flag_c_. N and Z are updated from the result, which is
+// returned.
+llvm::Value *FunctionBuilder::ror(llvm::Value *data)
+{
+    // Capture the incoming carry before flag_c_ is overwritten below.
+    llvm::Value *carry_was_set = jit_bool_is_true(register_load(flag_c_));
+    llvm::Value *carry_in = convert_i1_to_i8(carry_was_set);
+    llvm::Value *carry_in_bit7 = builder_.CreateShl(carry_in, 7);
+
+    llvm::Value *carry_out = builder_.CreateAnd(data, constant_u8(0x1));
+    register_store(convert_i8_to_jb(carry_out), flag_c_);
+
+    llvm::Value *shifted = builder_.CreateLShr(data, 1);
+    llvm::Value *rotated = builder_.CreateOr(shifted, carry_in_bit7);
+    set_nz(rotated);
+    return rotated;
+}
+
+// Generate code which subtracts 'data' (with borrow) from a_.
+//
+// The generated code tests flag_d_ at run time: if it is clear the code from
+// sbc_binary() is executed, otherwise the code from sbc_decimal(). Both paths
+// rejoin at a "done_sbc" block, which is where the builder's insertion point
+// is left on return.
+void FunctionBuilder::sbc(llvm::Value *data)
+{
+    // The join block is created detached and only added to the function
+    // once the conditional branch has been generated.
+    llvm::BasicBlock *join_block = 
+        llvm::BasicBlock::Create(context_, "done_sbc");
+    llvm::BasicBlock *binary_block = 
+        llvm::BasicBlock::Create(context_, "sbc_binary", llvm_function_);
+    llvm::BasicBlock *decimal_block = 
+        llvm::BasicBlock::Create(context_, "sbc_decimal", llvm_function_);
+
+    llvm::Value *binary_mode = jit_bool_is_false(register_load(flag_d_));
+    builder_.CreateCondBr(binary_mode, binary_block, decimal_block);
+    llvm_function_->getBasicBlockList().push_back(join_block);
+
+    builder_.SetInsertPoint(binary_block);
+    sbc_binary(data);
+    builder_.CreateBr(join_block);
+
+    builder_.SetInsertPoint(decimal_block);
+    sbc_decimal(data);
+    builder_.CreateBr(join_block);
+
+    builder_.SetInsertPoint(join_block);
+}
+
+// Generate code for the binary-mode half of sbc(): a_ = a_ - data - borrow,
+// where borrow is 1 when flag_c_ is clear and 0 otherwise.
+//
+// flag_v_ is set by sbc_overflow(), N and Z are set from the new a_, and
+// flag_c_ is set when bit 8 of the 16-bit difference is clear.
+void FunctionBuilder::sbc_binary(llvm::Value *data)
+{
+    llvm::Value *carry_clear = jit_bool_is_false(register_load(flag_c_));
+    llvm::Value *borrow_16 = zext_i16(carry_clear);
+
+    // The overflow calculation reads the old value of a_, so it has to be
+    // generated before the store to a_ below.
+    sbc_overflow(data, borrow_16);
+
+    llvm::Value *minuend = zext_i16(register_load(a_));
+    llvm::Value *subtrahend = zext_i16(data);
+    llvm::Value *partial = builder_.CreateSub(minuend, subtrahend);
+    llvm::Value *difference = builder_.CreateSub(partial, borrow_16);
+
+    llvm::Value *low_byte = trunc_i8(difference);
+    register_store(low_byte, a_);
+    set_nz(low_byte);
+
+    llvm::Value *bit8 = builder_.CreateAnd(difference, constant_u16(0x100));
+    llvm::Value *bit8_clear = builder_.CreateICmpEQ(bit8, constant_u16(0));
+    register_store(convert_i1_to_jb(bit8_clear), flag_c_);
+}
+
+// Generate code for the decimal-mode half of sbc() (the path taken when
+// flag_d_ is not clear): subtract 'data' and the borrow (1 when flag_c_ is
+// clear) from a_, then apply corrections to the result.
+//
+// flag_v_ is set by sbc_overflow(), flag_c_ is derived from the uncorrected
+// 16-bit difference, and a_, N and Z come from the corrected result.
+//
+// This creates additional basic blocks; the builder's insertion point is
+// left in the last of them ("done_s_adjust2") on return.
+void FunctionBuilder::sbc_decimal(llvm::Value *data)
+{
+    llvm::Value *borrow = jit_bool_is_false(register_load(flag_c_));
+    llvm::Value *borrow_16 = zext_i16(borrow);
+
+    sbc_overflow(data, borrow_16); // must do this before modifying a
+
+    // l_tmp_ = (a & 0x0f) - (data & 0x0f) - borrow, i.e. the difference of
+    // the low nibbles, computed as an 8-bit value.
+    builder_.CreateStore(
+        builder_.CreateSub(
+            builder_.CreateSub(
+                builder_.CreateAnd(
+                    register_load(a_),
+                    constant_u8(0x0f)),
+                builder_.CreateAnd(
+                    data,
+                    constant_u8(0x0f))),
+            convert_i1_to_i8(borrow)),
+        l_tmp_);
+
+    // s_tmp_ = a - data - borrow, computed in 16 bits from the zero-extended
+    // operands.
+    builder_.CreateStore(
+        builder_.CreateSub(
+            builder_.CreateSub(
+                zext_i16(register_load(a_)),
+                zext_i16(data)),
+            borrow_16),
+        s_tmp_);
+
+    // The carry flag is set when bit 8 of the uncorrected 16-bit difference
+    // is clear.
+    register_store(
+        convert_i1_to_jb(
+            builder_.CreateICmpEQ(
+                builder_.CreateAnd(
+                    builder_.CreateLoad(s_tmp_),
+                    constant_u16(0x100)),
+                constant_u16(0))),
+        flag_c_);
+
+    // First correction: if the 16-bit difference is negative (as a signed
+    // value), subtract 0x60 from it.
+    llvm::BasicBlock *s_adjust1_block = 
+        llvm::BasicBlock::Create(context_, "s_adjust1", llvm_function_);
+    llvm::BasicBlock *done_s_adjust1_block = 
+        llvm::BasicBlock::Create(context_, "done_s_adjust1", llvm_function_);
+    builder_.CreateCondBr(
+        builder_.CreateICmpSLT(
+            builder_.CreateLoad(s_tmp_),
+            constant_u16(0)),
+        s_adjust1_block,
+        done_s_adjust1_block);
+
+    builder_.SetInsertPoint(s_adjust1_block);
+    builder_.CreateStore(
+        builder_.CreateSub(
+            builder_.CreateLoad(s_tmp_),
+            constant_u16(0x60)),
+        s_tmp_);
+    builder_.CreateBr(done_s_adjust1_block);
+
+    builder_.SetInsertPoint(done_s_adjust1_block);
+
+    // Second correction: if the low-nibble difference is negative (as a
+    // signed 8-bit value), subtract a further 0x06.
+    llvm::BasicBlock *s_adjust2_block = 
+        llvm::BasicBlock::Create(context_, "s_adjust2", llvm_function_);
+    llvm::BasicBlock *done_s_adjust2_block = 
+        llvm::BasicBlock::Create(context_, "done_s_adjust2", llvm_function_);
+    builder_.CreateCondBr(
+        builder_.CreateICmpSLT(
+            builder_.CreateLoad(l_tmp_),
+            constant_u8(0)),
+        s_adjust2_block,
+        done_s_adjust2_block);
+
+    builder_.SetInsertPoint(s_adjust2_block);
+    builder_.CreateStore(
+        builder_.CreateSub(
+            builder_.CreateLoad(s_tmp_),
+            constant_u16(0x06)),
+        s_tmp_);
+    builder_.CreateBr(done_s_adjust2_block);
+
+    // The low byte of the corrected difference becomes the new a_, and N and
+    // Z are set from it.
+    builder_.SetInsertPoint(done_s_adjust2_block);
+    register_store(trunc_i8(builder_.CreateLoad(s_tmp_)), a_);
+    set_nz(register_load(a_));
+}
+
+// Generate code which sets flag_v_ for the subtraction a_ - data - borrow_16.
+//
+// The subtraction is performed in 16 bits on the sign-extended operands.
+// flag_v_ is set when bit 7 of that result disagrees with the sign of the
+// 16-bit result. This reads the current a_, so it must be called before a_
+// is updated with the result of the subtraction.
+void FunctionBuilder::sbc_overflow(
+    llvm::Value *data, llvm::Value *borrow_16)
+{
+    llvm::Value *minuend = sext_i16(register_load(a_));
+    llvm::Value *subtrahend = sext_i16(data);
+    llvm::Value *partial = builder_.CreateSub(minuend, subtrahend);
+    llvm::Value *wide_result = builder_.CreateSub(partial, borrow_16);
+
+    // Sign as seen in the 8-bit result (bit 7)...
+    llvm::Value *bit7 = builder_.CreateAnd(wide_result, constant_u16(0x80));
+    llvm::Value *bit7_set = builder_.CreateICmpNE(bit7, constant_u16(0));
+
+    // ...versus the true sign of the 16-bit result.
+    llvm::Value *wide_negative = 
+        builder_.CreateICmpSLT(wide_result, constant_u16(0));
+
+    llvm::Value *signs_differ = builder_.CreateXor(bit7_set, wide_negative);
+    register_store(convert_i1_to_jb(signs_differ), flag_v_);
+}
+
+// Generate code which copies the value of register 'from' into register
+// 'to' and updates the N and Z flags from the value copied. The source is
+// read with a direct load of from.v_.
+void FunctionBuilder::transfer(
+    const Register &from, Register &to)
+{
+    llvm::Value *copied = builder_.CreateLoad(from.v_);
+    register_store(copied, to);
+    set_nz(copied);
+}
+
+// Generate code which sets flag_z_ from (data AND a_) and returns 'data'
+// with every bit that is set in a_ cleared. The caller is responsible for
+// storing the returned value.
+llvm::Value *FunctionBuilder::trb(llvm::Value *data)
+{
+    llvm::Value *common_bits = builder_.CreateAnd(data, register_load(a_));
+    set_z(common_bits);
+
+    // XOR with 0xff gives the bitwise complement of a_.
+    llvm::Value *inverted_a = 
+        builder_.CreateXor(register_load(a_), constant_u8(0xff));
+    return builder_.CreateAnd(data, inverted_a);
+}
+
+// Generate code which sets flag_z_ from (data AND a_) and returns 'data'
+// with every bit that is set in a_ also set. The caller is responsible for
+// storing the returned value.
+llvm::Value *FunctionBuilder::tsb(llvm::Value *data)
+{
+    llvm::Value *common_bits = builder_.CreateAnd(data, register_load(a_));
+    set_z(common_bits);
+
+    llvm::Value *accumulator = register_load(a_);
+    return builder_.CreateOr(data, accumulator);
+}
+
+// Generate code which sets flag_n_ from bit 7 of 'data' and flag_z_ (via
+// set_z()) according to whether 'data' is zero.
+void FunctionBuilder::set_nz(llvm::Value *data)
+{
+    llvm::Value *sign_bit = builder_.CreateAnd(data, 0x80);
+    register_store(convert_i8_to_jb(sign_bit), flag_n_);
+    set_z(data);
+}
+
+// Generate code which sets flag_z_ when 'data' compares equal to an 8-bit
+// zero and clears it otherwise.
+void FunctionBuilder::set_z(llvm::Value *data)
+{
+    llvm::Value *is_zero = builder_.CreateICmpEQ(data, constant_u8(0));
+    register_store(convert_i1_to_jb(is_zero), flag_z_);
+}
+
+// Generate code which packs the individual flag registers into a single
+// byte and returns that byte.
+//
+// The byte is accumulated in p_tmp_, starting from zero; flag_byte_bit()
+// ORs in the flagN, flagV, flagD, flagI, flagZ and flagC bits for whichever
+// of the corresponding flags are set. No other bits are set here.
+llvm::Value *FunctionBuilder::flag_byte()
+{
+    llvm::Value *no_bits = constant_u8(0);
+    builder_.CreateStore(no_bits, p_tmp_);
+
+    flag_byte_bit(flag_n_, flagN);
+    flag_byte_bit(flag_v_, flagV);
+    flag_byte_bit(flag_d_, flagD);
+    flag_byte_bit(flag_i_, flagI);
+    flag_byte_bit(flag_z_, flagZ);
+    flag_byte_bit(flag_c_, flagC);
+
+    llvm::Value *packed = builder_.CreateLoad(p_tmp_);
+    return packed;
+}
+
+// Helper for flag_byte(): generate code which ORs 'flag_bit' into p_tmp_ if
+// the flag held in 'flag_reg' is set, and leaves p_tmp_ alone otherwise.
+//
+// Two basic blocks are created; the builder's insertion point is left in
+// the "bit_done" block on return.
+void FunctionBuilder::flag_byte_bit(const Register &flag_reg, uint8_t flag_bit)
+{
+    llvm::BasicBlock *set_block = 
+        llvm::BasicBlock::Create(context_, "bit_set", llvm_function_);
+    llvm::BasicBlock *join_block = 
+        llvm::BasicBlock::Create(context_, "bit_done", llvm_function_);
+
+    llvm::Value *flag_is_set = jit_bool_is_true(register_load(flag_reg));
+    builder_.CreateCondBr(flag_is_set, set_block, join_block);
+
+    builder_.SetInsertPoint(set_block);
+    llvm::Value *current = builder_.CreateLoad(p_tmp_);
+    llvm::Value *updated = builder_.CreateOr(current, flag_bit);
+    builder_.CreateStore(updated, p_tmp_);
+    builder_.CreateBr(join_block);
+
+    builder_.SetInsertPoint(join_block);
+}
+
+// Handle an illegal opcode at ct_pc which occupies 'bytes' bytes (1, 2 or
+// 3; anything else is a CANT_HAPPEN error).
+//
+// The instruction is passed to disassemble1/2/3() with a description of the
+// form "illegal <hex opcode> "; ct_pc is handed to those functions by
+// reference (presumably so it is advanced past the instruction - confirm
+// against their definitions). If an illegal instruction callback is
+// registered for the opcode, code is generated to return to the caller via
+// return_illegal_instruction(); otherwise no code is generated.
+void FunctionBuilder::illegal_instruction(uint16_t &ct_pc, int bytes)
+{
+    uint16_t opcode_addr = ct_pc;
+    uint8_t opcode = ct_memory_[opcode_addr];
+
+    std::stringstream text;
+    text << "illegal " << hex_prefix << std::hex << std::setw(2) << 
+            std::setfill('0') << static_cast<int>(opcode) << " ";
+    const std::string description = text.str();
+
+    switch (bytes)
+    {
+        case 1:
+            disassemble1(ct_pc, description);
+            break;
+
+        case 2:
+        {
+            uint8_t ignored_operand;
+            disassemble2(ct_pc, description, ignored_operand);
+            break;
+        }
+
+        case 3:
+        {
+            uint16_t ignored_operand;
+            disassemble3(ct_pc, description, ignored_operand);
+            break;
+        }
+
+        default:
+            CANT_HAPPEN("Invalid byte count (ct_pc 0x" << std::hex << ct_pc << 
+                        ", " << std::dec << "bytes " << bytes << ")");
+    }
+
+    // Illegal instructions are defined on the 65C02 to be no-ops, so there
+    // is nothing to generate unless a callback wants to see this opcode.
+    if (callbacks_.illegal_instruction[opcode] != 0)
+    {
+        return_illegal_instruction(ct_pc, opcode_addr, opcode);
+    }
+}
+
+// Build a BoundedAddress for the compile-time constant 8-bit address 'addr',
+// bounded by AddressRange(addr).
+FunctionBuilder::BoundedAddress FunctionBuilder::zp(uint8_t addr)
+{
+    // The llvm::Value is still a u16 even though the address fits in 8
+    // bits. It probably makes no difference, but it seems logical because
+    // memory addresses "are" 16 bits, even if 8-bit ones are handled more
+    // efficiently on a real 6502.
+    llvm::Value *addr_value = constant_u16(addr);
+    return BoundedAddress(*this, addr_value, AddressRange(addr));
+}
+
+// Build a BoundedAddress for the compile-time constant 16-bit address
+// 'addr', bounded by AddressRange(addr).
+FunctionBuilder::BoundedAddress FunctionBuilder::abs(uint16_t addr)
+{
+    llvm::Value *addr_value = constant_u16(addr);
+    return BoundedAddress(*this, addr_value, AddressRange(addr));
+}
+
+// Build a BoundedAddress for 'abs' + 'index', where 'abs' is a 16-bit base
+// address and 'index' is an 8-bit value which is zero-extended before the
+// (16-bit) addition.
+//
+// 'abs' must be an llvm::ConstantInt (it is converted with llvm::cast); its
+// value gives the start of the bounds and that value plus 0x100 the end, as
+// passed to AddressRange.
+FunctionBuilder::BoundedAddress FunctionBuilder::abs_index(
+    llvm::Value *abs, llvm::Value *index)
+{
+    assert(abs->getType() == i16_type_);
+    assert(index->getType() == i8_type_);
+
+    const uint16_t first = 
+        llvm::cast<llvm::ConstantInt>(abs)->getLimitedValue();
+    // Computed in 32 bits so a base near the top of memory doesn't wrap.
+    const uint32_t past_last = static_cast<uint32_t>(first) + 0x100;
+
+    llvm::Value *effective = builder_.CreateAdd(abs, zext_i16(index));
+    return BoundedAddress(*this, effective, AddressRange(first, past_last));
+}
+
+// Build a BoundedAddress for 'zp' + 'index', where both are 8-bit values.
+// The addition is done in 8 bits before zero-extending to 16 bits, so the
+// resulting address wraps within 0x00-0xff; the bounds are
+// AddressRange(0, 0x100).
+FunctionBuilder::BoundedAddress FunctionBuilder::zp_index(
+    llvm::Value *zp, llvm::Value *index)
+{
+    assert(zp->getType() == i8_type_);
+    assert(index->getType() == i8_type_);
+
+    llvm::Value *wrapped_sum = builder_.CreateAdd(zp, index);
+    llvm::Value *effective = zext_i16(wrapped_sum);
+    return BoundedAddress(*this, effective, AddressRange(0, 0x100));
+}
+
+// Build a BoundedAddress for post-indexed indirect addressing: a 16-bit base
+// address is read from the bytes at 'zp' (low) and 'zp' + 1 (high, with the
+// 8-bit addition wrapping), and the zero-extended 'index' is added to it.
+//
+// The pointer bytes are read without read callback checks. No explicit
+// bounds are given for any of the BoundedAddress objects constructed here.
+FunctionBuilder::BoundedAddress FunctionBuilder::zp_post_index(
+    llvm::Value *zp, llvm::Value *index)
+{
+    assert(zp->getType() == i8_type_);
+    assert(index->getType() == i8_type_);
+
+    llvm::Value *pointer_lo = 
+        memory_read_untrapped(BoundedAddress(*this, zext_i16(zp)));
+    llvm::Value *pointer_hi_at = builder_.CreateAdd(zp, constant_u8(1));
+    llvm::Value *pointer_hi = 
+        memory_read_untrapped(BoundedAddress(*this, zext_i16(pointer_hi_at)));
+
+    llvm::Value *pointer = create_u16(pointer_lo, pointer_hi);
+    llvm::Value *effective = builder_.CreateAdd(pointer, zext_i16(index));
+    return BoundedAddress(*this, effective);
+}
+
+// Build a BoundedAddress for pre-indexed indirect addressing: 'index' is
+// added to 'zp' in 8 bits (so the sum wraps), and the 16-bit address is read
+// from the bytes at that location (low) and the following one (high, again
+// with 8-bit wrapping).
+//
+// The pointer bytes are read without read callback checks. No explicit
+// bounds are given for any of the BoundedAddress objects constructed here.
+FunctionBuilder::BoundedAddress FunctionBuilder::zp_pre_index(
+    llvm::Value *zp, llvm::Value *index)
+{
+    assert(zp->getType() == i8_type_);
+    assert(index->getType() == i8_type_);
+
+    llvm::Value *pointer_lo_at = builder_.CreateAdd(zp, index);
+    llvm::Value *pointer_hi_at = 
+        builder_.CreateAdd(pointer_lo_at, constant_u8(1));
+
+    llvm::Value *pointer_lo = 
+        memory_read_untrapped(BoundedAddress(*this, zext_i16(pointer_lo_at)));
+    llvm::Value *pointer_hi = 
+        memory_read_untrapped(BoundedAddress(*this, zext_i16(pointer_hi_at)));
+
+    llvm::Value *effective = create_u16(pointer_lo, pointer_hi);
+    return BoundedAddress(*this, effective);
+}
+
+// Generate code which pops a 16-bit value from the stack, adds one to it to
+// form the new PC, and compares that against each address in
+// predicted_rts_targets_[subroutine_addr].
+//
+// For each predicted target a run-time equality test is generated; if it
+// succeeds, control is transferred to that (constant) target via
+// control_transfer_to(). The new PC value is returned so the caller can
+// generate the code for the case where no prediction matched. On return the
+// builder's insertion point is in the last "prediction_incorrect" block (or
+// unchanged if there were no predicted targets).
+llvm::Value *FunctionBuilder::check_predicted_rts(uint16_t subroutine_addr)
+{
+    llvm::Value *mangled_pc = pop_u16();
+    llvm::Value *new_pc = builder_.CreateAdd(mangled_pc, constant_u16(1));
+
+    // It would be correct to just return new_pc at this point; our caller
+    // will use it to arrange a control transfer. Since that is a run-time
+    // determined value, the control transfer would have to be done by
+    // returning from the generated function. We may be able to make some
+    // plausible guesses (currently never guaranteed to be correct) which
+    // we can verify at run time and which if correct allow the RTS to be
+    // handled as a branch within the generated function. This should save
+    // a bit of overhead on not returning from the function and re-entering
+    // another and may also allow the optimiser some additional leeway.
+
+    const AddressSet &targets = predicted_rts_targets_[subroutine_addr];
+    TRACE("Generating predicted RTS code; " << targets.size() << " target(s)");
+    for (AddressSet::const_iterator it = targets.begin(); it != targets.end(); 
+         ++it)
+    {
+        const uint16_t target = *it;
+        llvm::BasicBlock *prediction_correct = 
+            llvm::BasicBlock::Create(context_, "prediction_correct", 
+                                     llvm_function_);
+        llvm::BasicBlock *prediction_incorrect = 
+            llvm::BasicBlock::Create(context_, "prediction_incorrect", 
+                                     llvm_function_);
+        builder_.CreateCondBr(
+            builder_.CreateICmpEQ(constant_u16(target), new_pc), 
+            prediction_correct, prediction_incorrect);
+        builder_.SetInsertPoint(prediction_correct);
+        // Pass the constant rather than new_pc so control_transfer_to() sees
+        // a compile-time target.
+        control_transfer_to(constant_u16(target), opcode_rts);
+        // Any further predictions (and the caller's fallback code) are
+        // generated on the mismatch path.
+        builder_.SetInsertPoint(prediction_incorrect);
+    }
+
+    return new_pc;
+}
+
+// Generate code to transfer control to the 16-bit address 'target' on behalf
+// of the instruction 'opcode'.
+//
+// Depending on 'opcode', this first generates any code needed to return to
+// our caller so that a call callback can be invoked, or (for JSR) so that
+// modification of JITted code by the pushed return address can be handled.
+// For opcode_jsr and opcode_jmp_abs 'target' must be an llvm::ConstantInt;
+// for the indirect JMP opcodes it must not be one.
+//
+// If 'target' is a constant address for which code has been generated, or
+// is pending, in this function, a direct branch to its block is generated;
+// otherwise control is returned to the caller via
+// return_control_transfer_direct().
+//
+// An opcode not listed in the switch below is a CANT_HAPPEN error.
+void FunctionBuilder::control_transfer_to(llvm::Value *target, uint8_t opcode)
+{
+    assert(target->getType() == i16_type_);
+
+    switch (opcode)
+    {
+        case opcode_rts:
+        case opcode_rti:
+        case opcode_bra:
+        case opcode_bcc:
+        case opcode_bcs:
+        case opcode_bvc:
+        case opcode_bvs:
+        case opcode_beq:
+        case opcode_bne:
+        case opcode_bmi:
+        case opcode_bpl:
+        case opcode_implicit:
+            // This control transfer never triggers a call callback.
+            break;
+
+        case opcode_jsr:
+        {
+            // This control transfer triggers a call callback if present. The
+            // target address is known at compile time.
+            llvm::ConstantInt *target_ci = 
+                llvm::cast<llvm::ConstantInt>(target);
+            uint16_t target16 = target_ci->getLimitedValue();
+            if (callbacks_.call[target16] != 0)
+            {
+                return_jsr_complex(target);
+                return;
+            }
+
+            // We also need to check if the two bytes pushed onto the stack by
+            // the JSR have invalidated any JITted code and return control to
+            // our caller if so.
+            //
+            // Note that we work with a tmp_s i8 local so that if the stack
+            // pointer wrapped during the JSR pushes we will still work
+            // correctly here.
+            llvm::Value *tmp_s = 
+                builder_.CreateAdd(register_load(s_), constant_u8(1));
+            llvm::Value *stack_addr1 = 
+                builder_.CreateAdd(constant_u16(stack), zext_i16(tmp_s));
+            tmp_s = builder_.CreateAdd(tmp_s, constant_u8(1));
+            llvm::Value *stack_addr2 = 
+                builder_.CreateAdd(constant_u16(stack), zext_i16(tmp_s));
+
+            // code_not_modified is created detached and only added to the
+            // function after the other blocks have been filled in.
+            llvm::BasicBlock *code_not_modified_block = 
+                llvm::BasicBlock::Create(context_, "code_not_modified");
+            llvm::BasicBlock *code_addr1_not_modified_block = 
+                llvm::BasicBlock::Create(context_, "code_addr1_not_modified", 
+                                         llvm_function_);
+            llvm::BasicBlock *code_modified_block = 
+                llvm::BasicBlock::Create(context_, "code_modified", 
+                                         llvm_function_);
+
+            const AddressRange stack_range(stack, stack + 0x100);
+            llvm::Value *stack_addr1_is_code = 
+                is_code_at(BoundedAddress(*this, stack_addr1, stack_range));
+            builder_.CreateCondBr(stack_addr1_is_code, code_modified_block, 
+                                  code_addr1_not_modified_block);
+
+            builder_.SetInsertPoint(code_addr1_not_modified_block);
+            llvm::Value *stack_addr2_is_code = 
+                is_code_at(BoundedAddress(*this, stack_addr2, stack_range));
+            builder_.CreateCondBr(stack_addr2_is_code, code_modified_block, 
+                                  code_not_modified_block);
+
+            builder_.SetInsertPoint(code_modified_block);
+            return_jsr_complex(target);
+
+            llvm_function_->getBasicBlockList().push_back(
+                code_not_modified_block);
+            builder_.SetInsertPoint(code_not_modified_block);
+            break;
+        }
+
+        case opcode_jmp_abs:
+        {
+            // This control transfer triggers a call callback if present. The
+            // target address is known at compile time.
+            llvm::ConstantInt *target_ci = 
+                llvm::cast<llvm::ConstantInt>(target);
+            uint16_t target16 = target_ci->getLimitedValue();
+            if (callbacks_.call[target16] != 0)
+            {
+                return_control_transfer_indirect(target, opcode);
+                return;
+            }
+            break;
+        }
+
+        case opcode_jmp_ind_abs:
+        case opcode_jmp_indx_abs:
+        {
+            // This control transfer triggers a call callback if present. The
+            // target address is only known at run time, so the callback
+            // table has to be consulted by the generated code.
+            assert(!llvm::isa<llvm::ConstantInt>(target));
+            llvm::Value *call_callback_addr = builder_.CreateGEP(
+                call_callbacks_, 
+                llvm::ArrayRef<llvm::Value *>(zext_i32(target)));
+            llvm::Value *call_callback = 
+                builder_.CreateLoad(call_callback_addr);
+            llvm::BasicBlock *call_callback_block = 
+                llvm::BasicBlock::Create(context_, "call_callback", 
+                                         llvm_function_);
+            llvm::BasicBlock *no_call_callback_block = 
+                llvm::BasicBlock::Create(context_, "no_call_callback", 
+                                         llvm_function_);
+            llvm::Value *call_callback_not_null = 
+                builder_.CreateIsNotNull(call_callback);
+            builder_.CreateCondBr(call_callback_not_null, call_callback_block, 
+                                  no_call_callback_block);
+
+            builder_.SetInsertPoint(call_callback_block);
+            return_control_transfer_indirect(target, opcode);
+
+            builder_.SetInsertPoint(no_call_callback_block);
+            break;
+        }
+
+        default:
+            // opcode is a uint8_t, which a stream would insert as a
+            // character; convert it to int so the hex value is shown.
+            CANT_HAPPEN("Unexpected opcode 0x" << std::hex << 
+                        static_cast<int>(opcode));
+    }
+
+    llvm::ConstantInt *target_ci = llvm::dyn_cast<llvm::ConstantInt>(target);
+    if ((target_ci != 0) && (
+            code_generated_for_address_[target_ci->getLimitedValue()] ||
+            (pending_.find(target_ci->getLimitedValue()) != pending_.end())))
+    {
+        ensure_address_block_created(target_ci->getLimitedValue());
+        // The target is within this function, so we can just branch there.
+        builder_.CreateBr(address_block_[target_ci->getLimitedValue()]);
+    }
+    else
+    {
+        // The target isn't (knowably) within this function, so we have to
+        // get there via our caller.
+        return_control_transfer_direct(target);
+    }
+}
+
+// All memory reads should be done via a call to this function, unless they are
+// explicitly exempt from read callbacks.
+//
+// Generates code to read the byte at 'ba' and returns the value read.
+//
+// If the address is a compile-time constant, the read callback table is
+// consulted now: either a direct call to the callback or a plain memory read
+// is generated. Otherwise, if a read callback may exist within the bounds of
+// 'ba', code is generated to look up the callback at run time and either
+// call it or read memory; if no callback can exist within the bounds a plain
+// memory read is generated.
+llvm::Value *FunctionBuilder::memory_read(const BoundedAddress &ba)
+{
+    llvm::Value *addr = ba.addr();
+
+    llvm::ConstantInt *addr_ci = llvm::dyn_cast<llvm::ConstantInt>(addr);
+    if (addr_ci != 0)
+    {
+        uint16_t addr16 = addr_ci->getLimitedValue();
+        TRACE("Load at compile-time constant address 0x" << std::hex << 
+              std::setfill('0') << std::setw(4) << addr16);
+        if (callbacks_.read[addr16] != 0)
+        {
+            TRACE("Read callback exists at constant address");
+            llvm::Value *callback = 
+                constant_ptr(callbacks_.read[addr16], "read_callback");
+            return call_read_callback(callback, addr);
+        }
+    
+        // Actually do the read from memory.
+        return memory_read_untrapped(ba);
+    }
+    else
+    {
+        if (callback_in_bounds(callbacks_.read, ba.bounds()))
+        {
+            TRACE("Read callback may exist; runtime check required");
+            // Load the callback pointer for this address from the table.
+            llvm::Value *read_callback_addr = builder_.CreateGEP(
+                read_callbacks_, llvm::ArrayRef<llvm::Value *>(zext_i32(addr)));
+            llvm::Value *read_callback = 
+                builder_.CreateLoad(read_callback_addr);
+            llvm::BasicBlock *read_callback_block = 
+                llvm::BasicBlock::Create(context_, "read_callback", 
+                                         llvm_function_);
+            llvm::BasicBlock *no_read_callback_block = 
+                llvm::BasicBlock::Create(context_, "no_read_callback", 
+                                         llvm_function_);
+            // Created detached; added to the function once both paths have
+            // been generated.
+            llvm::BasicBlock *memory_read_done_block = 
+                llvm::BasicBlock::Create(context_, "memory_read_done");
+            llvm::Value *read_callback_not_null = 
+                builder_.CreateIsNotNull(read_callback);
+            builder_.CreateCondBr(read_callback_not_null, read_callback_block, 
+                                  no_read_callback_block);
+
+            // Both paths leave their result in read_callback_result_, which
+            // is reloaded after they rejoin.
+            builder_.SetInsertPoint(read_callback_block);
+            llvm::Value *result = call_read_callback(read_callback, ba.addr());
+            builder_.CreateStore(result, read_callback_result_);
+            builder_.CreateBr(memory_read_done_block);
+
+            builder_.SetInsertPoint(no_read_callback_block);
+            builder_.CreateStore(memory_read_untrapped(ba), 
+                                 read_callback_result_);
+            builder_.CreateBr(memory_read_done_block);
+            
+            llvm_function_->getBasicBlockList().push_back(
+                memory_read_done_block);
+            builder_.SetInsertPoint(memory_read_done_block);
+            return builder_.CreateLoad(read_callback_result_);
+        }
+        else
+        {
+            TRACE("No read callback within address bounds");
+            // Actually do the read from memory.
+            return memory_read_untrapped(ba);
+        }
+    }
+}
+
+// Generate code which reads the byte at 'ba' directly from memory_base_,
+// with no check for read callbacks. Returns the value loaded.
+llvm::Value *FunctionBuilder::memory_read_untrapped(const BoundedAddress &ba)
+{
+    llvm::Value *offset = zext_i32(ba.addr());
+    llvm::Value *host_ptr = builder_.CreateGEP(
+        memory_base_, llvm::ArrayRef<llvm::Value *>(offset));
+    return builder_.CreateLoad(host_ptr);
+}
+
+// All memory writes should be done via a call to this function, unless they
+// are explicitly exempt from triggering write callbacks.
+//
+// Generates code to write 'data' to the address 'ba'. 'next_opcode_at' is
+// passed on to the code which returns control to the caller.
+//
+// Note that because this may return to the caller to indicate
+// result_write_to_code or result_write_callback, it must be the last
+// code-generation function called when translating an opcode, as any
+// subsequent code may not be executed.
+void FunctionBuilder::memory_write(const BoundedAddress &ba,
+                                 llvm::Value *data, uint16_t next_opcode_at)
+{
+    llvm::ConstantInt *addr_ci = llvm::dyn_cast<llvm::ConstantInt>(ba.addr());
+    if (addr_ci != 0)
+    {
+        // Constant address: the callback table can be consulted now.
+        uint16_t addr16 = addr_ci->getLimitedValue();
+        TRACE("Store at compile-time constant address 0x" << std::hex << 
+              std::setfill('0') << std::setw(4) << addr16);
+        if (callbacks_.write[addr16] != 0)
+        {
+            TRACE("Write callback exists at constant address");
+            return_write_callback(next_opcode_at, ba.addr(), data);
+            return;
+        }
+    }
+    else
+    {
+        if (callback_in_bounds(callbacks_.write, ba.bounds()))
+        {
+            TRACE("Write callback may exist; runtime check required");
+            // Load the callback pointer for this address from the table.
+            llvm::Value *write_callback_addr = builder_.CreateGEP(
+                write_callbacks_, 
+                llvm::ArrayRef<llvm::Value *>(zext_i32(ba.addr())));
+            llvm::Value *write_callback = 
+                builder_.CreateLoad(write_callback_addr);
+            llvm::BasicBlock *write_callback_block = 
+                llvm::BasicBlock::Create(context_, "write_callback", 
+                                         llvm_function_);
+            llvm::BasicBlock *no_write_callback_block = 
+                llvm::BasicBlock::Create(context_, "no_write_callback", 
+                                         llvm_function_);
+            llvm::Value *write_callback_not_null = 
+                builder_.CreateIsNotNull(write_callback);
+            builder_.CreateCondBr(write_callback_not_null, write_callback_block, 
+                                  no_write_callback_block);
+
+            builder_.SetInsertPoint(write_callback_block);
+            return_write_callback(next_opcode_at, ba.addr(), data);
+
+            // The ordinary write below is generated on the no-callback path.
+            builder_.SetInsertPoint(no_write_callback_block);
+        }
+        else
+        {
+            TRACE("No write callback within address bounds");
+        }
+    }
+
+    memory_write_untrapped(ba, data, next_opcode_at);
+}
+
+// Generate a call to 'callback' with the arguments (mpu_llvm_, addr, data)
+// and return the call's result value.
+//
+// Note that (like lib6502 proper) we don't externalise our registers before
+// invoking the (read/write) callback or internalise them afterwards, so
+// the callback doesn't see correct information if it examines the CPU state.
+llvm::Value *FunctionBuilder::call_callback(
+    llvm::Value *callback, llvm::Value *addr, 
+    llvm::Value *data)
+{
+    llvm::Value *call_result = builder_.CreateCall3(
+        callback, mpu_llvm_, addr, data, "callback_result");
+    return call_result;
+}
+
+// Generate a call to the read callback 'callback' for address 'addr' and
+// return its result truncated to 8 bits. A constant zero is passed as the
+// callback's data argument.
+llvm::Value *FunctionBuilder::call_read_callback(
+    llvm::Value *callback, llvm::Value *addr)
+{
+    llvm::Value *unused_data = constant_u8(0);
+    llvm::Value *wide_result = call_callback(callback, addr, unused_data);
+    return builder_.CreateTrunc(wide_result, i8_type_);
+}
+
+// Generate code which stores 'data' at 'ba' directly into memory_base_.
+//
+// No check is made for write callbacks or for modification of already
+// JITted code.
+void FunctionBuilder::memory_write_raw(const BoundedAddress &ba,
+                                     llvm::Value *data)
+{
+    llvm::Value *offset = zext_i32(ba.addr());
+    llvm::Value *host_ptr = builder_.CreateGEP(
+        memory_base_, llvm::ArrayRef<llvm::Value *>(offset));
+    builder_.CreateStore(data, host_ptr);
+}
+
+// Return a boolean llvm::Value indicating whether there is code at the
+// address 'ba'.
+//
+// If the bounds of 'ba' are not all of memory and code_at_address_ is not
+// set for any address within them, the bounds are recorded in
+// optimistic_writes_ and a constant false is returned without generating
+// any code. Otherwise code is generated to load the flag for the run-time
+// address from code_at_address_llvm_ and test it.
+llvm::Value *FunctionBuilder::is_code_at(const BoundedAddress &ba)
+{
+    const AddressRange &bounds = ba.bounds();
+    bool use_optimistic_write = !bounds.all_memory();
+    // The loop stops as soon as known code is found within the bounds.
+    for (AddressRange::const_iterator it = bounds.begin(); 
+         use_optimistic_write && (it != bounds.end()); ++it)
+    {
+        uint16_t i = *it;
+        if (code_at_address_[i])
+        {
+            TRACE("BoundedAddress " << ba << 
+                  " includes known code at 0x" << std::hex << 
+                  std::setfill('0') << std::setw(4) << i << 
+                  "; can't use optimistic write");
+            use_optimistic_write = false;
+        }
+    }
+    
+    if (use_optimistic_write)
+    {
+        optimistic_writes_.insert(ba.bounds());
+        return constant_i1(false);
+    }
+    else
+    {
+        llvm::Value *code_at_address_flag_addr = builder_.CreateGEP(
+            code_at_address_llvm_, 
+            llvm::ArrayRef<llvm::Value *>(zext_i32(ba.addr())));
+        return jit_bool_is_true(builder_.CreateLoad(code_at_address_flag_addr));
+    }
+}
+
+// Write to memory, checking for modification of already JITted code but
+// not for write callbacks.
+//
+// The write itself is always generated first. If is_code_at() cannot rule
+// out code at 'ba' at compile time, a run-time test is generated which
+// returns to the caller via return_write_to_code() when code was modified;
+// the builder's insertion point is left on the "code_not_modified" path.
+//
+// Note that because this may return to the caller to indicate
+// result_write_to_code, it must be the last code-generation function called
+// when translating an opcode, as any subsequent code may not be executed.
+void FunctionBuilder::memory_write_untrapped(
+    const BoundedAddress &ba, llvm::Value *data, 
+    uint16_t next_opcode_at)
+{
+    // Actually do the write.
+    memory_write_raw(ba, data);
+
+    // Check for writes which modify JITted code.
+    llvm::Value *just_modified_code = is_code_at(ba);
+
+    // The optimiser would eliminate the dead branches if just_modified_code
+    // is a constant false value, but to make the IR easier to read and perhaps
+    // help the optimiser out, let's not generate pointless code in this case.
+    llvm::ConstantInt *just_modified_ci = 
+        llvm::dyn_cast<llvm::ConstantInt>(just_modified_code);
+    if ((just_modified_ci != 0) && !(just_modified_ci->getLimitedValue()))
+    {
+        return;
+    }
+
+    llvm::BasicBlock *code_modified_block = 
+        llvm::BasicBlock::Create(context_, "code_modified", llvm_function_);
+    llvm::BasicBlock *code_not_modified_block = 
+        llvm::BasicBlock::Create(context_, "code_not_modified", llvm_function_);
+    builder_.CreateCondBr(just_modified_code, code_modified_block, 
+                          code_not_modified_block);
+
+    builder_.SetInsertPoint(code_modified_block);
+    return_write_to_code(next_opcode_at, ba.addr());
+
+    // Code generated after this function returns goes on the unmodified path.
+    builder_.SetInsertPoint(code_not_modified_block);
+}
+
+void FunctionBuilder::return_pc(Result result, llvm::Value *new_pc)
+{   // Emit code which records 'result' and 'new_pc', then exits.
+    builder_.CreateStore(constant_i(result), function_result_);
+    builder_.CreateStore(new_pc, pc_);
+    builder_.CreateBr(epilogue_); // leave via the epilogue block
+}
+
+void FunctionBuilder::return_pc_addr(Result result, llvm::Value *new_pc, 
+                                     llvm::Value *addr)
+{   // As return_pc(), plus 'addr' in registers_ field 11 (presumably addr).
+    builder_.CreateStore(constant_i(result), function_result_);
+    builder_.CreateStore(new_pc, pc_);
+    builder_.CreateStore(addr, builder_.CreateStructGEP(registers_, 11));
+    builder_.CreateBr(epilogue_);
+}
+
+void FunctionBuilder::return_pc_data(Result result, llvm::Value *new_pc, 
+                                     llvm::Value *data)
+{   // As return_pc(), plus 'data' in registers_ field 12 (presumably data).
+    builder_.CreateStore(constant_i(result), function_result_);
+    builder_.CreateStore(new_pc, pc_);
+    builder_.CreateStore(data, builder_.CreateStructGEP(registers_, 12));
+    builder_.CreateBr(epilogue_);
+}
+
+void FunctionBuilder::return_pc_addr_data(
+    Result result, llvm::Value *new_pc, llvm::Value *addr, llvm::Value *data)
+{   // As return_pc(), plus 'addr' and 'data' in registers_ fields 11 and 12.
+    builder_.CreateStore(constant_i(result), function_result_);
+    builder_.CreateStore(new_pc, pc_);
+    builder_.CreateStore(addr, builder_.CreateStructGEP(registers_, 11));
+    builder_.CreateStore(data, builder_.CreateStructGEP(registers_, 12));
+    builder_.CreateBr(epilogue_);
+}
+
+void FunctionBuilder::return_control_transfer_direct(llvm::Value *new_pc)
+{   // Exit with result_control_transfer_direct; only the pc is passed back.
+    return_pc(result_control_transfer_direct, new_pc);
+}
+
+void FunctionBuilder::return_control_transfer_indirect(
+    llvm::Value *new_pc, uint8_t opcode)
+{   // 'opcode' is passed back as the data value, as a compile-time constant.
+    return_pc_data(result_control_transfer_indirect, new_pc, 
+                   constant_u8(opcode));
+}
+
+void FunctionBuilder::return_brk(llvm::Value *new_pc)
+{   // Exit with result_brk; only the pc is passed back.
+    return_pc(result_brk, new_pc);
+}
+
+void FunctionBuilder::return_jsr_complex(llvm::Value *new_pc)
+{   // Exit with result_jsr_complex; only the pc is passed back.
+    return_pc(result_jsr_complex, new_pc);
+}
+
+void FunctionBuilder::return_illegal_instruction(
+    uint16_t new_pc, uint16_t opcode_at, uint8_t opcode)
+{   // All three values are compile-time constants: addr is opcode_at.
+    return_pc_addr_data(result_illegal_instruction, constant_u16(new_pc), 
+                        constant_u16(opcode_at), constant_u8(opcode));
+}
+
+void FunctionBuilder::return_write_to_code(uint16_t new_pc, llvm::Value *addr)
+{   // Exit with result_write_to_code; 'addr' is the runtime address written.
+    return_pc_addr(result_write_to_code, constant_u16(new_pc), addr);
+}
+
+void FunctionBuilder::return_write_callback(
+    uint16_t new_pc, llvm::Value *addr, llvm::Value *data)
+{   // Exit with result_write_callback, passing back both 'addr' and 'data'.
+    return_pc_addr_data(
+        result_write_callback, constant_u16(new_pc), addr, data);
+}
+
+void FunctionBuilder::return_invalid_bounds()
+{   // Unlike the return_pc*() helpers, this does not store a new pc.
+    builder_.CreateStore(constant_i(result_invalid_bounds), function_result_);
+    builder_.CreateBr(epilogue_);
+}
+
+void FunctionBuilder::disassemble1(uint16_t &addr, const std::string &s)
+{   // One-byte instruction: hex dump, then the text 's', then step past it.
+    disassemble_hex_dump(addr, 1);
+    disassembly_ << s << "\n";
+    ++addr; // 'addr' is a reference, so the caller's address advances too
+}
+
+void FunctionBuilder::disassemble2(
+    uint16_t &addr, const std::string &prefix, uint8_t &operand,
+    const std::string &suffix)
+{
+    disassemble_hex_dump(addr, 2);
+    operand = operand8(addr);
+    const int operand_value = operand;
+    disassembly_ << prefix << hex_prefix << std::setw(2) << operand_value;
+    disassembly_ << suffix;
+    // If the prefix ends in '#' and the operand is a printable character,
+    // show that character alongside the hex value.
+    const std::string::size_type n = prefix.length();
+    const bool hash_prefix = (n > 1) && (prefix[n - 1] == '#');
+    if (hash_prefix && isprint(operand))
+    {
+        disassembly_ << " ('" << static_cast<char>(operand) << "')";
+    }
+    disassembly_ << "\n";
+    addr += 2;
+}
+
+void FunctionBuilder::disassemble3(
+    uint16_t &addr, const std::string &prefix, uint16_t &operand,
+    const std::string &suffix)
+{   // Three-byte instruction; the 16-bit operand is returned in 'operand'.
+    disassemble_hex_dump(addr, 3);
+    operand = operand16(addr);
+    disassembly_ << prefix << hex_prefix << std::setw(4) << operand;
+    disassembly_ << suffix << "\n";
+    addr += 3;
+}
+
+void FunctionBuilder::disassemble_branch(
+    uint16_t &addr, const std::string &s, uint16_t &target)
+{
+    disassemble_hex_dump(addr, 2);
+    // Interpret the operand byte as a signed 8-bit displacement.
+    int displacement = operand8(addr);
+    if (displacement >= 0x80) { displacement -= 0x100; }
+    // Branches are relative to the address of the *following* instruction.
+    addr += 2;
+    target = addr + displacement;
+    disassembly_ << s << hex_prefix << std::setw(4) << target << "\n";
+}
+
+void FunctionBuilder::disassemble_hex_dump(uint16_t addr, int bytes)
+{   // Writes 'addr' and up to three bytes from it as fixed-width hex text.
+    assert(bytes <= 3);
+    disassembly_ << std::hex << std::setw(4) << std::setfill('0') << addr << 
+                    " ";
+    for (int i = 0; i < 3; ++i)
+    {
+        if (i >= bytes)
+        {
+            disassembly_ << "   ";
+            continue;
+        }
+        // addr + i is an int and can exceed 0xffff; wrap it to 16 bits so
+        // the read stays inside the 64K image ct_memory_ is assumed to be.
+        const int value = ct_memory_[static_cast<uint16_t>(addr + i)];
+        disassembly_ << std::setw(2) << value << " ";
+    }
+}
diff --git a/FunctionBuilder.h b/FunctionBuilder.h
new file mode 100644
index 0000000..da2df8d
--- /dev/null
+++ b/FunctionBuilder.h
@@ -0,0 +1,364 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef FUNCTIONBUILDER_H
+#define FUNCTIONBUILDER_H
+
+#include <boost/shared_ptr.hpp>
+#include <boost/utility.hpp>
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypeBuilder.h"
+#include "llvm/IR/Value.h"
+#include <map>
+#include <set>
+#include <sstream>
+
+#include "AddressSet.h"
+#include "const.h"
+#include "JitBool.h"
+#include "lib6502.h"
+
+class Function;
+struct LLVMStuff;
+
+class FunctionBuilder : boost::noncopyable
+{
+public:
+    // Create a FunctionBuilder object which can be used to build a Function
+    // representing the code starting at 'address'. The Function object built
+    // will operate on the given M6502 object. The 'code_at_address' array
+    // will be used at compile time and at runtime to decide if writes to
+    // memory may invalidate already JITted code. The memory inside the M6502
+    // object will be used when the Function object executes, but ct_memory
+    // will be used at compile time to determine the instructions to compile;
+    // see FunctionManager for more on this.
+    FunctionBuilder(M6502 *mpu, const uint8_t *ct_memory, 
+                    JitBool *code_at_address, uint16_t address);
+
+    boost::shared_ptr<Function> build();
+
+    // Status codes returned by the JITted function
+    enum Result
+    {
+        // Control has transferred to the address in registers.pc. No call
+        // callback should be invoked, either because the JITted function knows
+        // there is no applicable call callback or because the control transfer
+        // is via an instruction which does not trigger call callbacks.
+        result_control_transfer_direct,
+
+        // Control has transferred to the address in registers.pc via an
+        // instruction which is eligible for call callbacks. registers.data
+        // contains the opcode of the instruction which transferred
+        // control. The caller should check for an applicable call
+        // callback. registers.addr is *not* updated; the addr value for
+        // the callback is registers.pc.
+        result_control_transfer_indirect,
+
+        // A BRK instruction has just been executed and registers.pc updated
+        // to point to the BRK vector. The caller should check to see if the
+        // stack pushes implicitly performed by BRK have invalidated any
+        // already-JITted code and for a call callback on the BRK vector.
+        // Neither registers.addr nor registers.data are updated.
+        result_brk,
+
+        // A JSR instruction has just been executed and registers.pc updated
+        // to point to the destination address. One or both of the following
+        // may be true:
+        // - the stack pushes implicitly performed have invalidated some
+        //   already-JITted code
+        // - a call callback is registered on the destination address
+        // It is not guaranteed that either of these is the case, although in
+        // practice with this implementation at least one should be true. Not
+        // all JSR instructions cause the JITted function to return this value,
+        // hence the result code is result_jsr_*complex* not just result_jsr.
+        // Neither registers.addr nor registers.data are updated.
+        result_jsr_complex,
+
+        // An illegal instruction has been executed and registers.pc updated to
+        // point to the following opcode. registers.addr contains the address
+        // of the illegal instruction and registers.data its opcode. The
+        // caller should check to see if a callback is registered.
+        result_illegal_instruction,
+
+        // A memory write has been executed which changed an address marked
+        // as holding code. registers.addr contains the address modified. The
+        // caller should invalidate any JITted functions for this address.
+        result_write_to_code,
+
+        // A memory write has occurred which triggers a write callback. Memory
+        // has not been updated. registers.addr and registers.data contain the
+        // address and the data being written respectively. The caller should
+        // invoke the write callback and check for writes to already-JITted
+        // code.
+        result_write_callback,
+
+        // Internal bounds generated for an instruction's address range were
+        // found to be invalid by self-checking code. This can only occur
+        // in debug builds and then only if there is a bug in FunctionBuilder.
+        result_invalid_bounds
+    };
+
+private:
+    uint16_t build_at(uint16_t ct_pc);
+
+    uint8_t operand8(uint16_t opcode_at);
+    uint16_t operand16(uint16_t opcode_at);
+
+    llvm::Value *constant_i1(bool c);
+    llvm::Value *constant_u8(uint8_t c);
+    llvm::Value *constant_u16(uint16_t c);
+    llvm::Value *constant_u32(uint32_t c);
+    llvm::Value *constant_u64(uint64_t c);
+
+    template <class T>
+    llvm::Value *constant_ptr(T *p, const std::string &name)
+    {
+        llvm::Value *v = constant_u64(reinterpret_cast<uintptr_t>(p));
+        // The name passed in never seems to be used, but maybe this will
+        // change in the future. It doesn't really do us any harm to pass
+        // it in anyway.
+        return builder_.CreateIntToPtr(
+            v, llvm::TypeBuilder<T *, false>::get(llvm::getGlobalContext()), 
+            name);
+    }
+
+    llvm::Value *constant_i(int c);
+
+    llvm::Value *constant_jb(JitBool c);
+    llvm::Value *convert_i1_to_jb(llvm::Value *v);
+    llvm::Value *convert_i8_to_jb(llvm::Value *v);
+    llvm::Value *convert_i16_to_jb(llvm::Value *v);
+    llvm::Value *jit_bool_is_true(llvm::Value *v);
+    llvm::Value *jit_bool_is_false(llvm::Value *v);
+
+    llvm::Value *convert_i1_to_i8(llvm::Value *v);
+
+    llvm::Value *zext_i16(llvm::Value *v);
+    llvm::Value *zext_i32(llvm::Value *v);
+    llvm::Value *sext_i16(llvm::Value *v);
+    llvm::Value *trunc_i8(llvm::Value *v);
+    llvm::Value *create_u16(llvm::Value *low_byte, llvm::Value *high_byte);
+
+    struct Register
+    {
+        llvm::Value *v_;
+        bool modified_;
+    };
+    void initialise_i8_reg(Register &r, int structure_index, 
+                           const std::string &name);
+    void initialise_jb_reg(Register &r, int structure_index, 
+                           const std::string &name);
+
+    void ensure_address_block_created(uint16_t addr);
+
+    void return_pc(Result result, llvm::Value *new_pc);
+    void return_pc_addr(Result result, llvm::Value *new_pc, llvm::Value *addr);
+    void return_pc_data(Result result, llvm::Value *new_pc, llvm::Value *data);
+    void return_pc_addr_data(Result result, llvm::Value *new_pc, 
+                             llvm::Value *addr, llvm::Value *data);
+    void return_control_transfer_direct(llvm::Value *new_pc);
+    void return_control_transfer_indirect(llvm::Value *new_pc, uint8_t opcode);
+    void return_brk(llvm::Value *new_pc);
+    void return_jsr_complex(llvm::Value *new_pc);
+    void return_illegal_instruction(uint16_t new_pc, uint16_t opcode_at, 
+                                    uint8_t opcode);
+    void return_write_to_code(uint16_t new_pc, llvm::Value *addr);
+    void return_write_callback(uint16_t new_pc, llvm::Value *addr, 
+                               llvm::Value *data);
+    void return_invalid_bounds();
+
+    class BoundedAddress;
+
+    llvm::Value *register_load(const Register &r);
+    void register_store(llvm::Value *v, Register &r);
+
+    typedef llvm::Value *(FunctionBuilder::*OpFn)(llvm::Value *data);
+    void register_op(OpFn op, Register &r);
+    void memory_op(OpFn op, const BoundedAddress &ba, uint16_t next_opcode_at);
+
+    llvm::Value *is_code_at(const BoundedAddress &addr);
+
+    void adc(llvm::Value *data);
+    void adc_llvm(llvm::Value *data);
+    void adc_binary(llvm::Value *data);
+    void adc_decimal(llvm::Value *data);
+    void adc_binary_llvm(llvm::Value *data);
+    void adc_decimal_llvm(llvm::Value *data);
+    void And(llvm::Value *data);
+    llvm::Value *asl(llvm::Value *data);
+    void bit(llvm::Value *data);
+    void branch(Register &flag, bool branch_if, uint16_t target);
+    void cmp(llvm::Value *r, llvm::Value *data);
+    void cmp_llvm(llvm::Value *r, llvm::Value *data);
+    llvm::Value *dec(llvm::Value *data);
+    void eor(llvm::Value *data);
+    llvm::Value *inc(llvm::Value *data);
+    void ld(Register &r, llvm::Value *data);
+    llvm::Value *lsr(llvm::Value *data);
+    void ora(llvm::Value *data);
+    void pop_flags();
+    llvm::Value *pop_u8();
+    llvm::Value *pop_u16();
+    void push_u8_raw(llvm::Value *data);
+    void push_u16_raw(uint16_t u);
+    void push_u8(llvm::Value *data, uint16_t next_opcode_at);
+    llvm::Value *rol(llvm::Value *data);
+    llvm::Value *ror(llvm::Value *data);
+    void sbc(llvm::Value *data);
+    void sbc_binary(llvm::Value *data);
+    void sbc_decimal(llvm::Value *data);
+    void sbc_overflow(llvm::Value *data, 
+                      llvm::Value *borrow);
+    void transfer(const Register &from, Register &to);
+    llvm::Value *trb(llvm::Value *data);
+    llvm::Value *tsb(llvm::Value *data);
+
+    void set_nz(llvm::Value *data);
+    void set_z(llvm::Value *data);
+
+    llvm::Value *flag_byte();
+    void flag_byte_bit(const Register &flag_reg, uint8_t flag_bit);
+
+    void illegal_instruction(uint16_t &ct_pc, int bytes);
+
+    BoundedAddress zp(uint8_t addr);
+    BoundedAddress abs(uint16_t addr);
+    BoundedAddress abs_index(llvm::Value *abs, 
+                           llvm::Value *index);
+    BoundedAddress zp_index(llvm::Value *zp, 
+                             llvm::Value *r);
+    BoundedAddress zp_post_index(
+        llvm::Value *zp, llvm::Value *index);
+    BoundedAddress zp_pre_index(
+        llvm::Value *zp, llvm::Value *index);
+
+    llvm::Value *check_predicted_rts(uint16_t subroutine_addr);
+
+    // A special opcode passed as the 'opcode' argument of control_transfer_to
+    // when there is no explicit opcode causing the control transfer; this
+    // is just a documented way to signal that the control transfer is direct
+    // and cannot trigger a call callback.
+    enum {
+        opcode_implicit = 0xff
+    };
+    void control_transfer_to(llvm::Value *target, uint8_t opcode);
+
+    llvm::Value *memory_read(const BoundedAddress &ba);
+    llvm::Value *memory_read_untrapped(const BoundedAddress &ba);
+
+    void memory_write(const BoundedAddress &ba,
+                           llvm::Value *data, uint16_t next_opcode_at);
+    void memory_write_untrapped(const BoundedAddress &ba,
+                                llvm::Value *data, uint16_t next_opcode_at);
+    void memory_write_raw(const BoundedAddress &ba,
+                               llvm::Value *data);
+
+    llvm::Value *call_callback(
+        llvm::Value *callback, llvm::Value *addr, 
+        llvm::Value *data);
+    llvm::Value *call_read_callback(
+        llvm::Value *callback, llvm::Value *addr);
+
+    void disassemble1(uint16_t &addr, const std::string &s);
+    void disassemble2(uint16_t &addr, const std::string &prefix, 
+                      uint8_t &operand, const std::string &suffix = "");
+    void disassemble3(uint16_t &addr, const std::string &prefix, 
+                      uint16_t &operand, const std::string &suffix = "");
+    void disassemble_branch(uint16_t &addr, const std::string &s, 
+                            uint16_t &target);
+    void disassemble_hex_dump(uint16_t addr, int bytes);
+
+    bool built_;
+
+    M6502 *const mpu_;
+    JitBool *code_at_address_;
+    const uint16_t address_;
+    const uint8_t *const ct_memory_;
+    // callbacks_ is strictly redundant as it's available inside mpu, but
+    // it's convenient.
+    const M6502_Callbacks &callbacks_;
+
+    AddressSet code_range_;
+    AddressSet optimistic_writes_;
+
+    std::stringstream disassembly_;
+
+    int instructions_;
+    const int max_instructions_;
+
+    // This could be an AddressSet but since we "rely" on the order of
+    // iteration for pending_ it seems better to be explicit; we don't need
+    // any of the range-handling convenience of AddressSet here anyway.
+    std::set<uint16_t> pending_;
+
+    std::map<uint16_t, AddressSet> predicted_rts_targets_;
+
+    llvm::LLVMContext &context_;
+
+    llvm::Type *const native_int_type_;
+    llvm::PointerType *const callback_type_;
+    llvm::Type *const i1_type_;
+    llvm::Type *const i8_type_;
+    llvm::Type *const i16_type_;
+    llvm::Type *const i32_type_;
+    llvm::Type *const i64_type_;
+    llvm::Type *const jit_bool_type_;
+
+    llvm::IRBuilder<> &builder_;
+
+    llvm::Function *llvm_function_;
+
+    llvm::Value *registers_;
+    llvm::Value *code_at_address_llvm_;
+    llvm::Value *read_callbacks_;
+    llvm::Value *write_callbacks_;
+    llvm::Value *call_callbacks_;
+    llvm::Value *memory_base_;
+    llvm::Value *mpu_llvm_;
+
+    llvm::Value *function_result_;
+
+    // Note that address_block_ and code_generated_for_address_ aren't
+    // redundant; address_block_ elements are created (for example) when
+    // a branch means the corresponding address must have a BasicBlock
+    // created for use as a branch target, but that doesn't mean code has
+    // been generated for it yet.
+    llvm::BasicBlock *address_block_[memory_size];
+    bool code_generated_for_address_[memory_size];
+
+    Register a_;
+    Register x_;
+    Register y_;
+    Register s_;
+    Register flag_n_;
+    Register flag_v_;
+    Register flag_d_;
+    Register flag_i_;
+    Register flag_z_;
+    Register flag_c_;
+    llvm::Value *pc_;
+
+    llvm::Value *read_callback_result_;
+    llvm::Value *p_tmp_;
+    llvm::Value *l_tmp_;
+    llvm::Value *s_tmp_;
+    llvm::Value *t_tmp_;
+
+    llvm::BasicBlock *epilogue_;
+};
+
+#endif
diff --git a/FunctionManager.cpp b/FunctionManager.cpp
new file mode 100644
index 0000000..51f60b7
--- /dev/null
+++ b/FunctionManager.cpp
@@ -0,0 +1,310 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "FunctionManager.h"
+
+#include <functional>
+
+#include "Function.h"
+#include "FunctionBuilder.h"
+#include "M6502Internal.h"
+#include "Registers.h"
+#include "util.h"
+
+FunctionManager::FunctionManager(M6502 *mpu)
+: jit_thread_idle_(true), work_available_(false), quit_(false), mpu_(mpu), 
+  memory_snapshot_(), function_for_address_(), code_at_address_()
+{   // The () initialisers value-initialise those members; nothing else to do.
+}
+
+FunctionManager::~FunctionManager()
+{   // Asks the JIT thread to quit and joins it, if one was ever created.
+    if (jit_thread_.get_id() != boost::thread::id())
+    {
+        TRACE("Notifying JIT thread to quit");
+        {   // quit_ is set under the mutex the JIT thread waits with.
+            boost::mutex::scoped_lock lock(jit_thread_cv_mutex_);
+            quit_ = true;
+        }
+        jit_thread_cv_.notify_all();
+        TRACE("Joining with JIT thread");
+        jit_thread_.join();
+    }
+}
+
+bool FunctionManager::jit_thread_idle()
+{   // Reads the idle flag while holding the mutex which guards it.
+    boost::mutex::scoped_lock lock(jit_thread_idle_mutex_);
+    return jit_thread_idle_;
+}
+
+void FunctionManager::update_memory_snapshot()
+{
+    assert(jit_thread_idle());
+    // Resynchronise with live memory, reporting changed bytes flagged as code.
+    const uint8_t *live = mpu_->memory;
+    for (size_t addr = 0; addr < memory_size; ++addr)
+    {
+        if (code_at_address_[addr] && (memory_snapshot_[addr] != live[addr]))
+        {
+            code_modified_at(addr);
+        }
+        memory_snapshot_[addr] = live[addr];
+    }
+}
+
+Function *FunctionManager::build_function_internal(
+    uint16_t address, const uint8_t *ct_memory)
+{   // Builds and registers the Function for 'address'; returns it unowned.
+    // 'address', not registers.pc: the PC may have moved since the request.
+    TRACE("Building Function for code at 0x" << std::hex << std::setfill('0') <<
+          std::setw(4) << address);
+    FunctionBuilder fb(mpu_, ct_memory, code_at_address_, address);
+    boost::shared_ptr<Function> f(fb.build());
+    add_function(f);
+    return f.get();
+}
+
+Function *FunctionManager::build_function(uint16_t address, 
+                                          const uint8_t *ct_memory)
+{   // Build (and, if it destroyed itself, rebuild) the Function at 'address'.
+    Function *f;
+    int pass = 0;
+    do
+    {
+        assert(pass < 2); // at most one rebuild is expected
+        ++pass;
+
+        f = build_function_internal(address, ct_memory);
+
+        bool f_is_optimistic_self_writer = false;
+        const AddressSet &code_range = f->code_range();
+        for (AddressSet::const_iterator it = code_range.begin();
+             it != code_range.end(); ++it)
+        {
+            uint16_t i = *it;
+            if (code_at_address_[i] && 
+                !optimistic_writers_for_address_[i].empty())
+            {
+                // There is now code at an address where optimistic writes are
+                // performed. Future code generation won't create optimistic
+                // writes there because code_at_address_[i] has now been set,
+                // but we need to destroy existing functions which perform
+                // that write so they will be regenerated.
+                const FunctionSet &optimistic_writers = 
+                    optimistic_writers_for_address_[i];
+                f_is_optimistic_self_writer = 
+                    (optimistic_writers.find(f) != optimistic_writers.end());
+                destroy_functions_in_set(optimistic_writers_for_address_[i]);
+                if (f_is_optimistic_self_writer)
+                {
+                    // destroy_functions_in_set() has now destroyed f, so a)
+                    // code_range is no longer a valid reference b) there's
+                    // no need to continue iterating over f's code range.
+                    break;
+                }
+
+            }
+        }
+
+        // We might just have destroyed the function we built, if it modified
+        // its own code, so we need to loop round if so.
+        f = function_for_address_[address];
+        if (f == 0)
+        {
+            assert(f_is_optimistic_self_writer);
+            TRACE("Rebuilding just-created function");
+        }
+    }
+    while (f == 0);
+
+    TRACE(f->dump_all());
+
+    return f;
+}
+
+void FunctionManager::build_function_lazy(uint16_t address)
+{
+    assert(jit_thread_idle());
+
+    TRACE("Will build Function for address 0x" << std::hex <<
+          std::setfill('0') << std::setw(4) << address << " in background");
+
+    // The JIT thread is started on first use only, so it never exists when
+    // the library is used purely in interpreted or compiled mode.
+    if (jit_thread_.get_id() == boost::thread::id())
+    {
+        TRACE("Creating JIT thread");
+        boost::thread started(
+            std::mem_fun(&FunctionManager::build_function_thread), this);
+        jit_thread_.swap(started);
+    }
+
+    {   // Mark the thread busy before handing it any work.
+        boost::mutex::scoped_lock idle_lock(jit_thread_idle_mutex_);
+        jit_thread_idle_ = false;
+    }
+    {   // Publish the request under the condition variable's mutex.
+        boost::mutex::scoped_lock cv_lock(jit_thread_cv_mutex_);
+        work_available_ = true;
+        jit_thread_address_ = address;
+    }
+    jit_thread_cv_.notify_all();
+}
+
+void FunctionManager::build_function_thread()
+{   // Body of the JIT thread: build Functions on request until told to quit.
+    try
+    {
+        TRACE("JIT thread started");
+        boost::mutex::scoped_lock jit_thread_cv_mutex_lock(
+            jit_thread_cv_mutex_);
+        while (true)
+        {
+            while (!quit_ && !work_available_)
+            {
+                TRACE("JIT thread waiting to be signalled");
+                jit_thread_cv_.wait(jit_thread_cv_mutex_lock);
+            }
+
+            if (quit_)
+            {
+                TRACE("JIT thread quitting");
+                return;
+            }
+            else
+            {
+                TRACE("JIT thread about to build Function at address 0x" <<
+                      std::hex << std::setfill('0') << std::setw(4) << 
+                      jit_thread_address_);
+                assert(work_available_);
+                assert(!jit_thread_idle_);
+
+                // Note that we translate code from memory_snapshot_
+                // not mpu_->memory. This is important, even though we
+                // have update_memory_snapshot() which "should" invalidate
+                // Function objects which depend on modified code before any
+                // of them are used. The reason is that if a memory location
+                // is temporarily modified by the interpreter before it can
+                // be translated, then modified back to its original value
+                // by the interpreter before update_memory_snapshot() is
+                // called, update_memory_snapshot() can't notice the change,
+                // but the change has been compiled into the Function object.
+                // (See test/z-self-modify-2.xa; this breaks in hybrid mode
+                // if memory_snapshot_ isn't used here.)
+                build_function(jit_thread_address_, memory_snapshot_);
+                work_available_ = false;
+
+                boost::mutex::scoped_lock jit_thread_idle_lock(
+                    jit_thread_idle_mutex_);
+                jit_thread_idle_ = true; // now visible via jit_thread_idle()
+            }
+        }
+    }
+    catch (std::exception &e)
+    {
+        die(e.what()); // report the exception's message through die()
+    }
+}
+
+void FunctionManager::add_function(const boost::shared_ptr<Function> &f)
+{
+    Function *const raw = f.get();
+    function_for_address_[f->address()] = raw;
+    function_for_address_owner_[f->address()] = f;
+    const AddressSet &covered = f->code_range();
+    for (AddressSet::const_iterator pos = covered.begin();
+         pos != covered.end(); ++pos)
+    {   // Index the function by every address in its code range.
+        const uint16_t addr = *pos;
+        functions_covering_address_[addr].insert(raw);
+        code_at_address_[addr] = true;
+    }
+
+    const AddressSet &unchecked = f->optimistic_writes();
+    for (AddressSet::const_iterator pos = unchecked.begin();
+         pos != unchecked.end(); ++pos)
+    {   // Likewise for every address it writes to optimistically.
+        const uint16_t addr = *pos;
+        optimistic_writers_for_address_[addr].insert(raw);
+    }
+}
+
+void FunctionManager::code_modified_at(uint16_t address)
+{
+    // Callers don't always check code_at_address_[address] first. An early
+    // return for the false case would be possible, but is unlikely to make
+    // a significant difference to performance.
+
+    TRACE("Code modified at 0x" << std::hex << std::setfill('0') <<
+          std::setw(4) << address);
+
+    FunctionSet &stale = functions_covering_address_[address];
+    destroy_functions_in_set(stale);
+
+    // Record the new byte in the snapshot; otherwise the next
+    // update_memory_snapshot() would needlessly destroy valid Functions.
+    memory_snapshot_[address] = mpu_->memory[address];
+}
+
+void FunctionManager::destroy_functions_in_set(FunctionSet &function_set)
+{
+    // destroy_function() erases its argument from function_set, so no
+    // iterator can be kept across the call; keep taking the first element.
+    while (!function_set.empty())
+    {
+        Function *const doomed = *function_set.begin();
+        destroy_function(doomed);
+    }
+}
+
+void FunctionManager::destroy_function(Function *f)
+{   // Remove f from every per-address index, then release its owner.
+    const AddressSet &code_range = f->code_range();
+    for (AddressSet::const_iterator it = code_range.begin(); 
+         it != code_range.end(); ++it)
+    {
+        uint16_t i = *it;
+        size_t erased_count = functions_covering_address_[i].erase(f);
+        ASSERT_EQUAL(erased_count, 1);
+        // We do *not* clear code_at_address_[i] even if
+        // functions_covering_address_[i] is now empty; this records the fact
+        // that we have executed code at this address. This is critical for
+        // the current implementation of build_function(); code_at_address_
+        // being set is used to control optimistic vs non-optimistic writes,
+        // and if code_at_address_ was cleared when a function was destroyed
+        // a self-modifying function would cause an infinite loop inside
+        // build_function(). It would be OK to clear code_at_address_ for any
+        // addresses with empty functions_covering_address_ sets at the end
+        // of build_function(), but we currently don't.
+    }
+
+    const AddressSet &optimistic_writes = f->optimistic_writes();
+    for (AddressSet::const_iterator it = optimistic_writes.begin();
+         it != optimistic_writes.end(); ++it)
+    {
+        uint16_t i = *it;
+        size_t erased_count = optimistic_writers_for_address_[i].erase(f);
+        ASSERT_EQUAL(erased_count, 1);
+    }
+
+    assert(function_for_address_[f->address()] == f);
+    function_for_address_[f->address()] = 0;
+    // Do this last as it will cause the Function object to be deleted.
+    assert(function_for_address_owner_[f->address()].get() == f);
+    function_for_address_owner_[f->address()].reset();
+}
diff --git a/FunctionManager.h b/FunctionManager.h
new file mode 100644
index 0000000..141fe7a
--- /dev/null
+++ b/FunctionManager.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef FUNCTIONMANAGER_H
+#define FUNCTIONMANAGER_H
+
+#include <assert.h>
+#include <boost/shared_ptr.hpp>
+#include <boost/thread/condition_variable.hpp>
+#include <boost/thread/mutex.hpp>
+#include <boost/thread/thread.hpp>
+#include <boost/utility.hpp>
+#include <set>
+#include <stdint.h>
+
+#include "const.h"
+#include "JitBool.h"
+#include "lib6502.h"
+
+class Function;
+
+// Keeps the per-address tables of Function objects for one M6502 and hands
+// them out on request; see the comments on the individual members below.
+class FunctionManager : boost::noncopyable
+{
+public:
+    FunctionManager(M6502 *mpu);
+    ~FunctionManager();
+
+    bool jit_thread_idle();
+
+    void update_memory_snapshot();
+
+    // Look up the Function object for the code starting at 'address',
+    // creating it first if it does not already exist. Never returns null.
+    Function *get_function(uint16_t address)
+    {
+        Function *const existing = function_for_address_[address];
+        if (existing == 0)
+        {
+            return build_function(address, mpu_->memory);
+        }
+        return existing;
+    }
+
+    // Look up the Function object for the code starting at 'address' and
+    // return it if one is available; otherwise return null. After a null
+    // return a background thread may be used to generate the Function
+    // object, so that a repeat of the request in the future can succeed.
+    //
+    // Precondition: the last call to jit_thread_idle() returned true, and
+    // get_function_lazy() has not been called since that call.
+    //
+    // At present a null return *always* invokes the background thread, but
+    // callers must not rely on that. For example, we may wish to refuse to
+    // waste time building a Function object which we expect to be
+    // invalidated by self-modifying code shortly afterwards.
+    Function *get_function_lazy(uint16_t address)
+    {
+        // assert(jit_thread_idle()) would be perfectly correct here, but it
+        // single-handedly destroys the performance of a debug build and is
+        // just not *that* valuable, so it is intentionally left out.
+
+        Function *const existing = function_for_address_[address];
+        if (existing != 0)
+        {
+            return existing;
+        }
+
+        build_function_lazy(address);
+        return 0;
+    }
+
+    void code_modified_at(uint16_t address);
+
+private:
+    typedef std::set<Function *> FunctionSet;
+
+    void add_function(const boost::shared_ptr<Function> &f);
+
+    Function *build_function(uint16_t address, const uint8_t *ct_memory);
+    Function *build_function_internal(uint16_t address,
+                                      const uint8_t *ct_memory);
+
+    void build_function_lazy(uint16_t address);
+    void build_function_thread();
+
+    void destroy_functions_in_set(FunctionSet &function_set);
+    void destroy_function(Function *f);
+
+    // NOTE: the data members below are kept in their original order, since
+    // declaration order determines construction and destruction order.
+
+    boost::thread jit_thread_;
+
+    boost::mutex jit_thread_idle_mutex_;
+    bool jit_thread_idle_;
+
+    boost::mutex jit_thread_cv_mutex_;
+    boost::condition_variable jit_thread_cv_;
+    bool work_available_;
+    uint16_t jit_thread_address_;
+    bool quit_;
+
+    M6502 *mpu_;
+
+    // Snapshot of the emulated CPU's memory. It is used to detect changes to
+    // already JITted code which happen in callbacks, and to avoid problems
+    // with JITting while the interpreter is running (in hybrid mode).
+    uint8_t memory_snapshot_[memory_size];
+
+    // The shared_ptr's in this array are what actually own the Function
+    // objects.
+    boost::shared_ptr<Function> function_for_address_owner_[memory_size];
+
+    // Raw-pointer array kept in parallel with the owning array above, so
+    // that we have the option to allow JITted code to access it.
+    Function *function_for_address_[memory_size];
+
+    // For each address, the Function objects which contain code generated
+    // based on that address, i.e. the ones invalidated by a store to it.
+    FunctionSet functions_covering_address_[memory_size];
+
+    // For each address, the Function objects which perform optimistic writes
+    // to it, i.e. the ones invalidated if the address turns out to be used
+    // to hold code.
+    FunctionSet optimistic_writers_for_address_[memory_size];
+
+    // Whether code has ever been executed at each address. Destroying all
+    // the functions in the matching functions_covering_address_ element does
+    // *not* clear the flag.
+    JitBool code_at_address_[memory_size];
+};
+
+#endif
diff --git a/JitBool.h b/JitBool.h
new file mode 100644
index 0000000..818008e
--- /dev/null
+++ b/JitBool.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+// JitBool is a typedef representing the type used for boolean flags in the
+// JITted code, i.e. the CPU flag values and the 'code modified at' flag for
+// each memory address. In reality this is not likely to change, but this at
+// least helps to identify code which needs to change to support a different
+// representation. FunctionBuilder.cpp also contains a number of helper
+// functions which depend on the underlying type of JitBool.
+
+#ifndef JITBOOL_H
+#define JITBOOL_H
+
+// JitBool is defined in terms of uint8_t, so pull in <stdint.h> here rather
+// than relying on every includer having included it beforehand.
+#include <stdint.h>
+
+// The flag type, plus the two values used to represent false and true.
+typedef uint8_t JitBool;
+const JitBool jit_bool_false = 0;
+const JitBool jit_bool_true = 1;
+
+#endif
diff --git a/LLVMStuff.cpp b/LLVMStuff.cpp
new file mode 100644
index 0000000..faebdf8
--- /dev/null
+++ b/LLVMStuff.cpp
@@ -0,0 +1,41 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "LLVMStuff.h"
+
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/TargetSelect.h"
+
+// Create the "lib6502-jit" module and the IR builder on the global LLVM
+// context, initialise the native target, and then create an ExecutionEngine
+// for the module. Throws std::runtime_error, carrying LLVM's error string,
+// if the ExecutionEngine cannot be created.
+LLVMStuff::LLVMStuff()
+: module_(new llvm::Module("lib6502-jit", llvm::getGlobalContext())),
+  builder_(llvm::getGlobalContext())
+{
+    llvm::InitializeNativeTarget();
+
+    // setErrorStr() gives the builder somewhere to report why create()
+    // failed; the string is only consulted on the failure path below.
+    std::string engine_error;
+    llvm::EngineBuilder engine_builder(module_.get());
+    engine_builder.setErrorStr(&engine_error);
+
+    execution_engine_ = engine_builder.create();
+    if (!execution_engine_)
+    {
+        const std::string prefix("Could not create LLVM ExecutionEngine: ");
+        throw std::runtime_error(prefix + engine_error);
+    }
+}
+
+// Intentionally empty: module_ is a shared_ptr and so needs no explicit
+// release, and execution_engine_ (a raw pointer) is not deleted here.
+LLVMStuff::~LLVMStuff()
+{
+}
diff --git a/LLVMStuff.h b/LLVMStuff.h
new file mode 100644
index 0000000..7ba9d31
--- /dev/null
+++ b/LLVMStuff.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef LLVMSTUFF_H
+#define LLVMSTUFF_H
+
+#include <boost/shared_ptr.hpp>
+#include <boost/utility.hpp>
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include <stdexcept>
+
+// Groups the LLVM execution engine, module and IR builder. The constructor
+// (see LLVMStuff.cpp) creates all three and throws std::runtime_error if the
+// execution engine cannot be created. The members are public.
+struct LLVMStuff : boost::noncopyable
+{
+    LLVMStuff();
+    ~LLVMStuff();
+
+    // Raw pointer assigned in the constructor body; the destructor is empty,
+    // so this struct never deletes it.
+    llvm::ExecutionEngine *execution_engine_;
+    // The module named "lib6502-jit"; its raw pointer is passed to the
+    // EngineBuilder when execution_engine_ is created.
+    boost::shared_ptr<llvm::Module> module_;
+    // IR builder constructed on the global LLVM context.
+    llvm::IRBuilder<> builder_;
+
+};
+
+#endif
diff --git a/M6502Internal.h b/M6502Internal.h
new file mode 100644
index 0000000..c54131c
--- /dev/null
+++ b/M6502Internal.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef M6502INTERNAL_H
+#define M6502INTERNAL_H
+
+#include "FunctionManager.h"
+#include "lib6502.h"
+#include "LLVMStuff.h"
+#include "Registers.h"
+
+// Private state attached to an M6502: the internal register set, the LLVM
+// objects, the FunctionManager for 'mpu', the execution mode and the
+// max_instructions_ setting.
+struct _M6502_Internal
+{
+    // Value max_instructions_ starts out with.
+    static const int default_max_instructions_ = 500;
+
+    // registers_ and llvm_stuff_ are default-constructed; the mode starts
+    // out as M6502_ModeHybrid.
+    _M6502_Internal(M6502 *mpu)
+    : function_manager_(mpu),
+      mode_(M6502_ModeHybrid),
+      max_instructions_(default_max_instructions_)
+    {
+    }
+
+    Registers registers_;
+    LLVMStuff llvm_stuff_;
+    FunctionManager function_manager_;
+
+    M6502_Mode mode_;
+    int max_instructions_;
+};
+
+#endif
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..879e06d
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,130 @@
+ACLOCAL_AMFLAGS = -I m4
+# The backquoted $(LLVMCONFIG) invocations below are not expanded by make
+# itself; they are left in the command line for the shell to run.
+# $(LLVMCONFIG) is presumably set by configure (the README mentions a
+# --with-llvm-config option) -- confirm against configure.ac.
+AM_CPPFLAGS = `$(LLVMCONFIG) --cppflags` $(BOOST_CPPFLAGS)
+# lib6502.c generates spurious warnings with -Wall, so we want -Wno-parentheses
+# too. It's not easy to have per-source-file build flags in automake, so we
+# just apply this to all C files.
+AM_CFLAGS = -Wall -Wno-parentheses
+AM_CXXFLAGS = `$(LLVMCONFIG) --cxxflags` -fexceptions -Wall
+AM_LDFLAGS = $(BOOST_THREAD_LDFLAGS)
+# LLVM and boost::thread libraries, applied to everything that gets linked.
+LIBS = `$(LLVMCONFIG) --ldflags --libs core jit native --system-libs` $(BOOST_THREAD_LIBS)
+
+# Some of these are included automatically, but I'd rather be explicit.
+EXTRA_DIST = \
+	examples/README \
+	COPYING \
+	CREDITS \
+	lib6502-compatibility.txt \
+	README \
+	README.lib6502 \
+	TODO \
+	man/* \
+	test/*.xa \
+	test/*.mst \
+	test/run-c-tests.sh \
+	test/run-run6502-tests.sh \
+	test/run-c-tests.py \
+	test/run-run6502-tests.py
+
+man1_MANS = man/*.1
+man3_MANS = man/*.3
+lib_LTLIBRARIES = lib6502-jit.la
+include_HEADERS = lib6502.h
+bin_PROGRAMS = run6502
+noinst_PROGRAMS = \
+	examples/lib1
+check_PROGRAMS = \
+	test/basic-callback \
+	test/call-illegal-callback-modify-code \
+	test/irq-nmi \
+	test/setjmp-trick \
+	test/stack-code-brk \
+	test/stack-code-jsr \
+	test/write-callback-modify-code
+
+lib6502_jit_la_SOURCES = \
+	AddressRange.cpp \
+	AddressRange.h \
+	AddressSet.cpp \
+	AddressSet.h \
+	const.h \
+	Function.cpp \
+	Function.h \
+	FunctionBuilder.cpp \
+	FunctionBuilder.h \
+	FunctionManager.cpp \
+	FunctionManager.h \
+	JitBool.h \
+	lib6502.c \
+	lib6502.h \
+	lib6502-jit.cpp \
+	LLVMStuff.cpp \
+	LLVMStuff.h \
+	M6502Internal.h \
+	Registers.cpp \
+	Registers.h \
+	util.cpp \
+	util.h \
+	valgrind.h
+
+# run6502 and each of the example/test programs below have only C sources but
+# override <program>_LINK with $(CXXLINK), so they are linked with the C++
+# link command -- presumably because lib6502-jit.la contains C++ objects.
+run6502_SOURCES = \
+	run6502.c
+run6502_LINK = $(CXXLINK)
+run6502_LDADD = lib6502-jit.la
+
+examples_lib1_SOURCES = \
+	examples/lib1.c
+examples_lib1_LINK = $(CXXLINK)
+examples_lib1_LDADD = lib6502-jit.la
+
+test_basic_callback_SOURCES = \
+	test/basic-callback.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_basic_callback_LINK = $(CXXLINK)
+test_basic_callback_LDADD = lib6502-jit.la
+
+test_call_illegal_callback_modify_code_SOURCES = \
+	test/call-illegal-callback-modify-code.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_call_illegal_callback_modify_code_LINK = $(CXXLINK)
+test_call_illegal_callback_modify_code_LDADD = lib6502-jit.la
+
+test_irq_nmi_SOURCES = \
+	test/irq-nmi.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_irq_nmi_LINK = $(CXXLINK)
+test_irq_nmi_LDADD = lib6502-jit.la
+
+test_setjmp_trick_SOURCES = \
+	test/setjmp-trick.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_setjmp_trick_LINK = $(CXXLINK)
+test_setjmp_trick_LDADD = lib6502-jit.la
+
+test_stack_code_brk_SOURCES = \
+	test/stack-code-brk.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_stack_code_brk_LINK = $(CXXLINK)
+test_stack_code_brk_LDADD = lib6502-jit.la
+
+test_stack_code_jsr_SOURCES = \
+	test/stack-code-jsr.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_stack_code_jsr_LINK = $(CXXLINK)
+test_stack_code_jsr_LDADD = lib6502-jit.la
+
+test_write_callback_modify_code_SOURCES = \
+	test/write-callback-modify-code.c \
+	test/test-utils.c \
+	test/test-utils.h
+test_write_callback_modify_code_LINK = $(CXXLINK)
+test_write_callback_modify_code_LDADD = lib6502-jit.la
+
+TESTS = \
+	test/run-c-tests.sh \
+	test/run-run6502-tests.sh
diff --git a/README b/README
new file mode 100644
index 0000000..0620f77
--- /dev/null
+++ b/README
@@ -0,0 +1,84 @@
+lib6502-jit is a (mostly) compatible implementation of Ian Piumarta's lib6502
+which uses LLVM to perform JIT compilation of 6502 machine code to host code.
+This will doubtless be useful to the large community of people stuck doing
+number-crunching tasks with legacy 6502 code. :-)
+
+README.lib6502 is a copy of the original lib6502 README. You should probably go
+and read that before reading any further.
+
+lib6502-compatibility.txt documents the differences between lib6502 and
+lib6502-jit.
+
+CREDITS contains acknowledgements of the various people and groups on whose
+work lib6502-jit is built.
+
+COPYING contains license details for lib6502-jit.
+
+TODO contains some notes on possible enhancements to lib6502-jit.
+
+How to build:
+
+You'll need the following installed:
+- a C/C++ compiler (I've tested with gcc 4.7.2, gcc 4.8.2 and clang 3.5)
+- LLVM development libraries (I've tested with various 3.5 pre-release snapshots)
+- boost (including boost::thread) (I've tested with 1.49, 1.54 and 1.55)
+
+I have somewhat reluctantly set up an autotools build system; compiling and
+linking against LLVM and boost::thread on different platforms was otherwise
+just that bit too fiddly. So in theory all you need to do is:
+
+    ./configure
+    make
+
+I suggest you actually do:
+    CFLAGS='-g -O3' CXXFLAGS='-g -O3' ./configure
+to increase the optimisation level. (I would have made that the default, but
+apparently that would go against user expectations for an autotools build
+system.)
+
+"make install" should work as well if you feel inclined to do so, but it's not
+necessary.
+
+I've tested on three platforms, and for what it's worth here are more detailed
+instructions for those:
+
+Ubuntu (14.04 x86):
+    apt-get install libboost-dev libboost-thread-dev llvm-3.5-dev libedit-dev
+    export CFLAGS='-g -O3' 
+    export CXXFLAGS='-g -O3' 
+    ./configure --with-llvm-config=llvm-config-3.5
+    make
+
+Debian (7.5 x86-64):
+    apt-get install libboost-dev libboost-thread-dev 
+    [I used the llvm-3.5-dev package from the wheezy repository here: http://llvm.org/apt/]
+    export CFLAGS='-g -O3' 
+    export CXXFLAGS='-g -O3' 
+    ./configure
+    make
+
+FreeBSD (10.0-RELEASE x86-64):
+    pkg install boost-all-1.55.0
+    pkg install llvm-devel-3.5.r203994
+    export CFLAGS='-g -O3' 
+    export CXXFLAGS='-g -O3' 
+    ./configure --with-llvm-config=/usr/local/llvm-devel/bin/llvm-config
+    make
+
+There are some tests which will run if you type "make check". Some will be
+skipped unless you have the "xa" assembler
+(http://www.floodgap.com/retrotech/xa/) on your PATH.
+
+The above assumes you downloaded a lib6502-jit*tar.bz2 package, which will
+contain a "configure" script. This is not (following what I understand to be
+best practice) checked into source control, so if you downloaded the source
+using something like git or svn, you need to either:
+- download the tarball - it will be much easier, especially if you're just
+  taking a quick look at lib6502-jit and don't plan to make changes to the code
+  (yet)
+- install autoconf, automake and libtool, then cross your fingers and run
+  "autoreconf -i", which will generate a "configure" script for you if you're
+  lucky.
+
+If you have any queries, comments or bug reports, please drop me (Steven
+Flintham) an e-mail at lib6502-jit@lemma.co.uk.
diff --git a/README.lib6502 b/README.lib6502
new file mode 100644
index 0000000..b79e595
--- /dev/null
+++ b/README.lib6502
@@ -0,0 +1,136 @@
+		lib6502 - 6502 Microprocessor Emulator
+
+			Version: 1.0
+
+
+WHAT IF I'M TOO LAZY TO READ 'README'S?
+
+	make
+	make install
+	more examples/README
+
+
+WHAT IS LIB6502?
+
+  lib6502 is a library that emulates the 6502 microprocessor.  It
+  comes with a small 'shell', run6502, that can execute 6502 programs
+  from the command line.
+
+  lib6502 is distributed under the MIT license: it is non-infectious
+  and will not make your projects contagious to others the instant you
+  choose to use lib6502 in them.  See the file COPYING for details.
+
+
+WHERE IS THE LATEST SOURCE CODE?
+
+  Source code for lib6502 is available from the author's home page at
+  'http://piumarta.com/software'.  You can download the most recent
+  release or use Subversion to get the very latest sources.
+
+
+WHERE IS THE DOCUMENTATION?
+
+  Manual pages for run6502 and lib6502 (and all the functions it
+  exports) should be available once it is installed.  Each includes a
+  short 'examples' section.  Use the 'man' command to read them.
+
+  Your best place to start looking for documentation on the 6502
+  itself is 'http://6502.org'.  A google search of the web will also
+  turn up vast quantities of information about (and programs for) the
+  6502.
+
+
+HOW DO I INSTALL IT?
+
+  It's not really big enough to warrant the whole 'configure' thing.
+  Any system with an ANSI compiler and C library should be able to
+  compile it out of the box.  After unpacking the archive, just type:
+
+	make
+
+  to build it.  If the compiler blows up immediately, edit the
+  Makefile and play with the '-g' and '-O' flags and then try again.
+  If you really can't make the compiler happy you've found a bug (read
+  the next section but one).  Otherwise, if you want to put it
+  somewhere more permanent then type:
+
+	make install
+
+  (as root) to install it.  It goes into /usr/local by default; if you
+  want it elsewhere then set PREFIX in the make command.  For example:
+
+	make install PREFIX=/usr
+
+  will put everything under '/usr'.
+
+  When you get bored with it, go back to the source directory and
+  type:
+
+	make uninstall
+
+  (with the same PREFIX you specified during the install, if
+  necessary.)
+
+
+WHAT CAN I DO WITH IT?
+
+  See the file examples/README for some suggestions (all of them polite).
+
+  If that leaves you wanting more, read the source for run6502 -- it
+  exercises just about every feature in lib6502.
+
+
+HOW DO I REPORT PROBLEMS?^W^WCONTACT THE ORIGINAL AUTHOR?
+
+  [If you wish to get in touch with the author of lib6502, this is the
+  address to use. Since lib6502-jit is based on lib6502 but has been
+  heavily modified, please do *not* report problems to this address;
+  use the address in README instead. -- Steve]
+
+  Send e-mail to the author at: firstName (at) lastName (dot) com
+
+  (For suitable values of firstName and lastName, see the last section
+  of this file.)
+
+  If you're still confused, contact him at: http://piumarta.com
+
+
+HOW CAN I HELP?
+
+  Use it.  Find bugs.  Fix bugs.  Make it faster.  Evangelism: spread
+  it to as many other projects as possible, especially those that
+  might be using a slower emulator!  Read the manual pages to see
+  what's considered missing, then add it, then send it in.
+
+  (One thing that would be really handy, and isn't mentioned in the
+  manual pages, is a test suite.  Figure out how to test every mode in
+  every instruction with every possible combination of operand values
+  and condition codes and verify the behaviour is correct.  Then write
+  it down in the form of a program and send it in.  If it's a
+  self-contained program that runs once to completion then we can
+  probably find some real hardware to test against the test suite.)
+
+  If you know how to write software that emulates peripheral hardware
+  devices, google up some details on the popular 6502-based
+  microcomputers (Acorn, Commodore, etc.) and add some serious system
+  emulation to run6502.  Make it all pluggable (think dynamic
+  libraries over an 'agnostic' core), so we can change machines at the
+  flip of a (command-line) switch.  (The callback mechanism in lib6502
+  was designed with this kind of 'pluggable hardware emulation' in
+  mind.)
+
+
+WHO WROTE THIS STUFF, AND WHY?
+
+  lib6502 was written by Ian Piumarta.
+
+  While writing ccg (an entirely different project that creates
+  runtime assemblers for dynamic code generators) he decided to
+  include support for an 8-bit microprocessor, just for fun.  He chose
+  the 6502 because it was used in the first computer he owned and
+  programmed (an Ohio Scientific Superboard II, when he was 14) as
+  well as the second (an Acorn 'BBC Model B', about four years later).
+  lib6502 started as a 'glorified switch statement' that ran some
+  small test programs spewed into memory by ccg, but rapidly got out
+  of control over the course of a weekend.  You're looking at the
+  result.
diff --git a/Registers.cpp b/Registers.cpp
new file mode 100644
index 0000000..7070557
--- /dev/null
+++ b/Registers.cpp
@@ -0,0 +1,59 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "Registers.h"
+
+#include "const.h"
+#include "lib6502.h"
+#include "M6502Internal.h"
+
+// Copy the internal register state into the lib6502-style register block
+// *(mpu->registers), packing the individual flags into the 'p' byte.
+//
+// Note that the state is read from mpu->internal->registers_ rather than
+// from *this; presumably these are always the same object - confirm against
+// the callers.
+void Registers::to_M6502_Registers(M6502 *mpu) const
+{
+    const Registers &internal = mpu->internal->registers_;
+    M6502_Registers &external = *(mpu->registers);
+
+    external.pc = internal.pc;
+    external.a = internal.a;
+    external.x = internal.x;
+    external.y = internal.y;
+    external.s = internal.s;
+
+    // Rebuild the packed status byte from scratch; only the six flags
+    // tracked individually can end up set.
+    external.p = 0;
+    if (internal.flag_n) { external.p |= flagN; }
+    if (internal.flag_v) { external.p |= flagV; }
+    if (internal.flag_d) { external.p |= flagD; }
+    if (internal.flag_i) { external.p |= flagI; }
+    if (internal.flag_z) { external.p |= flagZ; }
+    if (internal.flag_c) { external.p |= flagC; }
+}
+
+// Load the internal register state from the lib6502-style register block
+// *(mpu->registers), unpacking the 'p' byte into one JitBool per flag.
+//
+// Note that the state is written to mpu->internal->registers_ rather than
+// to *this; presumably these are always the same object - confirm against
+// the callers.
+void Registers::from_M6502_Registers(const M6502 *mpu)
+{
+    const M6502_Registers &external = *(mpu->registers);
+    Registers &internal = mpu->internal->registers_;
+
+    internal.pc = external.pc;
+    internal.a = external.a;
+    internal.x = external.x;
+    internal.y = external.y;
+    internal.s = external.s;
+
+    // Each flag becomes exactly jit_bool_true (1) or jit_bool_false (0).
+    internal.flag_n = ((external.p & flagN) != 0) ? jit_bool_true : jit_bool_false;
+    internal.flag_v = ((external.p & flagV) != 0) ? jit_bool_true : jit_bool_false;
+    internal.flag_d = ((external.p & flagD) != 0) ? jit_bool_true : jit_bool_false;
+    internal.flag_i = ((external.p & flagI) != 0) ? jit_bool_true : jit_bool_false;
+    internal.flag_z = ((external.p & flagZ) != 0) ? jit_bool_true : jit_bool_false;
+    internal.flag_c = ((external.p & flagC) != 0) ? jit_bool_true : jit_bool_false;
+}
diff --git a/Registers.h b/Registers.h
new file mode 100644
index 0000000..467065a
--- /dev/null
+++ b/Registers.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef REGISTERS_H 
+#define REGISTERS_H
+
+#include <boost/utility.hpp>
+#include <stdint.h>
+
+#include "JitBool.h"
+
+typedef struct _M6502 M6502;
+
+// Internal register state. The a/x/y/s/pc fields correspond to the fields of
+// the same names in M6502_Registers, while the status flags are held as one
+// JitBool each instead of as a packed byte. No constructor is declared, so
+// this struct does not itself give the fields initial values.
+struct Registers : boost::noncopyable
+{
+    uint8_t a;
+    uint8_t x;
+    uint8_t y;
+    uint8_t s;
+    // One JitBool per status flag; Registers.cpp converts these to and from
+    // the flagN/flagV/flagD/flagI/flagZ/flagC bits of M6502_Registers::p.
+    JitBool flag_n;
+    JitBool flag_v;
+    JitBool flag_d;
+    JitBool flag_i;
+    JitBool flag_z;
+    JitBool flag_c;
+    uint16_t pc;
+
+    // Pseudo-registers used to communicate state for callbacks; see the
+    // comment describing the Result enumeration in FunctionBuilder.h.
+    uint16_t addr;
+    uint8_t data;
+
+    // Copy the register state out to / in from *(mpu->registers). As
+    // implemented in Registers.cpp, both functions operate on
+    // mpu->internal->registers_ rather than on *this.
+    void to_M6502_Registers(M6502 *mpu) const;
+    void from_M6502_Registers(const M6502 *mpu);
+};
+
+#endif
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..d57ecb6
--- /dev/null
+++ b/TODO
@@ -0,0 +1,67 @@
+It would be interesting to see if this works OK on an ARM machine.
+
+
+Running e.g. z-self-modify-1 to completion in -mc -mx 1 mode shows the memory
+for the run6502 process grows steadily, but valgrind doesn't show any leaks. A
+quick web search suggests this might be internal leaks in LLVM (which are only
+exposed by things like this which continually JIT). I am inclined to leave this
+and perhaps come back to it once LLVM 3.5 is actually released; if there's
+still a problem then it might be worth tracking it down.
+
+
+Would it be helpful to pass branch weights to CreateCondBr()? For example,
+where we have a computed address which might trigger a read/write callback, we
+could calculate the proportion of addresses in the address range which have
+callbacks on them and use that as the probability of taking the callback-exists
+branch.
+
+
+We could potentially use Function objects to deduce properties of stretches of
+code and use that information to improve the generated code. For example, if we
+observed that a Function object didn't contain any external calls or any
+stack-modification instructions except RTS then we could inline it in any
+callers (adding its code ranges to their code ranges, of course) and the RTS
+could be a no-op. (For 100% accuracy, the JSR should still push the return
+address on the stack but not modify the stack pointer. Code executed later on
+might peek at the stack and expect those values to be there.) This might in
+turn allow the callers of that Function to be inlined themselves. This is just
+an example. It may be that in practice deciding when to re-translate code would
+cause a sufficient performance impact to just not be worth it in the first
+place.
+
+
+We could add support for counting the number of cycles executed by the JITted
+code; lib6502 itself has some support for this in the form of the tick* macros,
+but they don't do anything by default.
+
+
+Would there be any performance improvement to be had by having Function objects
+(tail) call one another where possible?
+
+
+Hybrid mode currently makes no attempt to avoid re-generating Function objects
+which are continually being invalidated due to self-modifying code. It might be
+nice if some heuristic caused us to avoid this unnecessary work and just let
+the interpreter always handle that code.
+
+On a related but distinct note, currently once an element of
+FunctionManager::code_at_address_ is set, it is never cleared. This might cause
+us to avoid optimistic writes which in reality would be OK. We could use some
+heuristic to decide when to destroy Function objects which have not been
+executed in a long time, and start clearing code_at_address_ elements when all
+functions covering an address are removed. (See the note in
+FunctionManager::destroy_function(); this clearing must be done *outside* the
+loop in FunctionManager::build_function(), or the implementation of
+build_function() must be tweaked.)
+
+However, it may be that it just isn't worth being that clever. Any such code
+would need to be triggered inside the main loop between executions of Function
+objects. We could do it only every nth time, and keeping track of how many
+times we've been round probably wouldn't significantly harm performance, but be
+careful.
+
+
+Would a different default value for max_instructions be better?
+
+
+Are there any other LLVM optimisation passes which would be helpful?
diff --git a/build-aux/tap-driver.sh b/build-aux/tap-driver.sh
new file mode 100755
index 0000000..c011298
--- /dev/null
+++ b/build-aux/tap-driver.sh
@@ -0,0 +1,649 @@
+#! /bin/sh
+# Copyright (C) 2011 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# This file is maintained in Automake, please report
+# bugs to <bug-automake@gnu.org> or send patches to
+# <automake-patches@gnu.org>.
+
+scriptversion=2011-12-27.17; # UTC
+
+# Make unconditional expansion of undefined variables an error.  This
+# helps a lot in preventing typo-related bugs.
+set -u
+
+me=tap-driver.sh
+
+fatal ()
+{
+  echo "$me: fatal: $*" >&2
+  exit 1
+}
+
+usage_error ()
+{
+  echo "$me: $*" >&2
+  print_usage >&2
+  exit 2
+}
+
+print_usage ()
+{
+  cat <<END
+Usage:
+  tap-driver.sh --test-name=NAME --log-file=PATH --trs-file=PATH
+                [--expect-failure={yes|no}] [--color-tests={yes|no}]
+                [--enable-hard-errors={yes|no}] [--ignore-exit]
+                [--diagnostic-string=STRING] [--merge|--no-merge]
+                [--comments|--no-comments] [--] TEST-COMMAND
+The \`--test-name', \`--log-file' and \`--trs-file' options are mandatory.
+END
+}
+
+# TODO: better error handling in option parsing (in particular, ensure
+# TODO: $log_file, $trs_file and $test_name are defined).
+test_name= # Used for reporting.
+log_file=  # Where to save the result and output of the test script.
+trs_file=  # Where to save the metadata of the test run.
+expect_failure=0
+color_tests=0
+merge=0
+ignore_exit=0
+comments=0
+diag_string='#'
+while test $# -gt 0; do
+  case $1 in
+  --help) print_usage; exit $?;;
+  --version) echo "$me $scriptversion"; exit $?;;
+  --test-name) test_name=$2; shift;;
+  --log-file) log_file=$2; shift;;
+  --trs-file) trs_file=$2; shift;;
+  --color-tests) color_tests=$2; shift;;
+  --expect-failure) expect_failure=$2; shift;;
+  --enable-hard-errors) shift;; # No-op.
+  --merge) merge=1;;
+  --no-merge) merge=0;;
+  --ignore-exit) ignore_exit=1;;
+  --comments) comments=1;;
+  --no-comments) comments=0;;
+  --diagnostic-string) diag_string=$2; shift;;
+  --) shift; break;;
+  -*) usage_error "invalid option: '$1'";;
+  esac
+  shift
+done
+
+test $# -gt 0 || usage_error "missing test command"
+
+case $expect_failure in
+  yes) expect_failure=1;;
+    *) expect_failure=0;;
+esac
+
+if test $color_tests = yes; then
+  init_colors='
+    color_map["red"]="[0;31m" # Red.
+    color_map["grn"]="[0;32m" # Green.
+    color_map["lgn"]="[1;32m" # Light green.
+    color_map["blu"]="[1;34m" # Blue.
+    color_map["mgn"]="[0;35m" # Magenta.
+    color_map["std"]="[m"     # No color.
+    color_for_result["ERROR"] = "mgn"
+    color_for_result["PASS"]  = "grn"
+    color_for_result["XPASS"] = "red"
+    color_for_result["FAIL"]  = "red"
+    color_for_result["XFAIL"] = "lgn"
+    color_for_result["SKIP"]  = "blu"'
+else
+  init_colors=''
+fi
+
+{
+  (
+    # Ignore common signals (in this subshell only!), to avoid potential
+    # problems with Korn shells.  Some Korn shells are known to propagate
+    # to themselves signals that have killed a child process they were
+    # waiting for; this is done at least for SIGINT (and usually only for
+    # it, in truth).  Without the `trap' below, such a behaviour could
+    # cause a premature exit in the current subshell, e.g., in case the
+    # test command it runs gets terminated by a SIGINT.  Thus, the awk
+    # script we are piping into would never see the exit status it
+    # expects on its last input line (which is displayed below by the
+    # last `echo $?' statement), and would thus die reporting an internal
+    # error.
+    # For more information, see the Autoconf manual and the threads:
+    # <http://lists.gnu.org/archive/html/bug-autoconf/2011-09/msg00004.html>
+    # <http://mail.opensolaris.org/pipermail/ksh93-integration-discuss/2009-February/004121.html>
+    trap : 1 3 2 13 15
+    if test $merge -gt 0; then
+      exec 2>&1
+    else
+      exec 2>&3
+    fi
+    "$@"
+    echo $?
+  ) | LC_ALL=C ${AM_TAP_AWK-awk} \
+        -v me="$me" \
+        -v test_script_name="$test_name" \
+        -v log_file="$log_file" \
+        -v trs_file="$trs_file" \
+        -v expect_failure="$expect_failure" \
+        -v merge="$merge" \
+        -v ignore_exit="$ignore_exit" \
+        -v comments="$comments" \
+        -v diag_string="$diag_string" \
+'
+# FIXME: the usages of "cat >&3" below could be optimized when using
+# FIXME: GNU awk, and/or on systems that support /dev/fd/.
+
+# Implementation note: in what follows, `result_obj` will be an
+# associative array that (partly) simulates a TAP result object
+# from the `TAP::Parser` perl module.
+
+## ----------- ##
+##  FUNCTIONS  ##
+## ----------- ##
+
+function fatal(msg)
+{
+  print me ": " msg | "cat >&2"
+  exit 1
+}
+
+function abort(where)
+{
+  fatal("internal error " where)
+}
+
+# Convert a boolean to a "yes"/"no" string.
+function yn(bool)
+{
+  return bool ? "yes" : "no";
+}
+
+function add_test_result(result)
+{
+  if (!test_results_index)
+    test_results_index = 0
+  test_results_list[test_results_index] = result
+  test_results_index += 1
+  test_results_seen[result] = 1;
+}
+
+# Whether the test script should be re-run by "make recheck".
+function must_recheck()
+{
+  for (k in test_results_seen)
+    if (k != "XFAIL" && k != "PASS" && k != "SKIP")
+      return 1
+  return 0
+}
+
+# Whether the content of the log file associated to this test should
+# be copied into the "global" test-suite.log.
+function copy_in_global_log()
+{
+  for (k in test_results_seen)
+    if (k != "PASS")
+      return 1
+  return 0
+}
+
+# FIXME: this can certainly be improved ...
+function get_global_test_result()
+{
+    if ("ERROR" in test_results_seen)
+      return "ERROR"
+    if ("FAIL" in test_results_seen || "XPASS" in test_results_seen)
+      return "FAIL" # An unexpected pass counts as a failure overall.
+    all_skipped = 1
+    for (k in test_results_seen)
+      if (k != "SKIP")
+        all_skipped = 0
+    if (all_skipped)
+      return "SKIP" # Also reached when no result was recorded at all.
+    return "PASS";
+}
+
+function stringify_result_obj(result_obj)
+{
+  if (result_obj["is_unplanned"] || result_obj["number"] != testno)
+    return "ERROR"
+
+  if (plan_seen == LATE_PLAN)
+    return "ERROR"
+
+  if (result_obj["directive"] == "TODO")
+    return result_obj["is_ok"] ? "XPASS" : "XFAIL"
+
+  if (result_obj["directive"] == "SKIP")
+    return result_obj["is_ok"] ? "SKIP" : COOKED_FAIL;
+
+  if (length(result_obj["directive"]))
+      abort("in function stringify_result_obj()")
+
+  return result_obj["is_ok"] ? COOKED_PASS : COOKED_FAIL
+}
+
+function decorate_result(result)
+{
+  color_name = color_for_result[result]
+  if (color_name)
+    return color_map[color_name] "" result "" color_map["std"]
+  # If we are not using colorized output, or if we do not know how
+  # to colorize the given result, we should return it unchanged.
+  return result
+}
+
+function report(result, details)
+{
+  if (result ~ /^(X?(PASS|FAIL)|SKIP|ERROR)/)
+    {
+      msg = ": " test_script_name
+      add_test_result(result)
+    }
+  else if (result == "#")
+    {
+      msg = " " test_script_name ":"
+    }
+  else
+    {
+      abort("in function report()")
+    }
+  if (length(details))
+    msg = msg " " details
+  # Output on console might be colorized.
+  print decorate_result(result) msg
+  # Log the result in the log file too, to help debugging (this is
+  # especially true when said result is a TAP error or "Bail out!").
+  print result msg | "cat >&3";
+}
+
+function testsuite_error(error_message)
+{
+  report("ERROR", "- " error_message)
+}
+
+function handle_tap_result()
+{
+  details = result_obj["number"];
+  if (length(result_obj["description"]))
+    details = details " " result_obj["description"]
+
+  if (plan_seen == LATE_PLAN)
+    {
+      details = details " # AFTER LATE PLAN";
+    }
+  else if (result_obj["is_unplanned"])
+    {
+       details = details " # UNPLANNED";
+    }
+  else if (result_obj["number"] != testno)
+    {
+       details = sprintf("%s # OUT-OF-ORDER (expecting %d)",
+                         details, testno);
+    }
+  else if (result_obj["directive"])
+    {
+      details = details " # " result_obj["directive"];
+      if (length(result_obj["explanation"]))
+        details = details " " result_obj["explanation"]
+    }
+
+  report(stringify_result_obj(result_obj), details)
+}
+
+# `skip_reason` should be empty whenever planned > 0.
+function handle_tap_plan(planned, skip_reason)
+{
+  planned += 0 # Avoid getting confused if, say, `planned` is "00"
+  if (length(skip_reason) && planned > 0)
+    abort("in function handle_tap_plan()")
+  if (plan_seen)
+    {
+      # Error, only one plan per stream is acceptable.
+      testsuite_error("multiple test plans")
+      return;
+    }
+  planned_tests = planned
+  # The TAP plan can come before or after *all* the TAP results; we speak
+  # respectively of an "early" or a "late" plan.  If we see the plan line
+  # after at least one TAP result has been seen, assume we have a late
+  # plan; in this case, any further test result seen after the plan will
+  # be flagged as an error.
+  plan_seen = (testno >= 1 ? LATE_PLAN : EARLY_PLAN)
+  # If testno > 0, we have an error ("too many tests run") that will be
+  # automatically dealt with later, so do not worry about it here.  If
+  # $plan_seen is true, we have an error due to a repeated plan, and that
+  # has already been dealt with above.  Otherwise, we have a valid "plan
+  # with SKIP" specification, and should report it as a particular kind
+  # of SKIP result.
+  if (planned == 0 && testno == 0)
+    {
+      if (length(skip_reason))
+        skip_reason = "- "  skip_reason;
+      report("SKIP", skip_reason);
+    }
+}
+
+function extract_tap_comment(line)
+{
+  if (index(line, diag_string) == 1)
+    {
+      # Strip leading `diag_string` from `line`.
+      line = substr(line, length(diag_string) + 1)
+      # And strip any leading and trailing whitespace left.
+      sub("^[ \t]*", "", line)
+      sub("[ \t]*$", "", line)
+      # Return what is left (if any).
+      return line;
+    }
+  return "";
+}
+
+# When this function is called, we know that line is a TAP result line,
+# so that it matches the (perl) RE "^(not )?ok\b".
+function setup_result_obj(line)
+{
+  # Get the result, and remove it from the line.
+  result_obj["is_ok"] = (substr(line, 1, 2) == "ok" ? 1 : 0)
+  sub("^(not )?ok[ \t]*", "", line)
+
+  # If the result has an explicit number, get it and strip it; otherwise,
+  # automatically assign the next progressive number to it.
+  if (line ~ /^[0-9]+$/ || line ~ /^[0-9]+[^a-zA-Z0-9_]/)
+    {
+      match(line, "^[0-9]+")
+      # The final `+ 0` is to normalize numbers with leading zeros.
+      result_obj["number"] = substr(line, 1, RLENGTH) + 0
+      line = substr(line, RLENGTH + 1)
+    }
+  else
+    {
+      result_obj["number"] = testno
+    }
+
+  if (plan_seen == LATE_PLAN)
+    # No further test results are acceptable after a "late" TAP plan
+    # has been seen.
+    result_obj["is_unplanned"] = 1
+  else if (plan_seen && testno > planned_tests)
+    result_obj["is_unplanned"] = 1
+  else
+    result_obj["is_unplanned"] = 0
+
+  # Strip trailing and leading whitespace.
+  sub("^[ \t]*", "", line)
+  sub("[ \t]*$", "", line)
+
+  # This will have to be corrected if we have a "TODO"/"SKIP" directive.
+  result_obj["description"] = line
+  result_obj["directive"] = ""
+  result_obj["explanation"] = ""
+
+  if (index(line, "#") == 0)
+    return # No possible directive, nothing more to do.
+
+  # Directives are case-insensitive.
+  rx = "[ \t]*#[ \t]*([tT][oO][dD][oO]|[sS][kK][iI][pP])[ \t]*"
+
+  # See whether we have the directive, and if yes, where.
+  pos = match(line, rx "$")
+  if (!pos)
+    pos = match(line, rx "[^a-zA-Z0-9_]")
+
+  # If there was no TAP directive, we have nothing more to do.
+  if (!pos)
+    return
+
+  # Let`s now see if the TAP directive has been escaped.  For example:
+  #  escaped:     ok \# SKIP
+  #  not escaped: ok \\# SKIP
+  #  escaped:     ok \\\\\# SKIP
+  #  not escaped: ok \ # SKIP
+  if (substr(line, pos, 1) == "#")
+    {
+      bslash_count = 0
+      for (i = pos; i > 1 && substr(line, i - 1, 1) == "\\"; i--)
+        bslash_count += 1
+      if (bslash_count % 2)
+        return # Directive was escaped.
+    }
+
+  # Strip the directive and its explanation (if any) from the test
+  # description.
+  result_obj["description"] = substr(line, 1, pos - 1)
+  # Now remove the test description from the line, that has been dealt
+  # with already.
+  line = substr(line, pos)
+  # Strip the directive, and save its value (normalized to upper case).
+  sub("^[ \t]*#[ \t]*", "", line)
+  result_obj["directive"] = toupper(substr(line, 1, 4))
+  line = substr(line, 5)
+  # Now get the explanation for the directive (if any), with leading
+  # and trailing whitespace removed.
+  sub("^[ \t]*", "", line)
+  sub("[ \t]*$", "", line)
+  result_obj["explanation"] = line
+}
+
+function get_test_exit_message(status)
+{
+  if (status == 0)
+    return ""
+  if (status !~ /^[1-9][0-9]*$/)
+    abort("getting exit status")
+  if (status < 127)
+    exit_details = ""
+  else if (status == 127)
+    exit_details = " (command not found?)"
+  else if (status >= 128 && status <= 255)
+    exit_details = sprintf(" (terminated by signal %d?)", status - 128)
+  else if (status > 256 && status <= 384)
+    # We used to report an "abnormal termination" here, but some Korn
+    # shells, when a child process dies due to signal number n, can leave
+    # in $? an exit status of 256+n instead of the more standard 128+n.
+    # Apparently, both behaviours are allowed by POSIX (2008), so be
+    # prepared to handle them both.  See also Austin Group report ID
+    # 0000051 <http://www.austingroupbugs.net/view.php?id=51>
+    exit_details = sprintf(" (terminated by signal %d?)", status - 256)
+  else
+    # Never seen in practice.
+    exit_details = " (abnormal termination)"
+  return sprintf("exited with status %d%s", status, exit_details)
+}
+
+function write_test_results()
+{
+  print ":global-test-result: " get_global_test_result() > trs_file
+  print ":recheck: "  yn(must_recheck()) > trs_file
+  print ":copy-in-global-log: " yn(copy_in_global_log()) > trs_file
+  for (i = 0; i < test_results_index; i += 1)
+    print ":test-result: " test_results_list[i] > trs_file
+  close(trs_file);
+}
+
+BEGIN {
+
+## ------- ##
+##  SETUP  ##
+## ------- ##
+
+'"$init_colors"'
+
+# Properly initialized once the TAP plan is seen.
+planned_tests = 0
+
+COOKED_PASS = expect_failure ? "XPASS": "PASS";
+COOKED_FAIL = expect_failure ? "XFAIL": "FAIL";
+
+# Enumeration-like constants to remember which kind of plan (if any)
+# has been seen.  It is important that NO_PLAN evaluates "false" as
+# a boolean.
+NO_PLAN = 0
+EARLY_PLAN = 1
+LATE_PLAN = 2
+
+testno = 0     # Number of test results seen so far.
+bailed_out = 0 # Whether a "Bail out!" directive has been seen.
+
+# Whether the TAP plan has been seen or not, and if yes, which kind
+# it is ("early" is seen before any test result, "late" otherwise).
+plan_seen = NO_PLAN
+
+## --------- ##
+##  PARSING  ##
+## --------- ##
+
+is_first_read = 1
+
+while (1)
+  {
+    # One-line lookahead is needed because the last input line is not TAP
+    # output but the exit status of the test command (read after the loop).
+    st = getline
+    if (st < 0) # I/O error.
+      fatal("I/O error while reading from input stream")
+    else if (st == 0) # End-of-input
+      {
+        if (is_first_read)
+          abort("in input loop: only one input line")
+        break
+      }
+    if (is_first_read)
+      {
+        is_first_read = 0
+        nextline = $0
+        continue
+      }
+    else
+      {
+        curline = nextline
+        nextline = $0
+        $0 = curline
+      }
+    # Copy any input line verbatim into the log file.
+    print | "cat >&3"
+    # Parsing of TAP input should stop after a "Bail out!" directive.
+    if (bailed_out)
+      continue
+
+    # TAP test result.
+    if ($0 ~ /^(not )?ok$/ || $0 ~ /^(not )?ok[^a-zA-Z0-9_]/)
+      {
+        testno += 1
+        setup_result_obj($0)
+        handle_tap_result()
+      }
+    # TAP plan (normal or "SKIP" without explanation).
+    else if ($0 ~ /^1\.\.[0-9]+[ \t]*$/)
+      {
+        # The next two lines will put the number of planned tests in $0.
+        sub("^1\\.\\.", "")
+        sub("[^0-9]*$", "")
+        handle_tap_plan($0, "")
+        continue
+      }
+    # TAP "SKIP" plan, with an explanation.
+    else if ($0 ~ /^1\.\.0+[ \t]*#/)
+      {
+        # The next lines will put the skip explanation in $0, stripping
+        # any leading and trailing whitespace.  This is a little more
+        # tricky in truth, since we want to also strip a potential leading
+        # "SKIP" string from the message.
+        sub("^[^#]*#[ \t]*(SKIP[: \t][ \t]*)?", "")
+        sub("[ \t]*$", "");
+        handle_tap_plan(0, $0)
+      }
+    # "Bail out!" magic.
+    # Older versions of prove and TAP::Harness (e.g., 3.17) did not
+    # recognize a "Bail out!" directive when preceded by leading
+    # whitespace, but more modern versions (e.g., 3.23) do.  So we
+    # emulate the latter, "more modern" behaviour.
+    else if ($0 ~ /^[ \t]*Bail out!/)
+      {
+        bailed_out = 1
+        # Get the bailout message (if any), with leading and trailing
+        # whitespace stripped.  The message remains stored in `$0`.
+        sub("^[ \t]*Bail out![ \t]*", "");
+        sub("[ \t]*$", "");
+        # Format the error message for the test suite error report.
+        bailout_message = "Bail out!"
+        if (length($0))
+          bailout_message = bailout_message " " $0
+        testsuite_error(bailout_message)
+      }
+    # Maybe we have to look for diagnostic comments too.
+    else if (comments != 0)
+      {
+        comment = extract_tap_comment($0);
+        if (length(comment))
+          report("#", comment);
+      }
+  }
+
+## -------- ##
+##  FINISH  ##
+## -------- ##
+
+# A "Bail out!" directive should cause us to ignore any following TAP
+# error, as well as a non-zero exit status from the TAP producer.
+if (!bailed_out)
+  {
+    if (!plan_seen)
+      {
+        testsuite_error("missing test plan")
+      }
+    else if (planned_tests != testno)
+      {
+        bad_amount = testno > planned_tests ? "many" : "few"
+        testsuite_error(sprintf("too %s tests run (expected %d, got %d)",
+                                bad_amount, planned_tests, testno))
+      }
+    if (!ignore_exit)
+      {
+        # Fetch exit status from the last line.
+        exit_message = get_test_exit_message(nextline)
+        if (exit_message)
+          testsuite_error(exit_message)
+      }
+  }
+
+write_test_results()
+
+exit 0
+
+} # End of "BEGIN" block.
+'
+
+# TODO: document that we consume the file descriptor 3 :-(
+} 3>"$log_file"
+
+test $? -eq 0 || fatal "I/O or internal error"
+
+# Local Variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-time-zone: "UTC"
+# time-stamp-end: "; # UTC"
+# End:
diff --git a/config.h.in b/config.h.in
new file mode 100644
index 0000000..5fa9546
--- /dev/null
+++ b/config.h.in
@@ -0,0 +1,89 @@
+/* config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* Defined if the requested minimum BOOST version is satisfied */
+#undef HAVE_BOOST
+
+/* Define to 1 if you have <boost/scoped_ptr.hpp> */
+#undef HAVE_BOOST_SCOPED_PTR_HPP
+
+/* Define to 1 if you have <boost/shared_ptr.hpp> */
+#undef HAVE_BOOST_SHARED_PTR_HPP
+
+/* Define to 1 if you have <boost/system/error_code.hpp> */
+#undef HAVE_BOOST_SYSTEM_ERROR_CODE_HPP
+
+/* Define to 1 if you have <boost/thread.hpp> */
+#undef HAVE_BOOST_THREAD_HPP
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#undef HAVE_DLFCN_H
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#undef HAVE_INTTYPES_H
+
+/* Set to 1 if you have the "llvm/Analysis/Verifier.h" header file */
+#undef HAVE_LLVM_ANALYSIS_VERIFIER_H
+
+/* Set to 1 if you have the llvm::DataLayoutPass class */
+#undef HAVE_LLVM_DATA_LAYOUT_PASS
+
+/* Set to 1 if you have the "llvm/IR/Verifier.h" header file */
+#undef HAVE_LLVM_IR_VERIFIER_H
+
+/* Define to 1 if you have the <memory.h> header file. */
+#undef HAVE_MEMORY_H
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#undef HAVE_STDINT_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#undef HAVE_STDLIB_H
+
+/* Define to 1 if you have the <strings.h> header file. */
+#undef HAVE_STRINGS_H
+
+/* Define to 1 if you have the <string.h> header file. */
+#undef HAVE_STRING_H
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#undef HAVE_SYS_STAT_H
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#undef HAVE_SYS_TYPES_H
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#undef HAVE_UNISTD_H
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
+#undef LT_OBJDIR
+
+/* Name of package */
+#undef PACKAGE
+
+/* Define to the address where bug reports for this package should be sent. */
+#undef PACKAGE_BUGREPORT
+
+/* Package copyright */
+#undef PACKAGE_COPYRIGHT
+
+/* Define to the full name of this package. */
+#undef PACKAGE_NAME
+
+/* Define to the full name and version of this package. */
+#undef PACKAGE_STRING
+
+/* Define to the one symbol short name of this package. */
+#undef PACKAGE_TARNAME
+
+/* Define to the home page for this package. */
+#undef PACKAGE_URL
+
+/* Define to the version of this package. */
+#undef PACKAGE_VERSION
+
+/* Define to 1 if you have the ANSI C header files. */
+#undef STDC_HEADERS
+
+/* Version number of package */
+#undef VERSION
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 0000000..46bd45b
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,94 @@
+AC_INIT([lib6502-jit], [1.0], [lib6502-jit@lemma.co.uk])
+AC_CONFIG_AUX_DIR([build-aux])
+AC_CONFIG_MACRO_DIR([m4])
+AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects no-dist-gzip dist-bzip2])
+AM_MAINTAINER_MODE([enable])
+LT_INIT([disable-shared])
+AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_FILES([Makefile])
+AC_REQUIRE_AUX_FILE([tap-driver.sh])
+
+# Copyright for configure.ac *only*
+AC_COPYRIGHT([Copyright (c) 2014 Steven Flintham])
+
+AC_DEFINE([PACKAGE_COPYRIGHT], ["(C) - see COPYING"], [Package copyright])
+
+# for tap-driver.sh
+AC_PROG_AWK
+
+AC_PROG_CC
+AC_PROG_CXX
+
+BOOST_REQUIRE
+BOOST_SMART_PTR
+BOOST_THREAD
+
+# I want to:
+# - use "llvm-config" (relying on PATH) if the user doesn't do anything 
+#   special, but
+# - allow the user to say --with-llvm-config=XXX to use XXX instead of 
+#   llvm-config, where XXX might need to be found on the PATH (e.g. if 
+#   the program is called llvm-config-3.5) or might be an absolute/
+#   relative filename
+# In both of the above cases, I want to check explicitly that the
+# llvm-config program can be found. This doesn't seem to be supported by
+# autoconf:
+# - AC_CHECK_PROG() and AC_PATH_PROG() both insist on the program name being a 
+#   leaf name with no included path.
+# - AC_CHECK_FILE() (not unreasonably) doesn't look on PATH for the file
+#   (and wouldn't check for executability)
+# So I have to just hack it with "which" and hope.
+AC_ARG_WITH(
+	[llvm-config], 
+	[AS_HELP_STRING(
+		[--with-llvm-config=FILE], 
+		[filename of llvm-config executable (if not on PATH)])], 
+	[LLVMCONFIG="$withval"], 
+	[LLVMCONFIG="llvm-config"])
+AC_MSG_CHECKING([for $LLVMCONFIG])
+AS_IF(
+	[which "$LLVMCONFIG" >/dev/null 2>&1],
+	[AC_MSG_RESULT([yes])],
+	[AC_MSG_RESULT([no])
+	 AC_MSG_ERROR([llvm-config not found; try --with-llvm-config=FILE?])])
+
+AC_SUBST(LLVMCONFIG)
+
+# These variables are sacred to the user. But we need to set them in order for
+# configure's test programs to find the LLVM headers. I am probably doing this
+# completely wrong. In twenty years or so maybe I will achieve auto-enlightenment
+# and look back at this and laugh.
+SACRED_CPPFLAGS="$CPPFLAGS"
+SACRED_CXXFLAGS="$CXXFLAGS"
+
+CPPFLAGS=["`$LLVMCONFIG --cppflags` $CPPFLAGS"]
+CXXFLAGS=["`$LLVMCONFIG --cxxflags` -fexceptions $CXXFLAGS"]
+
+AC_LANG(C++)
+
+# This header moves around a bit, check for the two known possible locations.
+
+AC_CHECK_HEADER(
+	[llvm/IR/Verifier.h], 
+	[AC_DEFINE([HAVE_LLVM_IR_VERIFIER_H], 1, [Set to 1 if you have the "llvm/IR/Verifier.h" header file])])
+AC_CHECK_HEADER(
+	[llvm/Analysis/Verifier.h], 
+	[AC_DEFINE([HAVE_LLVM_ANALYSIS_VERIFIER_H], 1, [Set to 1 if you have the "llvm/Analysis/Verifier.h" header file])])
+# TODO: Can I get configure to fail if neither of the previous tests
+# succeeds? Otherwise configure will succeed but the build will fail.
+
+# This header always exists, but DataLayoutPass isn't always present.
+AC_CHECK_HEADER(
+	[llvm/IR/DataLayout.h],
+	[],
+	[AC_MSG_ERROR([llvm/IR/DataLayout.h not found])])
+AC_CHECK_TYPE(
+	[llvm::DataLayoutPass],
+	[AC_DEFINE([HAVE_LLVM_DATA_LAYOUT_PASS], 1, [Set to 1 if you have the llvm::DataLayoutPass class])],
+	[],
+	[#include "llvm/IR/DataLayout.h"])
+
+CPPFLAGS="$SACRED_CPPFLAGS"
+CXXFLAGS="$SACRED_CXXFLAGS"
+
+AC_OUTPUT
diff --git a/const.h b/const.h
new file mode 100644
index 0000000..c2bbdfd
--- /dev/null
+++ b/const.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef CONST_H
+#define CONST_H
+
+#include <stdint.h>
+
+namespace /* unnamed: each translation unit including this header gets its own copies */
+{
+    const uint8_t opcode_brk = 0x00;
+    const uint8_t opcode_rti = 0x40;
+    const uint8_t opcode_rts = 0x60;
+    const uint8_t opcode_bra = 0x80;
+    const uint8_t opcode_bcc = 0x90;
+    const uint8_t opcode_bcs = 0xb0;
+    const uint8_t opcode_bvc = 0x50;
+    const uint8_t opcode_bvs = 0x70;
+    const uint8_t opcode_beq = 0xf0;
+    const uint8_t opcode_bne = 0xd0;
+    const uint8_t opcode_bpl = 0x10;
+    const uint8_t opcode_bmi = 0x30;
+    const uint8_t opcode_jsr = 0x20;
+    const uint8_t opcode_jmp_abs = 0x4c;
+    const uint8_t opcode_jmp_ind_abs = 0x6c;
+    const uint8_t opcode_jmp_indx_abs = 0x7c;
+
+    enum { /* single-bit masks, one per flag */
+      flagN= (1<<7),	/* negative 	 */
+      flagV= (1<<6),	/* overflow 	 */
+      flagX= (1<<5),	/* unused   	 */
+      flagB= (1<<4),	/* irq from brk  */
+      flagD= (1<<3),	/* decimal mode  */
+      flagI= (1<<2),	/* irq disable   */
+      flagZ= (1<<1),	/* zero          */
+      flagC= (1<<0)	/* carry         */
+    };
+    
+    const uint32_t memory_size = 0x10000; /* 65536 bytes (64 KiB) */
+    const uint16_t stack = 0x100; /* NOTE(review): presumably the base address of the stack page -- confirm against users */
+}
+
+#endif
diff --git a/examples/README b/examples/README
new file mode 100644
index 0000000..e22418c
--- /dev/null
+++ b/examples/README
@@ -0,0 +1,406 @@
+lib6502 - 6502 Microprocessor Emulator
+
+EXAMPLES
+
+  This file has three sections:
+
+    1. PROGRAMS that you can compile and run
+    2. COMMANDS that you can copy and paste into a terminal
+    3. ADVANCED stuff that requires some additional setup
+
+  A few numbered footnotes appear at the end and are referenced in the
+  text in square brackets [6].
+
+----------------------------------------------------------------
+
+1.  PROGRAMS
+
+  (We're going to start in 'serious mode'.  Bear with me.)
+
+  The file 'lib1.c' contains the example from the run6502 manual page.
+  Just compile and run it:
+
+        cc -o lib1 lib1.c
+        ./lib1
+
+  The file has been commented extensively to explain exactly what is
+  going on.
+
+----------------------------------------------------------------
+
+2.  COMMANDS
+
+  (Much more fun: this is the section that appeals to the geek in me.)
+
+  6502 machine code is pretty straightforward.  (Many 6502 programmers
+  remember a time from their misguided childhood when they could
+  compose and edit programs directly in hexadecimal using their 'front
+  panel' monitor program -- the next best thing to programming with a
+  row of switches and lamps, but I digress and will leave that story
+  until the pdp11 emulator is ready. ;-)  We can use this fact to
+  generate an entire program without needing an assembler.  The 'perl'
+  program is available on most Unixy (and several other) systems and
+  makes it easy to create binary files from a string of hex digits.
+  (There is a program called 'xxd' that's very good at this kind of
+  thing, but you might not have it.)
+
+  First the program (stolen from lib1.c):
+
+        1000    ldx #41         A241
+        1002    txa             8A
+        1003    jsr FFEE        20EEFF
+        1006    inx             E8
+        1007    cpx #5B         E05B
+        1009    bne 1002        D0F7
+        100B    lda #0A         A90A
+        100D    jsr FFEE        20EEFF
+        1010    brk             00
+
+  In C-like syntax it is equivalent to:
+
+        regX = 'A';
+        do {
+          regA = regX;
+          putchar(regA);
+        } while (++regX != 'Z' + 1);
+        putchar('\n');
+
+  (which by today's standards is a *huge* amount of stuff packed into
+  just 17 bytes of 'compiled' code -- on a 386 the same program is
+  around 65 bytes [1], and more like 88 bytes on a 32-bit RISC [2]).
+
+  The column on the right is the machine code in hexadecimal.  When
+  strung out in a line it looks like this:
+
+        A2418A20EEFFE8E05BD0F7A90A20EEFF00
+
+  We can tell perl to 'pack' this hexadecimal string into binary and
+  save the output in a file:
+
+        echo A2418A20EEFFE8E05BD0F7A90A20EEFF00 |
+        perl -e 'print pack "H*",<STDIN>' > temp.img
+
+  To check the contents of the file, we can load it into run6502 and
+  then disassemble it:
+
+        run6502 -l 1000 temp.img -d 1000 +11 -x
+
+  The '-l 1000 temp.img' loads the file into the 6502's memory at
+  address 0x1000, and the '-d 1000 +11' disassembles 17 bytes (11 in
+  hex) of code starting at 0x1000.  The final '-x' tells run6502 not
+  to try to execute the code.  The output should look just like the
+  program listing above.
+
+  This is almost all we need to run it; just a few details remain.
+
+    - The emulator doesn't know where to start execution.  We need to
+      set the 'reset' vector to 0x1000 -- the address of the first
+      instruction in the program.  The '-R 1000' option does this.
+
+    - The program calls the 'putchar' function at address 0xFFEE to
+      send a character to the terminal.  run6502 can emulate this for
+      us, with the '-P FFEE' option.
+
+    - We have to have some way to make the processor stop execution
+      (there is no 'halt' instruction on the 6502, at least not the
+      early versions).  The trick is in the last instruction 'BRK',
+      that generates a 'software interrupt' -- eventually jumping to
+      the address in the 'interrupt vector'.  If we don't set the
+      interrupt vector explicitly it remains empty (zero) and BRK will
+      try to transfer control to address 0.  The '-X 0' option tells
+      run6502 to stop executing if/when the program attempts to
+      transfer control to address 0 -- which it will, when it executes
+      the 'BRK' instruction with an empty interrupt vector.  QED :-)
+
+  Here, then, is the complete command to run our program:
+
+        run6502 -l 1000 temp.img -R 1000 -P FFEE -X 0
+
+  This program is relocatable.  You can load it at address 4321
+  (change both the -l and -R options) and it will work just fine.
+
+  Google for "6502 Reference Card" (with the quotes), grab a pencil
+  and paper, and you can start writing 6502 programs immediately!  (If
+  you really want to experience what it was like in the late 1970s,
+  but without the added fun of entering each hex digit one at a time
+  into a monitor program, simply avoid the temptation ever to look at
+  your hand-assembled code with the '-d' option. ;-)
+
+  If you really start liking this and want to write longer programs in
+  text files with the hex split over many lines, you'll need a perl
+  script that can deal with newlines in the input.  Something like
+  this should do the trick...
+
+        #!/usr/bin/perl
+
+        while (<STDIN>) {
+          chomp;
+          print pack "H*", $_
+        }
+
+  (This script is included in the 'examples' directory, in a file
+  called 'hex2bin', to save you 15 seconds of copy and paste.)
+
+  Need a fun project?  Write a 6502 assembler... in 6502 machine code,
+  of course!  Read in the assembly language text via 'getchar' (see
+  the '-G' option) and write out the assembled binary via 'putchar'
+  (the '-P' option, that we've already seen).  Soon you'll be able to:
+
+        cat prog.s |
+        run6502 -l 1000 asm.img -R 1000 -G FFE0 -P FFEE -X 0 > prog.img
+
+        run6502 -l 1000 prog.img -R 1000 -G FFE0 -P FFEE -X 0
+
+  (The first prog.s you write should probably be the assembler itself,
+  transcribed from the paper copy used to hand-assemble the assembler
+  binary.  This significant milestone can be reached with a
+  surprisingly simple assembler.  After this pivotal moment the
+  assembler, assembling itself, can very quickly become very
+  powerful.)
+
+----------------------------------------------------------------
+
+3. ADVANCED
+
+  (Official justification: let's run something big and non-trivial.
+  More likely: a flimsy excuse for a trip down memory lane.)
+
+  The remaining examples assume that you have access to two ROM images
+  from the Acorn 'BBC Model B' microcomputer: the operating system and
+  the BASIC language.  (Just crawl into the attic, fire up the old
+  Beeb, '*SAVE' the images into files, and then transfer them to your
+  Unix box over RS423.  Under no circumstances should you google for
+  'Acorn BBC B OS ROMs zip', without the quotes.  That would be
+  naughty, and probably illegal -- at least until the glorious day
+  when the revolution finally comes.)
+
+  After brushing yourself down (the attic is kind of dusty, no?) save
+  the two ROM images as 'OS12.ROM' and 'BASIC2.ROM'.
+
+  The first thing we can do is use run6502 as an editor to merge the
+  two ROMs into a single image file:
+
+        run6502                         \
+          -l C000 OS12.ROM              \
+          -l 8000 BASIC2.ROM            \
+          -s 0000 +10000 bbc.img        \
+          -x
+
+  (This is a single command, with '\' continuation characters joining
+  the lines into one.  Your shell should figure it out if you just
+  copy and paste.)  It leaves a file 'bbc.img' containing both the OS
+  and BASIC.
+
+  To run this image we need the '-B' option.  It enables some minimal,
+  totally lame, hardware emulation of the BBC computer -- just enough
+  to boot the 'virtual beeb' into BASIC [3]:
+
+        run6502 -l 0 bbc.img -B
+
+  If all goes well, you should be greeted with a 'beep' and a message
+  telling you what computer you have (BBC Computer), how much RAM is
+  available (32K), the language you've been dropped into (BASIC), and
+  a '>' prompt.  Turn on 'CAPS LOCK' (many of us remember those days,
+  and some of us even used to speak in ALL CAPS) and play:
+
+        PRINT 3+4
+
+  or maybe:
+
+        10 FOR A%=1 TO 10
+        20 PRINT A%
+        30 NEXT
+        LIST
+        RUN
+
+  or even:
+
+         10 P%=&2800
+         20 O%=P%
+         30 [
+         40    opt3
+         50    lda #10
+         60    jsr &FFEE
+         70    ldx #65
+         80 .l txa
+         90    jsr &FFEE
+        100    inx
+        110    cpx #91
+        120    bne l
+        130    lda #10
+        140    jmp &FFEE
+        150 ]
+        160 CALL &2800
+        LIST
+        RUN
+
+  (How cool is that? ;-)
+
+  One final thing: there is an option '-i' that works just like '-l'
+  except that it looks to see if the image file begins with '#!'.  If
+  so, it skips over the first line of the file, up to and including
+  the first newline.  Why?  The system call that executes programs on
+  Unixy systems makes the same check.  If the user executes a text
+  file 'foo' starting with '#!prog ...' then the OS loads and runs
+  'prog' instead, passing all the '...'s and the name of the text file
+  'foo' as arguments [4].  If you have 'temp.img' left over from
+  the second example, open it in a text editor and add a single line
+  at the beginning that reads:
+
+        #!run6502 -i 1000
+
+  (If 'run6502' is not in your current working directory then you will
+  have to use the full path to the file: '#!/usr/bin/run6502' or
+  '#!/usr/local/bin/run6502' or whatever.  No spaces before the '#'!)
+
+  Now make the image executable:
+
+        chmod +x temp.img
+
+  and then (as if you hadn't already guessed) execute it:
+
+        ./temp.img
+
+  Saves an awful lot of tedious typing. [5]
+
+  Have fun!
+
+----------------------------------------------------------------
+
+FOOTNOTES
+
+
+[1] Here is the 'alphabet' program, verbatim, compiled (with
+    optimisation) on a 386.  It's 67 bytes long, almost four times
+    longer than the 6502 version.  (If I were more generous I might
+    consider that fair: 32 bits divided by 8 bits is four.)
+
+       0:   55                      push   %ebp
+       1:   89 e5                   mov    %esp,%ebp
+       3:   53                      push   %ebx
+       4:   83 ec 14                sub    $0x14,%esp
+       7:   bb 41 00 00 00          mov    $0x41,%ebx
+       c:   a1 00 00 00 00          mov    0x0,%eax
+      11:   89 44 24 04             mov    %eax,0x4(%esp)
+      15:   89 1c 24                mov    %ebx,(%esp)
+      18:   e8 fc ff ff ff          call   19 <fputc>
+      1d:   43                      inc    %ebx
+      1e:   83 fb 5b                cmp    $0x5b,%ebx
+      21:   75 e9                   jne    c <prog+0xc>
+      23:   a1 00 00 00 00          mov    0x0,%eax
+      28:   89 44 24 04             mov    %eax,0x4(%esp)
+      2c:   c7 04 24 0a 00 00 00    movl   $0xa,(%esp)
+      33:   e8 fc ff ff ff          call   34 <fputc>
+      38:   b8 00 00 00 00          mov    $0x0,%eax
+      3d:   83 c4 14                add    $0x14,%esp
+      40:   5b                      pop    %ebx
+      41:   5d                      pop    %ebp
+      42:   c3                      ret    
+
+
+[2] Here is the 'alphabet' program, verbatim, compiled (with
+    optimisation) on a PowerPC.  It's 88 bytes long, more than five
+    times longer than the 6502 version.  (I don't care what you say:
+    Apple Macs rule and mine has oodles of RAM to spare.)
+
+    00000000        mfspr   r0,lr
+    00000004        stmw    r29,0xfff4(r1)
+    00000008        stw     r0,0x8(r1)
+    0000000c        stwu    r1,0xffb0(r1)
+    00000010        bcl     20,31,0x14
+    00000014        mfspr   r31,lr
+    00000018        li      r30,0x41
+    0000001c        addis   r2,r31,ha16(0xa4-0x14)
+    00000020        lwz     r29,lo16(0xa4-0x14)(r2)
+    00000024        or      r3,r30,r30
+    00000028        addi    r4,r29,0x58
+    0000002c        bl      0x7c    ; symbol stub for: _fputc
+    00000030        cmpwi   cr7,r30,0x5a
+    00000034        addi    r30,r30,0x1
+    00000038        bne     cr7,0x24
+    0000003c        li      r3,0xa
+    00000040        bl      0x5c    ; symbol stub for: _fputc
+    00000044        li      r3,0x0
+    00000048        lwz     r0,0x58(r1)
+    0000004c        addi    r1,r1,0x50
+    00000050        mtspr   lr,r0
+    00000054        lmw     r29,0xfff4(r1)
+    00000058        blr
+
+
+[3] Time to 'fess up with an undocumented 'feature'.  We ran our
+    'bbc.img' file like this:
+
+        run6502 -l 0 bbc.img -B
+
+    I grew tired of typing all those '-'s and made run6502 check to
+    see if it was invoked with a single, non-option argument.
+    Running:
+
+        run6502 bbc.img
+
+    is precisely equivalent to the '-l -B' form above.  I don't feel
+    too guilty about this since the manual page suggests that
+    providing a single, non-option argument is illegal usage.
+
+
+[4] Okay, that might be a little confusing.  Here it is written out in
+    full.  If you have a text file called 'foo' containing
+
+        #!/usr/bin/prog -gobble
+        blah blah blah
+        blah blah blah
+
+    that is executable, and then you execute it like a compiled
+    program
+
+        ./foo
+
+    then the OS will notice the '#!' and run the following command
+    instead:
+
+        /usr/bin/prog -gobble ./foo
+
+    The '-gobble' tells 'prog' to eat the first line, leaving just the
+    blah that follows.  (The reason for choosing '#!' is that '#' is
+    the comment character in the standard Unix shell, with the obvious
+    happy consequences for shell scripts.)
+
+
+[5] We can play the same '#!' game with our 'bbc.img' file.  Open it
+    up and add the line
+
+        #!/usr/local/bin/run6502 -B -l 0
+
+    (or whatever, according to the location of the 'run6502' program),
+    make it executable
+
+        chmod +x bbc.img
+
+    and execute it:
+
+        ./bbc.img
+
+    To save a whopping 32K of zeros at the beginning of the file,
+    create the image again with
+
+        run6502                 \
+          -l C000 OS12.ROM      \
+          -l 8000 BASIC2.ROM    \
+          -s 8000 +8000 bbc.img \
+          -x
+
+    and run it with
+
+        run6502 -l 8000 bbc.img -B
+
+    and, if you like, insert the single line
+
+        #!/usr/local/bin/run6502 -B -l 8000
+
+    at the start of the image file and make it executable:
+
+        ./bbc.img
+
+
+[6] There is no footnote 6.
diff --git a/examples/hex2bin b/examples/hex2bin
new file mode 100755
index 0000000..82c2a44
--- /dev/null
+++ b/examples/hex2bin
@@ -0,0 +1,6 @@
+#!/usr/bin/perl
+# Read lines of hex digits from stdin and write the bytes they encode to stdout.
+while (<STDIN>) {
+  chomp;                # drop the line ending so only the hex digits are packed
+  print pack "H*", $_   # "H*": hex string of any length, high nybble first
+}
diff --git a/examples/lib1.c b/examples/lib1.c
new file mode 100644
index 0000000..6b89520
--- /dev/null
+++ b/examples/lib1.c
@@ -0,0 +1,108 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+
+/* Emulated OS functions. */
+
+#define WRCH	0xFFEE	/* Write accumulator to stdout. */
+
+/* Write the accumulator to stdout.  This function will be invoked
+ * when the emulated program calls 0xFFEE.
+ */
+int wrch(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  int pc;	/* return address, rebuilt from the two bytes popped below */
+
+  /* Write the character.
+   */
+  putchar(mpu->registers->a);
+
+  /* We arrived here from a JSR instruction.  The stack (at 0x100 + S)
+   * contains the saved PC.  Pop it off the stack, low byte first.
+   */
+  pc  = mpu->memory[++mpu->registers->s + 0x100];
+  pc |= mpu->memory[++mpu->registers->s + 0x100] << 8;
+
+  /* The JSR instruction pushes the value of PC before it has been
+   * incremented to point to the instruction after the JSR.  Return PC
+   * + 1 as the address for the next insn.  Returning non-zero
+   * indicates that we handled the 'subroutine' ourselves, and the
+   * emulator should pretend the original 'JSR' never happened at
+   * all.
+   */
+  return pc + 1;  /* JSR pushes next insn addr - 1 */
+}
+
+
+/* Exit gracefully.  We arrange for this function to be called when
+ * the emulator tries to transfer control to address 0.
+ */
+int done(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  char buffer[64];	/* filled in by M6502_dump() and printed as a string */
+
+  /* Dump the internal state of the processor into buffer.
+   */
+  M6502_dump(mpu, buffer);
+
+  /* Print a cute message and quit.
+   */
+  printf("\nBRK instruction\n%s\n", buffer);
+  exit(0);	/* the process ends here, so nothing is returned to the emulator */
+}
+
+int main()
+{
+  M6502    *mpu = M6502_new(0, 0, 0);	/* Make a 6502; the 0s ask the library to allocate everything */
+  unsigned  pc  = 0x1000;		/* PC for 'assembly' */
+
+  /* Install the two callback functions defined above.
+   */
+  M6502_setCallback(mpu, call, WRCH, wrch);	/* Calling FFEE -> wrch() */
+  M6502_setCallback(mpu, call,    0, done);	/* Calling 0 -> done() */
+
+  /* A few macros that store bytes into the 6502's memory at pc, advancing pc.
+   * gen2/gen3 expand to several statements: use them only as whole statements. */
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))
+# define gen2(X,Y)	gen1(X); gen1(Y)
+# define gen3(X,Y,Z)	gen1(X); gen2(Y,Z)
+
+  /* Hand-assemble the program, starting at 0x1000.
+   */
+  gen2(0xA2, 'A'     );	// LDX #'A'
+  gen1(0x8A          );	// TXA
+  gen3(0x20,0xEE,0xFF);	// JSR FFEE
+  gen1(0xE8          );	// INX
+  gen2(0xE0, 'Z'+1   );	// CPX #'Z'+1
+  gen2(0xD0, -9      );	// BNE 0x1002 (-9 from 0x100B, the address after the branch)
+  gen2(0xA9, '\n'    );	// LDA #'\n'
+  gen3(0x20,0xEE,0xFF);	// JSR FFEE
+  gen2(0x00,0x00     ); // BRK (plus one extra zero byte)
+
+  /* Just for fun: disassemble the program.
+   */
+  {
+    char     insn[64];
+    uint16_t ip= 0x1000;
+    while (ip < pc)
+      {
+	int isz = M6502_disassemble(mpu, ip, insn);	/* advances ip by the returned size */
+	printf("%04X %s\n", ip, insn);
+	ip += isz;
+      }
+  }
+
+  /* Point the RESET vector at the first instruction in the assembled
+   * program.
+   */
+  M6502_setVector(mpu, RST, 0x1000);
+
+  /* Reset the 6502 and run the program.
+   */
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/lib6502-compatibility.txt b/lib6502-compatibility.txt
new file mode 100644
index 0000000..23f88d2
--- /dev/null
+++ b/lib6502-compatibility.txt
@@ -0,0 +1,54 @@
+At the time of writing the latest lib6502 release is v1.3; older versions are
+not considered here.
+
+Some things which work fine with lib6502 itself are not supported when using
+lib6502-jit in hybrid (the default) or compiled execution modes. All of the
+following will result in undefined behaviour unless interpreted mode is used:
+
+* Modifying memory which contains 6502 code (whether executed yet or not)
+  inside a read callback. (All other types of callbacks are allowed to
+  modify memory freely, including modifying code.)
+
+* Defining a callback after calling M6502_run(); for example, doing so inside
+  another callback.
+
+* Checking the B and X flags in the processor status register
+  (M6502_Registers.p) inside a callback. lib6502 tracks these flags as if they
+  have a real existence at all times. lib6502-jit's compiler only sets them
+  appropriately when pushing a copy of the processor status register onto the
+  stack. This difference is *not* visible to code executing on the emulated CPU,
+  only to callbacks. In hybrid mode, which behaviour you get will depend on
+  whether your callback is invoked from the interpreter or compiled code.
+
+The following differences exist between lib6502 and lib6502-jit in all modes,
+including interpreted mode:
+
+* lib6502 is likely to be slightly faster than lib6502-jit in interpreted mode,
+  since the latter's interpreter code contains additional tests to stop
+  executing at certain points after n instructions have been executed.
+
+* Illegal instructions are treated as no-ops by default in lib6502-jit; lib6502
+  aborts if an illegal instruction is executed.
+
+* Illegal instruction callbacks are a lib6502-jit extension and are not
+  available in lib6502.
+
+* Call callbacks in lib6502 always receive a 0 as the data argument;
+  lib6502-jit supplies the opcode triggering the callback as the data argument.
+
+* A few bugs in lib6502's emulation are resolved in lib6502-jit:
+  - BRK clears the D flag
+  - ADC/SBC exactly match the behaviour of a real 65C02 in decimal mode
+  - BIT #imm only modifies the Z flag, leaving N and V untouched
+  - TSB sets the Z flag correctly
+  - TRB sets the Z flag and updates memory correctly
+
+* lib6502's run6502 -B option skips every other (ROM name) argument;
+  lib6502-jit's doesn't.
+
+lib6502-jit's stance is that anything the code executing on the emulated CPU
+does is fair game and must be handled, but that the library's client code has a
+responsibility to cooperate and not do tricky things like those documented
+above. If you have what you think is a reasonable requirement for behaviour
+which is supported by lib6502 but doesn't work on lib6502-jit please get in
+touch.
diff --git a/lib6502-jit.cpp b/lib6502-jit.cpp
new file mode 100644
index 0000000..02da212
--- /dev/null
+++ b/lib6502-jit.cpp
@@ -0,0 +1,190 @@
+/* lib6502-jit.cpp -- MOS Technology 6502 emulator	-*- C++ -*- */
+
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "const.h"
+#include "Function.h"
+#include "FunctionBuilder.h"
+#include "FunctionManager.h"
+#include "M6502Internal.h"
+#include "Registers.h"
+#include "util.h"
+
+static void outOfMemory(void) // reports an allocation failure through die()
+{
+    die("out of memory"); // NOTE(review): M6502_new() relies on this not returning
+}
+
+M6502 *M6502_new(M6502_Registers *registers, M6502_Memory memory, M6502_Callbacks *callbacks)
+{
+  M6502 *mpu= (M6502 *) calloc(1, sizeof(M6502));
+  if (!mpu) outOfMemory();
+  // Any argument passed as 0 is allocated zero-filled here and flagged so that M6502_delete() frees it.
+  if (!registers)  { registers = (M6502_Registers *)calloc(1, sizeof(M6502_Registers));  mpu->flags |= M6502_RegistersAllocated; }
+  if (!memory   )  { memory    = (uint8_t         *)calloc(1, sizeof(M6502_Memory   ));  mpu->flags |= M6502_MemoryAllocated;    }
+  if (!callbacks)  { callbacks = (M6502_Callbacks *)calloc(1, sizeof(M6502_Callbacks));  mpu->flags |= M6502_CallbacksAllocated; }
+  // A single check covers all three allocations above.
+  if (!registers || !memory || !callbacks) outOfMemory();
+
+  mpu->registers = registers;
+  mpu->memory    = memory;
+  mpu->callbacks = callbacks;
+  // Build the internal state last, from the fully populated mpu; failures are reported via die().
+  try
+  {
+    mpu->internal = new _M6502_Internal(mpu);
+  }
+  catch (std::exception &e)
+  {
+    die(e.what());
+  }
+
+  return mpu;
+}
+ 
+void M6502_delete(M6502 *mpu)
+{
+  if (mpu->flags & M6502_CallbacksAllocated) free(mpu->callbacks);  // free only what M6502_new() allocated
+  if (mpu->flags & M6502_MemoryAllocated   ) free(mpu->memory);
+  if (mpu->flags & M6502_RegistersAllocated) free(mpu->registers);
+  delete mpu->internal;  // NOTE(review): runs after the frees above; confirm the destructor never touches those buffers
+
+  free(mpu);
+}
+
+void M6502_setMode(M6502 *mpu, M6502_Mode mode, int arg)
+{
+    mpu->internal->mode_ = mode;  // read by M6502_run() to choose how to execute
+
+    if (arg == 0)  // 0 selects the built-in default instruction limit
+    {
+        arg = M6502_Internal::default_max_instructions_;
+    }
+    mpu->internal->max_instructions_ = arg;
+} 
+
+extern "C" void M6502_run_interpreted(M6502 *mpu, int instructions_left);
+
+// I don't know if it's "supposed" to work, but it doesn't seem completely
+// unreasonable for a lib6502 client to do a setjmp() before invoking
+// M6502_run() and have a callback function longjmp() out of the emulation. I
+// believe this will work with lib6502 itself, and I would like this emulation
+// to do the same.  (Note that currently for both lib6502 and lib6502-jit,
+// read/write callbacks don't see an up-to-date M6502_Registers object and so
+// the setjmp/longjmp trick would result in restarting execution in the wrong
+// place with the wrong registers. Call callbacks and illegal instruction
+// callbacks should work though.)
+//
+// To this end, M6502_run_compiled() and M6502_run_hybrid() both update the
+// Registers object from the M6502_Registers object on entry to pick up the
+// current state. They also both ensure they call update_memory_snapshot() as
+// appropriate in case the caller modified memory before invoking M6502_run()
+// again.
+
+static void M6502_run_compiled(M6502 *mpu)
+{
+    // Refresh the memory snapshot and the register copy first, in case the
+    // caller changed memory or registers before invoking M6502_run() again.
+    FunctionManager &manager = mpu->internal->function_manager_;
+    manager.update_memory_snapshot();
+    Registers &regs = mpu->internal->registers_;
+    regs.from_M6502_Registers(mpu);
+    for (;;)
+    {
+        Function *compiled = manager.get_function(regs.pc);
+        TRACE("Executing Function object for address 0x" << std::hex <<
+              std::setfill('0') << std::setw(4) << regs.pc);
+        compiled->execute();
+    }
+}
+
+#ifdef LOG
+
+static std::string M6502_dump_str(M6502 *mpu)
+{
+    char text[64]; // scratch buffer that M6502_dump() fills with a C string
+    M6502_dump(mpu, text);
+    return std::string(text);
+}
+
+#endif
+
+static void M6502_run_hybrid(M6502 *mpu)
+{
+    FunctionManager &function_manager = mpu->internal->function_manager_;
+    Registers &registers = mpu->internal->registers_;
+    registers.from_M6502_Registers(mpu);
+    TRACE("About to interpret, CPU state: " << M6502_dump_str(mpu));
+    while (true) // alternate forever between interpreting and running compiled code
+    {
+        const int instructions_to_interpret = 100; // interpreter slice between checks for compiled code
+        M6502_run_interpreted(mpu, instructions_to_interpret);
+        if (function_manager.jit_thread_idle())
+        {
+            TRACE("JIT thread is idle");
+            registers.from_M6502_Registers(mpu); // pick up the state as the interpreter left it
+            function_manager.update_memory_snapshot();
+            Function *f;
+            while ((f = function_manager.get_function_lazy(registers.pc)) != 0) // run compiled code while any exists for pc
+            {
+                TRACE("Executing Function object for address 0x" << std::hex <<
+                      std::setfill('0') << std::setw(4) << registers.pc);
+                f->execute();
+            }
+            TRACE("No Function object available for address 0x" << std::hex <<
+                  std::setfill('0') << std::setw(4) << registers.pc <<
+                  ", falling back to interpreter");
+            registers.to_M6502_Registers(mpu); // hand the state back before interpreting again
+            TRACE("About to interpret, CPU state: " << M6502_dump_str(mpu));
+        }
+    }
+}
+
+void M6502_run(M6502 *mpu)
+{
+    // Run in the mode chosen via M6502_setMode(). The runners loop forever, so
+    // falling out of the dispatch is an error; exceptions are reported via die().
+    try
+    {
+        if (mpu->internal->mode_ == M6502_ModeInterpreted)
+        {
+            for (;;)
+            {
+                M6502_run_interpreted(mpu, std::numeric_limits<int>::max());
+            }
+        }
+        else if (mpu->internal->mode_ == M6502_ModeCompiled)
+        {
+            M6502_run_compiled(mpu);
+        }
+        else if (mpu->internal->mode_ == M6502_ModeHybrid)
+        {
+            M6502_run_hybrid(mpu);
+        }
+        else
+        {
+            die("Unknown execution mode in M6502_run()");
+        }
+        die("M6502_run() returned!");
+    }
+    catch (std::exception &failure)
+    {
+        die(failure.what());
+    }
+}
diff --git a/lib6502.c b/lib6502.c
new file mode 100644
index 0000000..866e1b9
--- /dev/null
+++ b/lib6502.c
@@ -0,0 +1,910 @@
+/* lib6502.c -- MOS Technology 6502 emulator	-*- C -*- */
+
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+/* BUGS:
+ *   - RTS and RTI do not check the return address for a callback
+ *   - the disassembler cannot be configured to read two bytes for BRK
+ *   - architectural variations (unimplemented/extended instructions) not implemented
+ *   - ANSI versions (free from gcc extensions) of the dispatch macros are missing
+ *   - emulator+disassembler in same object file (library is kind of pointless)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+
+typedef uint8_t  byte;	/* 8-bit quantity: a register or a memory cell */
+typedef uint16_t word;	/* 16-bit quantity: an address or the program counter */
+
+enum {
+  flagC= 0x01,	/* bit 0: carry              */
+  flagZ= 0x02,	/* bit 1: zero               */
+  flagI= 0x04,	/* bit 2: irq disable        */
+  flagD= 0x08,	/* bit 3: decimal mode       */
+  flagB= 0x10,	/* bit 4: irq came from brk  */
+  flagX= 0x20,	/* bit 5: unused             */
+  flagV= 0x40,	/* bit 6: overflow           */
+  flagN= 0x80	/* bit 7: negative           */
+};
+/* flag tests: each yields the masked bit of P (zero, or the flag's own bit value), not 0/1 */
+#define getN()	(P & flagN)
+#define getV()	(P & flagV)
+#define getB()	(P & flagB)
+#define getD()	(P & flagD)
+#define getI()	(P & flagI)
+#define getZ()	(P & flagZ)
+#define getC()	(P & flagC)
+/* flag updates: N is OR'd in as given (pass 0 or 0x80); V, Z and C are shifted into place (pass 0 or 1) */
+#define setNVZC(N,V,Z,C)	(P= (P & ~(flagN | flagV | flagZ | flagC)) | (N) | ((V)<<6) | ((Z)<<1) | (C))
+#define setNZC(N,Z,C)		(P= (P & ~(flagN |         flagZ | flagC)) | (N) |            ((Z)<<1) | (C))
+#define setNZ(N,Z)		(P= (P & ~(flagN |         flagZ        )) | (N) |            ((Z)<<1)      )
+#define setZ(Z)			(P= (P & ~(                flagZ        )) |                  ((Z)<<1)      )
+#define setC(C)			(P= (P & ~(                        flagC)) |                             (C))
+
+#define NAND(P, Q)	(!((P) & (Q)))
+/* cycle accounting is compiled out: both macros expand to nothing */
+#define tick(n)
+#define tickIf(p)
+
+/* memory access (indirect if callback installed) -- ARGUMENTS ARE EVALUATED MORE THAN ONCE! */
+/* a non-NULL entry in the per-address callback table replaces the direct access to memory[] */
+#define putMemory(ADDR, BYTE)			\
+  ( writeCallback[ADDR]				\
+      ? writeCallback[ADDR](mpu, ADDR, BYTE)	\
+      : (memory[ADDR]= BYTE) )
+
+#define getMemory(ADDR)				\
+  ( readCallback[ADDR]				\
+      ?  readCallback[ADDR](mpu, ADDR, 0)	\
+      :  memory[ADDR] )
+
+/* stack access (always direct: callbacks are never consulted) */
+/* push stores at 0x0100 + S and then decrements S; pop increments S first and then reads */
+#define push(BYTE)		(memory[0x0100 + S--]= (BYTE))
+#define pop()			(memory[++S + 0x0100])
+
+/* addressing modes (memory access direct) */
+/* each mode leaves the effective address in ea and steps PC past the operand bytes it consumed */
+#define implied(ticks)				\
+  tick(ticks);
+/* the operand is the byte after the opcode, so ea is simply its address */
+#define immediate(ticks)			\
+  tick(ticks);					\
+  ea= PC++;
+/* note: this function-like macro hides stdlib's abs() for the rest of the file */
+#define abs(ticks)				\
+  tick(ticks);					\
+  ea= memory[PC] + (memory[PC + 1] << 8);	\
+  PC += 2;
+/* ea becomes the 8-bit displacement sign-extended to 16 bits; the caller adds it to PC */
+#define relative(ticks)				\
+  tick(ticks);					\
+  ea= memory[PC++];				\
+  if (ea & 0x80) ea -= 0x100;			\
+  tickIf((ea >> 8) != (PC >> 8));
+/* ea is the 16-bit pointer stored at the absolute operand address */
+#define indirect(ticks)				\
+  tick(ticks);					\
+  {						\
+    word tmp;					\
+    tmp= memory[PC]  + (memory[PC  + 1] << 8);	\
+    ea = memory[tmp] + (memory[tmp + 1] << 8);	\
+    PC += 2;					\
+  }
+/* absolute indexed: the 16-bit operand plus X (or Y below) */
+#define absx(ticks)						\
+  tick(ticks);							\
+  ea= memory[PC] + (memory[PC + 1] << 8);			\
+  PC += 2;							\
+  tickIf((ticks == 4) && ((ea >> 8) != ((ea + X) >> 8)));	\
+  ea += X;
+
+#define absy(ticks)						\
+  tick(ticks);							\
+  ea= memory[PC] + (memory[PC + 1] << 8);			\
+  PC += 2;							\
+  tickIf((ticks == 4) && ((ea >> 8) != ((ea + Y) >> 8)));	\
+  ea += Y
+
+#define zp(ticks)				\
+  tick(ticks);					\
+  ea= memory[PC++];
+/* zero page indexed: the sum is masked so it wraps within page zero */
+#define zpx(ticks)				\
+  tick(ticks);					\
+  ea= memory[PC++] + X;				\
+  ea &= 0x00ff;
+
+#define zpy(ticks)				\
+  tick(ticks);					\
+  ea= memory[PC++] + Y;				\
+  ea &= 0x00ff;
+/* (zp,X): the pointer address wraps to 8 bits, but its high byte is read from tmp + 1 without wrapping */
+#define indx(ticks)				\
+  tick(ticks);					\
+  {						\
+    byte tmp= memory[PC++] + X;			\
+    ea= memory[tmp] + (memory[tmp + 1] << 8);	\
+  }
+/* (zp),Y: fetch the pointer from zero page, then add Y to it */
+#define indy(ticks)						\
+  tick(ticks);							\
+  {								\
+    byte tmp= memory[PC++];					\
+    ea= memory[tmp] + (memory[tmp + 1] << 8);			\
+    tickIf((ticks == 5) && ((ea >> 8) != ((ea + Y) >> 8)));	\
+    ea += Y;							\
+  }
+/* (abs,X): ea is the pointer stored at operand + X; PC is stepped past the operand so jmp's PC-3 opcode lookup is right */
+#define indabsx(ticks)					\
+  tick(ticks);						\
+  {							\
+    word tmp= memory[PC] + (memory[PC + 1] << 8) + X;	\
+    ea = memory[tmp] + (memory[tmp + 1] << 8);		\
+    PC += 2;						\
+  }
+/* (zp): ea is the 16-bit pointer held at the zero-page operand address (no index register) */
+#define indzp(ticks)					\
+  tick(ticks);						\
+  {							\
+    byte tmp;						\
+    tmp= memory[PC++];					\
+    ea = memory[tmp] + (memory[tmp + 1] << 8);		\
+  }
+
+/* insns */
+/* adc: binary add when D is clear, otherwise a BCD add (which charges one extra tick) */
+#define adc(ticks, adrmode)								\
+  adrmode(ticks);									\
+  {											\
+    byte B= getMemory(ea);								\
+    if (!getD())									\
+      {											\
+	int c= A + B + getC();								\
+	int v= (int8_t)A + (int8_t)B + getC();						\
+	fetch();									\
+	A= c;										\
+	setNVZC((A & 0x80), (((A & 0x80) > 0) ^ (v < 0)), (A == 0), ((c & 0x100) > 0));	\
+	next();										\
+      }											\
+    else										\
+      {											\
+	/* Algorithm taken from http://www.6502.org/tutorials/decimal_mode.html */      \
+	/* inelegant & slow, but consistent with the hw for illegal digits */		\
+	int l, s, t, v;									\
+	l= (A & 0x0F) + (B & 0x0F) + getC();						\
+	if (l >= 0x0A) { l = ((l + 0x06) & 0x0F) + 0x10; }				\
+	s= (A & 0xF0) + (B & 0xF0) + l;							\
+	t= (int8_t)(A & 0xF0) + (int8_t)(B & 0xF0) + (int8_t)l;				\
+	v= (t < -128) || (t > 127);							\
+	if (s >= 0xA0) { s += 0x60; }							\
+        fetch();									\
+	A= s;										\
+	/* only C is valid on NMOS 6502 */						\
+	setNVZC(s & 0x80, v, !A, (s >= 0x100));						\
+	tick(1);									\
+	next();										\
+      }											\
+  }
+/* sbc: as adc but subtracting; the borrow b is the inverse of the carry flag */
+#define sbc(ticks, adrmode)								\
+  adrmode(ticks);									\
+  {											\
+    byte B= getMemory(ea);								\
+    if (!getD())									\
+      {											\
+	int b= 1 - (P &0x01);								\
+	int c= A - B - b;								\
+	int v= (int8_t)A - (int8_t) B - b;						\
+	fetch();									\
+	A= c;										\
+	setNVZC(A & 0x80, ((A & 0x80) > 0) ^ ((v & 0x100) != 0), A == 0, c >= 0);	\
+	next();										\
+      }											\
+    else										\
+      {											\
+	/* Algorithm taken from http://www.6502.org/tutorials/decimal_mode.html */      \
+	int b= 1 - (P &0x01);								\
+	int l= (A & 0x0F) - (B & 0x0F) - b;	 					\
+	int s= A - B + getC() - 1;							\
+	int c= !(s & 0x100);								\
+	int v= (int8_t)A - (int8_t) B - b;						\
+      	if (s < 0) { s -= 0x60; } 							\
+	if (l < 0) { s -= 0x06; }							\
+	fetch(); 									\
+	A = s;										\
+	/* only C is valid on NMOS 6502 */						\
+	setNVZC(s & 0x80, ((v & 0x80) > 0) ^ ((v & 0x100) != 0), !A, c);		\
+	tick(1);									\
+	next();										\
+      }											\
+  }
+/* compare register R with memory: C is set when R >= operand; N and Z come from the 8-bit difference */
+#define cmpR(ticks, adrmode, R)			\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte B= getMemory(ea);			\
+    byte d= R - B;				\
+    setNZC(d & 0x80, !d, R >= B);		\
+  }						\
+  next();
+
+#define cmp(ticks, adrmode)	cmpR(ticks, adrmode, A)
+#define cpx(ticks, adrmode)	cmpR(ticks, adrmode, X)
+#define cpy(ticks, adrmode)	cmpR(ticks, adrmode, Y)
+
+#define dec(ticks, adrmode)			\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte B= getMemory(ea);			\
+    --B;					\
+    putMemory(ea, B);				\
+    setNZ(B & 0x80, !B);			\
+  }						\
+  next();
+/* register decrement; adrmode is accepted for table uniformity but never expanded */
+#define decR(ticks, adrmode, R)			\
+  fetch();					\
+  tick(ticks);					\
+  --R;						\
+  setNZ(R & 0x80, !R);				\
+  next();
+
+#define dea(ticks, adrmode)	decR(ticks, adrmode, A)
+#define dex(ticks, adrmode)	decR(ticks, adrmode, X)
+#define dey(ticks, adrmode)	decR(ticks, adrmode, Y)
+
+#define inc(ticks, adrmode)			\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte B= getMemory(ea);			\
+    ++B;					\
+    putMemory(ea, B);				\
+    setNZ(B & 0x80, !B);			\
+  }						\
+  next();
+/* register increment; adrmode is accepted for table uniformity but never expanded */
+#define incR(ticks, adrmode, R)			\
+  fetch();					\
+  tick(ticks);					\
+  ++R;						\
+  setNZ(R & 0x80, !R);				\
+  next();
+
+#define ina(ticks, adrmode)	incR(ticks, adrmode, A)
+#define inx(ticks, adrmode)	incR(ticks, adrmode, X)
+#define iny(ticks, adrmode)	incR(ticks, adrmode, Y)
+/* bit: N and V are copied from bits 7 and 6 of the operand, Z is set when A & operand is zero */
+#define bit(ticks, adrmode)			\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte B= getMemory(ea);			\
+    P= (P & ~(flagN | flagV | flagZ))		\
+      | (B & (0xC0)) | (((A & B) == 0) << 1);	\
+  }						\
+  next();
+
+/* BIT is unique in varying its behaviour based on addressing mode;
+ * BIT immediate only modifies the Z flag.
+ * http://6502.org/tutorials/65c02opcodes.html
+ */
+#define bim(ticks, adrmode)			\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte B= getMemory(ea);			\
+    setZ((A & B) == 0);                  	\
+  }						\
+  next();
+/* test and set bits: Z reflects A & memory before the bits of A are OR'd into memory */
+#define tsb(ticks, adrmode)			\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte b= getMemory(ea);			\
+    setZ(!(b & A));				\
+    b |= A;					\
+    putMemory(ea, b);				\
+  }						\
+  next();
+/* test and reset bits: Z as for tsb, then the bits of A are cleared in memory */
+#define trb(ticks, adrmode)			\
+  adrmode(ticks);				\
+  fetch();					\
+  {						\
+    byte b= getMemory(ea);			\
+    setZ(!(b & A));				\
+    b &= (A ^ 0xFF);				\
+    putMemory(ea, b);				\
+  }						\
+  next();
+/* shared body of and/eor/ora: A = A op memory, then N and Z from the new A */
+#define bitwise(ticks, adrmode, op)		\
+  adrmode(ticks);				\
+  fetch();					\
+  A op##= getMemory(ea);			\
+  setNZ(A & 0x80, !A);				\
+  next();
+
+#define and(ticks, adrmode)	bitwise(ticks, adrmode, &)
+#define eor(ticks, adrmode)	bitwise(ticks, adrmode, ^)
+#define ora(ticks, adrmode)	bitwise(ticks, adrmode, |)
+/* shift memory left: C gets the bit shifted out; Z is taken from the stored 8-bit result, not the 9-bit intermediate */
+#define asl(ticks, adrmode)			\
+  adrmode(ticks);				\
+  {						\
+    unsigned int i= getMemory(ea) << 1;		\
+    putMemory(ea, i);				\
+    fetch();					\
+    setNZC(i & 0x80, !(i & 0xFF), i >> 8);	\
+  }						\
+  next();
+/* the "a" variants of the shifts and rotates work on the accumulator and take no operand */
+#define asla(ticks, adrmode)			\
+  tick(ticks);					\
+  fetch();					\
+  {						\
+    int c= A >> 7;				\
+    A <<= 1;					\
+    setNZC(A & 0x80, !A, c);			\
+  }						\
+  next();
+
+#define lsr(ticks, adrmode)			\
+  adrmode(ticks);				\
+  {						\
+    byte b= getMemory(ea);			\
+    int  c= b & 1;				\
+    fetch();					\
+    b >>= 1;					\
+    putMemory(ea, b);				\
+    setNZC(0, !b, c);				\
+  }						\
+  next();
+
+#define lsra(ticks, adrmode)			\
+  tick(ticks);					\
+  fetch();					\
+  {						\
+    int c= A & 1;				\
+    A >>= 1;					\
+    setNZC(0, !A, c);				\
+  }						\
+  next();
+/* rotate left through carry: the old C enters bit 0 and bit 7 leaves into C */
+#define rol(ticks, adrmode)			\
+  adrmode(ticks);				\
+  {						\
+    word b= (getMemory(ea) << 1) | getC();	\
+    fetch();					\
+    putMemory(ea, b);				\
+    setNZC(b & 0x80, !(b & 0xFF), b >> 8);	\
+  }						\
+  next();
+
+#define rola(ticks, adrmode)			\
+  tick(ticks);					\
+  fetch();					\
+  {						\
+    word b= (A << 1) | getC();			\
+    A= b;					\
+    setNZC(A & 0x80, !A, b >> 8);		\
+  }						\
+  next();
+/* rotate right through carry: the old C enters bit 7 and bit 0 leaves into C */
+#define ror(ticks, adrmode)			\
+  adrmode(ticks);				\
+  {						\
+    int  c= getC();				\
+    byte m= getMemory(ea);			\
+    byte b= (c << 7) | (m >> 1);		\
+    fetch();					\
+    putMemory(ea, b);				\
+    setNZC(b & 0x80, !b, m & 1);		\
+  }						\
+  next();
+
+#define rora(ticks, adrmode)			\
+  adrmode(ticks);				\
+  {						\
+    int ci= getC();				\
+    int co= A & 1;				\
+    fetch();					\
+    A= (ci << 7) | (A >> 1);			\
+    setNZC(A & 0x80, !A, co);			\
+  }						\
+  next();
+/* register transfer R -> S, setting N and Z from the value moved */
+#define tRS(ticks, adrmode, R, S)		\
+  fetch();					\
+  tick(ticks);					\
+  S= R;						\
+  setNZ(S & 0x80, !S);				\
+  next();
+
+#define tax(ticks, adrmode)	tRS(ticks, adrmode, A, X)
+#define txa(ticks, adrmode)	tRS(ticks, adrmode, X, A)
+#define tay(ticks, adrmode)	tRS(ticks, adrmode, A, Y)
+#define tya(ticks, adrmode)	tRS(ticks, adrmode, Y, A)
+#define tsx(ticks, adrmode)	tRS(ticks, adrmode, S, X)
+/* txs is the one transfer that leaves the flags alone */
+#define txs(ticks, adrmode)			\
+  fetch();					\
+  tick(ticks);					\
+  S= X;						\
+  next();
+/* load R from ea and set N and Z from the loaded value */
+#define ldR(ticks, adrmode, R)			\
+  adrmode(ticks);				\
+  fetch();					\
+  R= getMemory(ea);				\
+  setNZ(R & 0x80, !R);				\
+  next();
+
+#define lda(ticks, adrmode)	ldR(ticks, adrmode, A)
+#define ldx(ticks, adrmode)	ldR(ticks, adrmode, X)
+#define ldy(ticks, adrmode)	ldR(ticks, adrmode, Y)
+/* store R at ea (stz passes the constant 0 as R); P is not touched */
+#define stR(ticks, adrmode, R)			\
+  adrmode(ticks);				\
+  fetch();					\
+  putMemory(ea, R);				\
+  next();
+
+#define sta(ticks, adrmode)	stR(ticks, adrmode, A)
+#define stx(ticks, adrmode)	stR(ticks, adrmode, X)
+#define sty(ticks, adrmode)	stR(ticks, adrmode, Y)
+#define stz(ticks, adrmode)	stR(ticks, adrmode, 0)
+
+/* We only set keep_running to false if we branch; this is just
+ * an attempt to pick points to JIT at which we have a chance of
+ * hitting a second time. 
+ */
+#define branch(ticks, adrmode, cond)		\
+  if (cond)					\
+    {						\
+      adrmode(ticks);				\
+      PC += ea;					\
+      tick(1);					\
+      keep_running= (instructions_left > 0);    \
+    }						\
+  else						\
+    {						\
+      tick(ticks);				\
+      PC++;					\
+    }						\
+  fetch();					\
+  next();
+
+#define bcc(ticks, adrmode)	branch(ticks, adrmode, !getC())
+#define bcs(ticks, adrmode)	branch(ticks, adrmode,  getC())
+#define bne(ticks, adrmode)	branch(ticks, adrmode, !getZ())
+#define beq(ticks, adrmode)	branch(ticks, adrmode,  getZ())
+#define bpl(ticks, adrmode)	branch(ticks, adrmode, !getN())
+#define bmi(ticks, adrmode)	branch(ticks, adrmode,  getN())
+#define bvc(ticks, adrmode)	branch(ticks, adrmode, !getV())
+#define bvs(ticks, adrmode)	branch(ticks, adrmode,  getV())
+/* unconditional branch: does what the taken arm of a conditional branch does */
+#define bra(ticks, adrmode)			\
+  adrmode(ticks);				\
+  PC += ea;					\
+  keep_running= (instructions_left > 0);        \
+  fetch();					\
+  tick(1);					\
+  next();
+/* jump: a call callback installed at the target is run first and may redirect PC by returning non-zero */
+#define jmp(ticks, adrmode)					\
+  {								\
+      adrmode(ticks);						\
+      byte opcode= mpu->memory[PC-3];                          	\
+      PC= ea;							\
+      if (mpu->callbacks->call[ea])				\
+	{							\
+	  word addr;						\
+	  externalise();					\
+	  if ((addr= mpu->callbacks->call[ea](mpu, ea, opcode)))\
+	    {							\
+	      internalise();					\
+	      PC= addr;						\
+	    }							\
+	}							\
+      keep_running= (instructions_left > 0);        		\
+      fetch();							\
+      next();							\
+  }
+/* call: pushes the address of the instruction's last byte (high byte first), then transfers like jmp */
+#define jsr(ticks, adrmode)				\
+  PC++;							\
+  push(PC >> 8);					\
+  push(PC & 0xff);					\
+  PC--;							\
+  adrmode(ticks);					\
+  if (mpu->callbacks->call[ea])				\
+    {							\
+      word addr;					\
+      externalise();					\
+      if ((addr= mpu->callbacks->call[ea](mpu, ea, 0x20))) \
+	{						\
+	  internalise();				\
+	  PC= addr;					\
+  	  keep_running= (instructions_left > 0);       	\
+	  fetch();					\
+	  next();					\
+	}						\
+    }							\
+  PC=ea;						\
+  keep_running= (instructions_left > 0);        	\
+  fetch();						\
+  next();
+/* return: pull the stacked address (low byte first) and resume at the byte after it */
+#define rts(ticks, adrmode)			\
+  tick(ticks);					\
+  PC  =  pop();					\
+  PC |= (pop() << 8);				\
+  PC++;						\
+  keep_running= (instructions_left > 0);       	\
+  fetch();					\
+  next();
+/* software interrupt through the vector at 0xfffe/0xffff; a call callback there receives the BRK's own address */
+#define brk(ticks, adrmode)					\
+  tick(ticks);							\
+  PC++;								\
+  push(PC >> 8);						\
+  push(PC & 0xff);						\
+  P |= flagB;							\
+  /* http://www.6502.org/tutorials/65c02opcodes.html - unlike
+   * the 6502, the 65C02 clears D on BRK.
+   */								\
+  P &= ~flagD;                                                  \
+  push(P | flagX);						\
+  P |= flagI;							\
+  {								\
+    word hdlr= getMemory(0xfffe) + (getMemory(0xffff) << 8);	\
+    if (mpu->callbacks->call[hdlr])				\
+      {								\
+	word addr;						\
+	externalise();						\
+	if ((addr= mpu->callbacks->call[hdlr](mpu, PC - 2, 0)))	\
+	  {							\
+	    internalise();					\
+	    hdlr= addr;						\
+	  }							\
+      }								\
+    PC= hdlr;							\
+  }								\
+  keep_running= (instructions_left > 0);       			\
+  fetch();							\
+  next();
+/* return from interrupt: pull P, then PC (used as pulled, without the increment rts applies) */
+#define rti(ticks, adrmode)			\
+  tick(ticks);					\
+  P=     pop();					\
+  PC=    pop();					\
+  PC |= (pop() << 8);				\
+  keep_running= (instructions_left > 0);       	\
+  fetch();					\
+  next();
+
+#define nop(ticks, adrmode)			\
+  fetch();					\
+  tick(ticks);					\
+  next();
+/* unassigned opcode: run its illegal_instruction callback if installed, else skip it and its operand */
+#define ill(ticks, adrmode)								\
+  {											\
+    word addr= PC-1;									\
+    byte instruction= memory[addr];							\
+    tick(ticks);									\
+    if (mpu->callbacks->illegal_instruction[instruction])				\
+      {											\
+	adrmode(ticks);									\
+	externalise();									\
+        if (addr= (mpu->callbacks->illegal_instruction[instruction](mpu, addr,          \
+								    instruction)))      \
+          {										\
+	    mpu->registers->pc= addr;							\
+          }										\
+	internalise();									\
+        fetch();									\
+	next();										\
+      }											\
+    else										\
+      {											\
+        adrmode(ticks);                                                                 \
+        fetch();                                                                        \
+        next();                                                                         \
+      }											\
+  };
+/* push R; php passes P with the unused and B bits forced on */
+#define phR(ticks, adrmode, R)			\
+  fetch();					\
+  tick(ticks);					\
+  push(R);					\
+  next();
+
+#define pha(ticks, adrmode)	phR(ticks, adrmode, A)
+#define phx(ticks, adrmode)	phR(ticks, adrmode, X)
+#define phy(ticks, adrmode)	phR(ticks, adrmode, Y)
+#define php(ticks, adrmode)	phR(ticks, adrmode, P | flagX | flagB)
+/* pull into R and set N and Z from the value pulled */
+#define plR(ticks, adrmode, R)			\
+  fetch();					\
+  tick(ticks);					\
+  R= pop();					\
+  setNZ(R & 0x80, !R);				\
+  next();
+
+#define pla(ticks, adrmode)	plR(ticks, adrmode, A)
+#define plx(ticks, adrmode)	plR(ticks, adrmode, X)
+#define ply(ticks, adrmode)	plR(ticks, adrmode, Y)
+/* plp replaces P with the pulled byte as-is */
+#define plp(ticks, adrmode)			\
+  fetch();					\
+  tick(ticks);					\
+  P= pop();					\
+  next();
+/* clear flag F in P */
+#define clF(ticks, adrmode, F)			\
+  fetch();					\
+  tick(ticks);					\
+  P &= ~F;					\
+  next();
+
+#define clc(ticks, adrmode)	clF(ticks, adrmode, flagC)
+#define cld(ticks, adrmode)	clF(ticks, adrmode, flagD)
+#define cli(ticks, adrmode)	clF(ticks, adrmode, flagI)
+#define clv(ticks, adrmode)	clF(ticks, adrmode, flagV)
+/* set flag F in P */
+#define seF(ticks, adrmode, F)			\
+  fetch();					\
+  tick(ticks);					\
+  P |= F;					\
+  next();
+
+#define sec(ticks, adrmode)	seF(ticks, adrmode, flagC)
+#define sed(ticks, adrmode)	seF(ticks, adrmode, flagD)
+#define sei(ticks, adrmode)	seF(ticks, adrmode, flagI)
+/* opcode table: applies _(opcode, instruction, addressing mode, ticks) once for each of the 256 opcodes */
+#define do_insns(_)												\
+  _(00, brk, implied,   7);  _(01, ora, indx,      6);  _(02, ill, zp,        2);  _(03, ill, implied, 2);      \
+  _(04, tsb, zp,        3);  _(05, ora, zp,        3);  _(06, asl, zp,        5);  _(07, ill, implied, 2);      \
+  _(08, php, implied,   3);  _(09, ora, immediate, 3);  _(0a, asla,implied,   2);  _(0b, ill, implied, 2);      \
+  _(0c, tsb, abs,       4);  _(0d, ora, abs,       4);  _(0e, asl, abs,       6);  _(0f, ill, implied, 2);      \
+  _(10, bpl, relative,  2);  _(11, ora, indy,      5);  _(12, ora, indzp,     3);  _(13, ill, implied, 2);      \
+  _(14, trb, zp,        3);  _(15, ora, zpx,       4);  _(16, asl, zpx,       6);  _(17, ill, implied, 2);      \
+  _(18, clc, implied,   2);  _(19, ora, absy,      4);  _(1a, ina, implied,   2);  _(1b, ill, implied, 2);      \
+  _(1c, trb, abs,       4);  _(1d, ora, absx,      4);  _(1e, asl, absx,      7);  _(1f, ill, implied, 2);      \
+  _(20, jsr, abs,       6);  _(21, and, indx,      6);  _(22, ill, zp,        2);  _(23, ill, implied, 2);      \
+  _(24, bit, zp,        3);  _(25, and, zp,        3);  _(26, rol, zp,        5);  _(27, ill, implied, 2);      \
+  _(28, plp, implied,   4);  _(29, and, immediate, 3);  _(2a, rola,implied,   2);  _(2b, ill, implied, 2);      \
+  _(2c, bit, abs,       4);  _(2d, and, abs,       4);  _(2e, rol, abs,       6);  _(2f, ill, implied, 2);      \
+  _(30, bmi, relative,  2);  _(31, and, indy,      5);  _(32, and, indzp,     3);  _(33, ill, implied, 2);      \
+  _(34, bit, zpx,       4);  _(35, and, zpx,       4);  _(36, rol, zpx,       6);  _(37, ill, implied, 2);      \
+  _(38, sec, implied,   2);  _(39, and, absy,      4);  _(3a, dea, implied,   2);  _(3b, ill, implied, 2);      \
+  _(3c, bit, absx,      4);  _(3d, and, absx,      4);  _(3e, rol, absx,      7);  _(3f, ill, implied, 2);      \
+  _(40, rti, implied,   6);  _(41, eor, indx,      6);  _(42, ill, zp,        2);  _(43, ill, implied, 2);      \
+  _(44, ill, zp,        3);  _(45, eor, zp,        3);  _(46, lsr, zp,        5);  _(47, ill, implied, 2);      \
+  _(48, pha, implied,   3);  _(49, eor, immediate, 3);  _(4a, lsra,implied,   2);  _(4b, ill, implied, 2);      \
+  _(4c, jmp, abs,       3);  _(4d, eor, abs,       4);  _(4e, lsr, abs,       6);  _(4f, ill, implied, 2);      \
+  _(50, bvc, relative,  2);  _(51, eor, indy,      5);  _(52, eor, indzp,     3);  _(53, ill, implied, 2);      \
+  _(54, ill, zp,        4);  _(55, eor, zpx,       4);  _(56, lsr, zpx,       6);  _(57, ill, implied, 2);      \
+  _(58, cli, implied,   2);  _(59, eor, absy,      4);  _(5a, phy, implied,   3);  _(5b, ill, implied, 2);      \
+  _(5c, ill, abs,       8);  _(5d, eor, absx,      4);  _(5e, lsr, absx,      7);  _(5f, ill, implied, 2);      \
+  _(60, rts, implied,   6);  _(61, adc, indx,      6);  _(62, ill, zp,        2);  _(63, ill, implied, 2);      \
+  _(64, stz, zp,        3);  _(65, adc, zp,        3);  _(66, ror, zp,        5);  _(67, ill, implied, 2);      \
+  _(68, pla, implied,   4);  _(69, adc, immediate, 3);  _(6a, rora,implied,   2);  _(6b, ill, implied, 2);      \
+  _(6c, jmp, indirect,  5);  _(6d, adc, abs,       4);  _(6e, ror, abs,       6);  _(6f, ill, implied, 2);      \
+  _(70, bvs, relative,  2);  _(71, adc, indy,      5);  _(72, adc, indzp,     3);  _(73, ill, implied, 2);      \
+  _(74, stz, zpx,       4);  _(75, adc, zpx,       4);  _(76, ror, zpx,       6);  _(77, ill, implied, 2);      \
+  _(78, sei, implied,   2);  _(79, adc, absy,      4);  _(7a, ply, implied,   4);  _(7b, ill, implied, 2);      \
+  _(7c, jmp, indabsx,   6);  _(7d, adc, absx,      4);  _(7e, ror, absx,      7);  _(7f, ill, implied, 2);      \
+  _(80, bra, relative,  2);  _(81, sta, indx,      6);  _(82, ill, zp,        2);  _(83, ill, implied, 2);      \
+  _(84, sty, zp,        2);  _(85, sta, zp,        2);  _(86, stx, zp,        2);  _(87, ill, implied, 2);      \
+  _(88, dey, implied,   2);  _(89, bim, immediate, 2);  _(8a, txa, implied,   2);  _(8b, ill, implied, 2);      \
+  _(8c, sty, abs,       4);  _(8d, sta, abs,       4);  _(8e, stx, abs,       4);  _(8f, ill, implied, 2);      \
+  _(90, bcc, relative,  2);  _(91, sta, indy,      6);  _(92, sta, indzp,     3);  _(93, ill, implied, 2);      \
+  _(94, sty, zpx,       4);  _(95, sta, zpx,       4);  _(96, stx, zpy,       4);  _(97, ill, implied, 2);      \
+  _(98, tya, implied,   2);  _(99, sta, absy,      5);  _(9a, txs, implied,   2);  _(9b, ill, implied, 2);      \
+  _(9c, stz, abs,       4);  _(9d, sta, absx,      5);  _(9e, stz, absx,      5);  _(9f, ill, implied, 2);      \
+  _(a0, ldy, immediate, 3);  _(a1, lda, indx,      6);  _(a2, ldx, immediate, 3);  _(a3, ill, implied, 2);      \
+  _(a4, ldy, zp,        3);  _(a5, lda, zp,        3);  _(a6, ldx, zp,        3);  _(a7, ill, implied, 2);      \
+  _(a8, tay, implied,   2);  _(a9, lda, immediate, 3);  _(aa, tax, implied,   2);  _(ab, ill, implied, 2);      \
+  _(ac, ldy, abs,       4);  _(ad, lda, abs,       4);  _(ae, ldx, abs,       4);  _(af, ill, implied, 2);      \
+  _(b0, bcs, relative,  2);  _(b1, lda, indy,      5);  _(b2, lda, indzp,     3);  _(b3, ill, implied, 2);      \
+  _(b4, ldy, zpx,       4);  _(b5, lda, zpx,       4);  _(b6, ldx, zpy,       4);  _(b7, ill, implied, 2);      \
+  _(b8, clv, implied,   2);  _(b9, lda, absy,      4);  _(ba, tsx, implied,   2);  _(bb, ill, implied, 2);      \
+  _(bc, ldy, absx,      4);  _(bd, lda, absx,      4);  _(be, ldx, absy,      4);  _(bf, ill, implied, 2);      \
+  _(c0, cpy, immediate, 3);  _(c1, cmp, indx,      6);  _(c2, ill, zp,        2);  _(c3, ill, implied, 2);      \
+  _(c4, cpy, zp,        3);  _(c5, cmp, zp,        3);  _(c6, dec, zp,        5);  _(c7, ill, implied, 2);      \
+  _(c8, iny, implied,   2);  _(c9, cmp, immediate, 3);  _(ca, dex, implied,   2);  _(cb, ill, implied, 2);      \
+  _(cc, cpy, abs,       4);  _(cd, cmp, abs,       4);  _(ce, dec, abs,       6);  _(cf, ill, implied, 2);      \
+  _(d0, bne, relative,  2);  _(d1, cmp, indy,      5);  _(d2, cmp, indzp,     3);  _(d3, ill, implied, 2);      \
+  _(d4, ill, zp,        4);  _(d5, cmp, zpx,       4);  _(d6, dec, zpx,       6);  _(d7, ill, implied, 2);      \
+  _(d8, cld, implied,   2);  _(d9, cmp, absy,      4);  _(da, phx, implied,   3);  _(db, ill, implied, 2);      \
+  _(dc, ill, abs,       4);  _(dd, cmp, absx,      4);  _(de, dec, absx,      7);  _(df, ill, implied, 2);      \
+  _(e0, cpx, immediate, 3);  _(e1, sbc, indx,      6);  _(e2, ill, zp,        2);  _(e3, ill, implied, 2);      \
+  _(e4, cpx, zp,        3);  _(e5, sbc, zp,        3);  _(e6, inc, zp,        5);  _(e7, ill, implied, 2);      \
+  _(e8, inx, implied,   2);  _(e9, sbc, immediate, 3);  _(ea, nop, implied,   2);  _(eb, ill, implied, 2);      \
+  _(ec, cpx, abs,       4);  _(ed, sbc, abs,       4);  _(ee, inc, abs,       6);  _(ef, ill, implied, 2);      \
+  _(f0, beq, relative,  2);  _(f1, sbc, indy,      5);  _(f2, sbc, indzp,     3);  _(f3, ill, implied, 2);      \
+  _(f4, ill, zp,        4);  _(f5, sbc, zpx,       4);  _(f6, inc, zpx,       6);  _(f7, ill, implied, 2);      \
+  _(f8, sed, implied,   2);  _(f9, sbc, absy,      4);  _(fa, plx, implied,   4);  _(fb, ill, implied, 2);      \
+  _(fc, ill, abs,       4);  _(fd, sbc, absx,      4);  _(fe, inc, absx,      7);  _(ff, ill, implied, 2);
+
+
+/* maskable interrupt: ignored while I is set; otherwise push PC and P, clear B, set I, go to the IRQ vector */
+void M6502_irq(M6502 *mpu)
+{
+  M6502_Registers *r= mpu->registers;
+  byte *stack= mpu->memory + 0x0100;
+
+  if (r->p & flagI) return;
+  stack[r->s--]= (byte)(r->pc >> 8);
+  stack[r->s--]= (byte)(r->pc & 0xff);
+  stack[r->s--]= r->p;
+  r->p= (r->p & ~flagB) | flagI;
+  r->pc= M6502_getVector(mpu, IRQ);
+}
+
+/* non-maskable interrupt: push PC and P, clear B, set I and continue at the NMI vector */
+void M6502_nmi(M6502 *mpu)
+{
+  M6502_Registers *r= mpu->registers;
+  mpu->memory[0x0100 + r->s--]= (byte)(r->pc >> 8);
+  mpu->memory[0x0100 + r->s--]= (byte)(r->pc & 0xff);
+  mpu->memory[0x0100 + r->s--]= r->p;
+  r->p= (r->p & ~flagB) | flagI;
+  r->pc= M6502_getVector(mpu, NMI);
+}
+
+/* reset: clear D, set I and load PC from the RST vector; the other registers are left alone */
+void M6502_reset(M6502 *mpu)
+{
+  M6502_Registers *r= mpu->registers;
+  r->p= (r->p & ~flagD) | flagI;
+  r->pc= M6502_getVector(mpu, RST);
+}
+
+
+/* the compiler should eliminate all calls to this function */
+
+static void oops(void)
+{
+  fputs("\noops -- instruction dispatch missing\n", stderr);	/* diagnostic only; execution continues */
+}
+/* Interpret from mpu->registers->pc, keeping the registers in locals and writing them back on return.
+ * Once instructions_left is used up, the next taken branch, bra, jmp, jsr, rts, brk or rti ends the run. */
+void M6502_run_interpreted(M6502 *mpu, int instructions_left)
+{
+  int keep_running= 1;
+  /* GNU C build: threaded dispatch through a table of label addresses (computed goto) */
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+
+  static void *itab[256]= { &&_00, &&_01, &&_02, &&_03, &&_04, &&_05, &&_06, &&_07, &&_08, &&_09, &&_0a, &&_0b, &&_0c, &&_0d, &&_0e, &&_0f,
+			    &&_10, &&_11, &&_12, &&_13, &&_14, &&_15, &&_16, &&_17, &&_18, &&_19, &&_1a, &&_1b, &&_1c, &&_1d, &&_1e, &&_1f,
+			    &&_20, &&_21, &&_22, &&_23, &&_24, &&_25, &&_26, &&_27, &&_28, &&_29, &&_2a, &&_2b, &&_2c, &&_2d, &&_2e, &&_2f,
+			    &&_30, &&_31, &&_32, &&_33, &&_34, &&_35, &&_36, &&_37, &&_38, &&_39, &&_3a, &&_3b, &&_3c, &&_3d, &&_3e, &&_3f,
+			    &&_40, &&_41, &&_42, &&_43, &&_44, &&_45, &&_46, &&_47, &&_48, &&_49, &&_4a, &&_4b, &&_4c, &&_4d, &&_4e, &&_4f,
+			    &&_50, &&_51, &&_52, &&_53, &&_54, &&_55, &&_56, &&_57, &&_58, &&_59, &&_5a, &&_5b, &&_5c, &&_5d, &&_5e, &&_5f,
+			    &&_60, &&_61, &&_62, &&_63, &&_64, &&_65, &&_66, &&_67, &&_68, &&_69, &&_6a, &&_6b, &&_6c, &&_6d, &&_6e, &&_6f,
+			    &&_70, &&_71, &&_72, &&_73, &&_74, &&_75, &&_76, &&_77, &&_78, &&_79, &&_7a, &&_7b, &&_7c, &&_7d, &&_7e, &&_7f,
+			    &&_80, &&_81, &&_82, &&_83, &&_84, &&_85, &&_86, &&_87, &&_88, &&_89, &&_8a, &&_8b, &&_8c, &&_8d, &&_8e, &&_8f,
+			    &&_90, &&_91, &&_92, &&_93, &&_94, &&_95, &&_96, &&_97, &&_98, &&_99, &&_9a, &&_9b, &&_9c, &&_9d, &&_9e, &&_9f,
+			    &&_a0, &&_a1, &&_a2, &&_a3, &&_a4, &&_a5, &&_a6, &&_a7, &&_a8, &&_a9, &&_aa, &&_ab, &&_ac, &&_ad, &&_ae, &&_af,
+			    &&_b0, &&_b1, &&_b2, &&_b3, &&_b4, &&_b5, &&_b6, &&_b7, &&_b8, &&_b9, &&_ba, &&_bb, &&_bc, &&_bd, &&_be, &&_bf,
+			    &&_c0, &&_c1, &&_c2, &&_c3, &&_c4, &&_c5, &&_c6, &&_c7, &&_c8, &&_c9, &&_ca, &&_cb, &&_cc, &&_cd, &&_ce, &&_cf,
+			    &&_d0, &&_d1, &&_d2, &&_d3, &&_d4, &&_d5, &&_d6, &&_d7, &&_d8, &&_d9, &&_da, &&_db, &&_dc, &&_dd, &&_de, &&_df,
+			    &&_e0, &&_e1, &&_e2, &&_e3, &&_e4, &&_e5, &&_e6, &&_e7, &&_e8, &&_e9, &&_ea, &&_eb, &&_ec, &&_ed, &&_ee, &&_ef,
+			    &&_f0, &&_f1, &&_f2, &&_f3, &&_f4, &&_f5, &&_f6, &&_f7, &&_f8, &&_f9, &&_fa, &&_fb, &&_fc, &&_fd, &&_fe, &&_ff };
+  /* tpc holds the handler address for the opcode most recently read by fetch() */
+  register void **itabp= &itab[0];
+  register void  *tpc;
+  /* fetch() reads the next opcode and advances PC, so end() steps PC back by one on exit */
+# define begin()				++instructions_left;  fetch();  next()
+# define fetch()				tpc= itabp[memory[PC++]]
+# define next()					--instructions_left;  if (keep_running) goto *tpc; else goto done
+# define dispatch(num, name, mode, cycles)	_##num: name(cycles, mode) oops();  next()
+# define end()					done: --PC
+
+#else /* (!__GNUC__) || (__STRICT_ANSI__) */
+  /* portable build: a for loop around one switch on the opcode; fetch() is a no-op here */
+# define begin()				for (;keep_running;--instructions_left) switch (memory[PC++]) {
+# define fetch()
+# define next()					break
+# define dispatch(num, name, mode, cycles)	case 0x##num: name(cycles, mode);  next()
+# define end()					}
+
+#endif
+
+  register byte  *memory= mpu->memory;
+  register word   PC;
+  word		  ea;
+  byte		  A, X, Y, P, S;
+  M6502_Callback *readCallback=  mpu->callbacks->read;
+  M6502_Callback *writeCallback= mpu->callbacks->write;
+  /* copy the registers from (internalise) or back to (externalise) mpu->registers */
+# define internalise()	A= mpu->registers->a;  X= mpu->registers->x;  Y= mpu->registers->y;  P= mpu->registers->p;  S= mpu->registers->s;  PC= mpu->registers->pc
+# define externalise()	mpu->registers->a= A;  mpu->registers->x= X;  mpu->registers->y= Y;  mpu->registers->p= P;  mpu->registers->s= S;  mpu->registers->pc= PC
+
+  internalise();
+
+  begin();
+  do_insns(dispatch);
+  end();
+
+  externalise();
+
+# undef begin
+# undef internalise
+# undef externalise
+# undef fetch
+# undef next
+# undef dispatch
+# undef end
+}
+/* Write the disassembly of the instruction at ip into buffer and return its length in bytes (1-3).
+ * Operand reads and relative branch targets wrap within the 64K address space. */
+int M6502_disassemble(M6502 *mpu, word ip, char buffer[64])
+{
+  char *s= buffer;
+  byte b[3]= { mpu->memory[ip], mpu->memory[(ip + 1) & 0xffff], mpu->memory[(ip + 2) & 0xffff] };
+
+  switch (b[0])
+    {
+#    define _implied							    return 1;
+#    define _immediate	sprintf(s, "#%02X",	   b[1]);		    return 2;
+#    define _zp		sprintf(s, "%02X",	   b[1]);		    return 2;
+#    define _zpx	sprintf(s, "%02X,X",	   b[1]);		    return 2;
+#    define _zpy	sprintf(s, "%02X,Y",	   b[1]);		    return 2;
+#    define _abs	sprintf(s, "%02X%02X",	   b[2], b[1]);		    return 3;
+#    define _absx	sprintf(s, "%02X%02X,X",   b[2], b[1]);		    return 3;
+#    define _absy	sprintf(s, "%02X%02X,Y",   b[2], b[1]);		    return 3;
+#    define _relative	sprintf(s, "%04X",	   (ip + 2 + (int8_t)b[1]) & 0xffff);  return 2;
+#    define _indirect	sprintf(s, "(%02X%02X)",   b[2], b[1]);		    return 3;
+#    define _indzp	sprintf(s, "(%02X)",	   b[1]);		    return 2;
+#    define _indx	sprintf(s, "(%02X,X)",	   b[1]);		    return 2;
+#    define _indy	sprintf(s, "(%02X),Y",	   b[1]);		    return 2;
+#    define _indabsx	sprintf(s, "(%02X%02X,X)", b[2], b[1]);		    return 3;
+
+#    define disassemble(num, name, mode, cycles) case 0x##num: s += sprintf(s, "%s ", #name); _##mode
+      do_insns(disassemble);
+#    undef disassemble
+    }
+
+  return 0;
+}
+
+/* Format PC, SP (as 0x0100 + s), A, X, Y and P into buffer, followed by one letter or '-' per flag bit. */
+void M6502_dump(M6502 *mpu, char buffer[64])
+{
+  const M6502_Registers *regs= mpu->registers;
+  const unsigned flags= regs->p;
+# define FLAG(BIT, CH) (((flags >> (BIT)) & 1) ? (CH) : '-')
+  sprintf(buffer, "PC=%04X SP=%04X A=%02X X=%02X Y=%02X P=%02X %c%c%c%c%c%c%c%c",
+	  regs->pc, 0x0100 + regs->s,
+	  regs->a, regs->x, regs->y, regs->p,
+	  FLAG(7,'N'), FLAG(6,'V'), FLAG(5,'?'), FLAG(4,'B'), FLAG(3,'D'), FLAG(2,'I'), FLAG(1,'Z'), FLAG(0,'C'));
+# undef FLAG
+}
diff --git a/lib6502.h b/lib6502.h
new file mode 100644
index 0000000..41fc9f2
--- /dev/null
+++ b/lib6502.h
@@ -0,0 +1,120 @@
+/* lib6502.h -- MOS Technology 6502 emulator	-*- C -*- */
+
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef __m6502_h
+#define __m6502_h
+
+#include <stdio.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+  extern "C"
+{
+#endif
+
+typedef struct _M6502		M6502;
+typedef struct _M6502_Registers	M6502_Registers;
+typedef struct _M6502_Callbacks	M6502_Callbacks;
+typedef struct _M6502_Internal  M6502_Internal;	/* only declared, never defined, in this header */
+/* Common signature of every callback stored in the tables below. */
+typedef int   (*M6502_Callback)(M6502 *mpu, uint16_t address, uint8_t data);
+/* Table sizes: 0x10000 callback slots (one per 16-bit address), 0x100 callback slots, and a 64K byte array. */
+typedef M6502_Callback	M6502_CallbackTable[0x10000];
+typedef M6502_Callback	M6502_IllegalInstructionCallbackTable[0x100];
+typedef uint8_t		M6502_Memory[0x10000];
+
+enum {	/* Memory addresses of the NMI, RST and IRQ vectors; each is stored low byte (LSB) first, high byte (MSB) one above. */
+  M6502_NMIVector= 0xfffa,  M6502_NMIVectorLSB= 0xfffa,  M6502_NMIVectorMSB= 0xfffb,
+  M6502_RSTVector= 0xfffc,  M6502_RSTVectorLSB= 0xfffc,  M6502_RSTVectorMSB= 0xfffd,
+  M6502_IRQVector= 0xfffe,  M6502_IRQVectorLSB= 0xfffe,  M6502_IRQVectorMSB= 0xffff
+};
+
+struct _M6502_Registers	/* CPU register file; reached through M6502.registers */
+{
+  uint8_t   a;	/* accumulator */
+  uint8_t   x;	/* X index register */
+  uint8_t   y;	/* Y index register */
+  uint8_t   p;	/* processor status register; M6502_dump shows bits 7..0 as NV?BDIZC */
+  uint8_t   s;	/* stack pointer; M6502_dump shows it as 0x0100 + s */
+  uint16_t pc;	/* program counter */
+};
+
+struct _M6502_Callbacks	/* callback tables; accessed through M6502_getCallback / M6502_setCallback */
+{
+  M6502_CallbackTable read;	/* 0x10000 slots, indexed by address */
+  M6502_CallbackTable write;	/* 0x10000 slots, indexed by address */
+  M6502_CallbackTable call;	/* 0x10000 slots, indexed by address */
+  M6502_IllegalInstructionCallbackTable illegal_instruction;	/* 0x100 slots; presumably indexed by opcode byte -- confirm */
+};
+
+struct _M6502_Internal;
+
+struct _M6502
+{
+  M6502_Registers *registers;	/* CPU register file */
+  uint8_t	  *memory;	/* emulated memory; the vector macros index it with addresses up to 0xffff */
+  M6502_Callbacks *callbacks;	/* read/write/call/illegal_instruction callback tables */
+  unsigned int	   flags;	/* NOTE(review): presumably a mask of the M6502_*Allocated bits -- confirm */
+
+  /* The following is implementation-specific; client code should only use the
+   * above members.
+   */
+  M6502_Internal  *internal;
+};
+
+enum {	/* distinct single-bit values; NOTE(review): presumably OR-ed into M6502.flags to record what the library allocated -- confirm */
+  M6502_RegistersAllocated = 1 << 0,
+  M6502_MemoryAllocated    = 1 << 1,
+  M6502_CallbacksAllocated = 1 << 2
+};
+
+typedef enum {	/* execution mode passed to M6502_setMode */
+  M6502_ModeInterpreted,
+  M6502_ModeCompiled,
+  M6502_ModeHybrid
+} M6502_Mode;
+
+extern M6502 *M6502_new(M6502_Registers *registers, M6502_Memory memory, M6502_Callbacks *callbacks);
+extern void   M6502_reset(M6502 *mpu);
+extern void   M6502_nmi(M6502 *mpu);
+extern void   M6502_irq(M6502 *mpu);
+extern void   M6502_run(M6502 *mpu);
+extern int    M6502_disassemble(M6502 *mpu, uint16_t addr, char buffer[64]);	/* writes mnemonic + operand for the opcode at addr; returns its length in bytes, 0 if not decoded */
+extern void   M6502_dump(M6502 *mpu, char buffer[64]);	/* formats PC, SP, A, X, Y, P and the flag letters into buffer */
+extern void   M6502_delete(M6502 *mpu);
+extern void   M6502_setMode(M6502 *mpu, M6502_Mode mode, int arg);	/* meaning of arg is not visible in this header -- see the implementation */
+
+#define M6502_getVector(MPU, VEC)			\
+  ( ( ((MPU)->memory[M6502_##VEC##VectorLSB]) )		\
+    | ((MPU)->memory[M6502_##VEC##VectorMSB] << 8) )
+/* getVector (above) reads and setVector (below) writes a 16-bit vector in memory, low byte first; VEC is NMI, RST or IRQ.  MPU and ADDR are each evaluated twice. */
+#define M6502_setVector(MPU, VEC, ADDR)						\
+  ( ( ((MPU)->memory[M6502_##VEC##VectorLSB]= ((uint8_t)(ADDR)) & 0xff) )	\
+    , ((MPU)->memory[M6502_##VEC##VectorMSB]= (uint8_t)((ADDR) >> 8)) )
+
+#define M6502_getCallback(MPU, TYPE, ADDR)	((MPU)->callbacks->TYPE[ADDR])	/* TYPE is a member name of M6502_Callbacks, ADDR the slot index */
+#define M6502_setCallback(MPU, TYPE, ADDR, FN)	((MPU)->callbacks->TYPE[ADDR]= (FN))	/* stores FN in that slot; evaluates to the stored pointer */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __m6502_h */
diff --git a/m4/boost.m4 b/m4/boost.m4
new file mode 100644
index 0000000..0a46b0e
--- /dev/null
+++ b/m4/boost.m4
@@ -0,0 +1,1338 @@
+# boost.m4: Locate Boost headers and libraries for autoconf-based projects.
+# Copyright (C) 2007-2011, 2014  Benoit Sigoure <tsuna@lrde.epita.fr>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Additional permission under section 7 of the GNU General Public
+# License, version 3 ("GPLv3"):
+#
+# If you convey this file as part of a work that contains a
+# configuration script generated by Autoconf, you may do so under
+# terms of your choice.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+m4_define([_BOOST_SERIAL], [m4_translit([
+# serial 22
+], [#
+], [])]) # expands to " serial 22": m4_translit deletes the '#' and the newlines
+
+# Original sources can be found at http://github.com/tsuna/boost.m4
+# You can fetch the latest version of the script by doing:
+#   wget http://github.com/tsuna/boost.m4/raw/master/build-aux/boost.m4
+
+# ------ #
+# README #
+# ------ #
+
+# This file provides several macros to use the various Boost libraries.
+# The first macro is BOOST_REQUIRE.  It will simply check if it's possible to
+# find the Boost headers of a given (optional) minimum version and it will
+# define BOOST_CPPFLAGS accordingly.  It will add an option --with-boost to
+# your configure so that users can specify non standard locations.
+# If the user's environment contains BOOST_ROOT and --with-boost was not
+# specified, --with-boost=$BOOST_ROOT is implicitly used.
+# For more README and documentation, go to http://github.com/tsuna/boost.m4
+# Note: THESE MACROS ASSUME THAT YOU USE LIBTOOL.  If you don't, don't worry,
+# simply read the README, it will show you what to do step by step.
+
+m4_pattern_forbid([^_?(BOOST|Boost)_]) # a BOOST_*/Boost_* token (optionally _-prefixed) left unexpanded in the output is an error
+
+
+# _BOOST_SED_CPP(SED-PROGRAM, PROGRAM,
+#                [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+# --------------------------------------------------------
+# Same as AC_EGREP_CPP, but leave the sed output in conftest.i for the ACTIONs (conftest* is removed right after them).
+#
+# SED-PROGRAM is *not* overquoted, as in AC_EGREP_CPP.  It is expanded
+# in double-quotes, so escape your double quotes.
+#
+# It could be useful to turn this into a macro which extracts the
+# value of any macro.
+m4_define([_BOOST_SED_CPP],
+[AC_LANG_PUSH([C++])dnl
+AC_LANG_PREPROC_REQUIRE()dnl
+AC_REQUIRE([AC_PROG_SED])dnl
+AC_LANG_CONFTEST([AC_LANG_SOURCE([[$2]])])
+AS_IF([dnl eval is necessary to expand ac_cpp.
+dnl Ultrix and Pyramid sh refuse to redirect output of eval, so use subshell.
+dnl Beware of Windows end-of-lines, for instance if we are running
+dnl some Windows programs under Wine.  In that case, boost/version.hpp
+dnl is certainly using "\r\n", but the regular Unix shell will only
+dnl strip `\n' with backquotes, not the `\r'.  This results in
+dnl boost_cv_lib_version='1_37\r' for instance, which breaks
+dnl everything else.
+dnl Cannot use 'dnl' after [$4] because a trailing dnl may break AC_CACHE_CHECK
+(eval "$ac_cpp conftest.$ac_ext") 2>&AS_MESSAGE_LOG_FD |
+  tr -d '\r' |
+  $SED -n -e "$1" >conftest.i 2>&1],
+  [$3],
+  [$4])
+rm -rf conftest*
+AC_LANG_POP([C++])dnl
+])# _BOOST_SED_CPP
+
+
+
+# BOOST_REQUIRE([VERSION], [ACTION-IF-NOT-FOUND])
+# -----------------------------------------------
+# Look for Boost.  If version is given, it must either be a literal of the form
+# "X.Y.Z" where X, Y and Z are integers (the ".Z" part being optional) or a
+# variable "$var".
+# Defines the value BOOST_CPPFLAGS.  This macro only checks for headers with
+# the required version, it does not check for any of the Boost libraries.
+# On success, defines HAVE_BOOST.  On failure, calls the optional
+# ACTION-IF-NOT-FOUND action if one was supplied.
+# Otherwise aborts with an error message.
+AC_DEFUN([BOOST_REQUIRE],
+[AC_REQUIRE([AC_PROG_CXX])dnl
+AC_REQUIRE([AC_PROG_GREP])dnl
+echo "$as_me: this is boost.m4[]_BOOST_SERIAL" >&AS_MESSAGE_LOG_FD
+boost_save_IFS=$IFS
+boost_version_req=$1
+IFS=.
+set x $boost_version_req 0 0 0
+IFS=$boost_save_IFS
+shift
+boost_version_req=`expr "$[1]" '*' 100000 + "$[2]" '*' 100 + "$[3]"`
+boost_version_req_string=$[1].$[2].$[3]
+AC_ARG_WITH([boost],
+   [AS_HELP_STRING([--with-boost=DIR],
+                   [prefix of Boost $1 @<:@guess@:>@])])dnl
+AC_ARG_VAR([BOOST_ROOT],[Location of Boost installation])dnl
+# If BOOST_ROOT is set and the user has not provided a value to
+# --with-boost, then treat BOOST_ROOT as if the user had supplied it.
+if test x"$BOOST_ROOT" != x; then
+  if test x"$with_boost" = x; then
+    AC_MSG_NOTICE([Detected BOOST_ROOT; continuing with --with-boost=$BOOST_ROOT])
+    with_boost=$BOOST_ROOT
+  else
+    AC_MSG_NOTICE([Detected BOOST_ROOT=$BOOST_ROOT, but overridden by --with-boost=$with_boost])
+  fi
+fi
+AC_SUBST([DISTCHECK_CONFIGURE_FLAGS],
+         ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"])dnl
+boost_save_CPPFLAGS=$CPPFLAGS
+  AC_CACHE_CHECK([for Boost headers version >= $boost_version_req_string],
+    [boost_cv_inc_path],
+    [boost_cv_inc_path=no
+AC_LANG_PUSH([C++])dnl
+m4_pattern_allow([^BOOST_VERSION$])dnl
+    AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <boost/version.hpp>
+#if !defined BOOST_VERSION
+# error BOOST_VERSION is not defined
+#elif BOOST_VERSION < $boost_version_req
+# error Boost headers version < $boost_version_req
+#endif
+]])])
+    # If the user provided a value to --with-boost, use it and only it.
+    case $with_boost in #(
+      ''|yes) set x '' /opt/local/include /usr/local/include /opt/include \
+                 /usr/include C:/Boost/include;; #(
+      *)      set x "$with_boost/include" "$with_boost";;
+    esac
+    shift
+    for boost_dir
+    do
+    # Without --layout=system, Boost (or at least some versions) installs
+    # itself in <prefix>/include/boost-<version>.  This inner loop helps to
+    # find headers in such directories.
+    #
+    # Any ${boost_dir}/boost-x_xx directories are searched in reverse version
+    # order followed by ${boost_dir}.  The final '.' is a sentinel for
+    # searching $boost_dir itself.  Entries are whitespace separated.
+    #
+    # I didn't indent this loop on purpose (to avoid over-indented code)
+    boost_layout_system_search_list=`cd "$boost_dir" 2>/dev/null \
+        && ls -1 | "${GREP}" '^boost-' | sort -rn -t- -k2 \
+        && echo .`
+    for boost_inc in $boost_layout_system_search_list
+    do
+      if test x"$boost_inc" != x.; then
+        boost_inc="$boost_dir/$boost_inc"
+      else
+        boost_inc="$boost_dir" # Uses sentinel in boost_layout_system_search_list
+      fi
+      if test x"$boost_inc" != x; then
+        # We are going to check whether the version of Boost installed
+        # in $boost_inc is usable by running a compilation that
+        # #includes it.  But if we pass a -I/some/path in which Boost
+        # is not installed, the compiler will just skip this -I and
+        # use other locations (either from CPPFLAGS, or from its list
+        # of system include directories).  As a result we would use
+        # header installed on the machine instead of the /some/path
+        # specified by the user.  So in that precise case (trying
+        # $boost_inc), make sure the version.hpp exists.
+        #
+        # Use test -e as there can be symlinks.
+        test -e "$boost_inc/boost/version.hpp" || continue
+        CPPFLAGS="$CPPFLAGS -I$boost_inc"
+      fi
+      AC_COMPILE_IFELSE([], [boost_cv_inc_path=yes], [boost_cv_version=no])
+      if test x"$boost_cv_inc_path" = xyes; then
+        if test x"$boost_inc" != x; then
+          boost_cv_inc_path=$boost_inc
+        fi
+        break 2
+      fi
+    done
+    done
+AC_LANG_POP([C++])dnl
+    ])
+    case $boost_cv_inc_path in #(
+      no)
+        boost_errmsg="cannot find Boost headers version >= $boost_version_req_string"
+        m4_if([$2], [],  [AC_MSG_ERROR([$boost_errmsg])],
+                        [AC_MSG_NOTICE([$boost_errmsg])])
+        $2
+        ;;#(
+      yes)
+        BOOST_CPPFLAGS=
+        ;;#(
+      *)
+        AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"])dnl
+        ;;
+    esac
+  if test x"$boost_cv_inc_path" != xno; then
+  AC_DEFINE([HAVE_BOOST], [1],
+            [Defined if the requested minimum BOOST version is satisfied])
+  AC_CACHE_CHECK([for Boost's header version],
+    [boost_cv_lib_version],
+    [m4_pattern_allow([^BOOST_LIB_VERSION$])dnl
+     _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;q;}],
+                    [#include <boost/version.hpp>
+boost-lib-version = BOOST_LIB_VERSION],
+    [boost_cv_lib_version=`cat conftest.i`])])
+    # e.g. "134" for 1_34_1 or "135" for 1_35
+    boost_major_version=`echo "$boost_cv_lib_version" | sed 's/_//;s/_.*//'`
+    case $boost_major_version in #(
+      '' | *[[!0-9]]*)
+        AC_MSG_ERROR([invalid value: boost_major_version=$boost_major_version])
+        ;;
+    esac
+fi
+CPPFLAGS=$boost_save_CPPFLAGS
+])# BOOST_REQUIRE
+
+
+# BOOST_STATIC()
+# --------------
+# Add the "--enable-static-boost" configure argument.  Static versions of the
+# libraries are looked up when it is given, unless it is disabled (value "no").
+AC_DEFUN([BOOST_STATIC],
+  [AC_ARG_ENABLE([static-boost],
+     [AS_HELP_STRING([--enable-static-boost],
+               [Prefer the static boost libraries over the shared ones [no]])],
+     [AS_IF([test x"$enableval" = xno], [enable_static_boost=no], [enable_static_boost=yes])],
+     [enable_static_boost=no])])# BOOST_STATIC
+
+
+# BOOST_FIND_HEADER([HEADER-NAME], [ACTION-IF-NOT-FOUND], [ACTION-IF-FOUND])
+# --------------------------------------------------------------------------
+# Wrapper around AC_CHECK_HEADER for Boost headers.  Useful to check for
+# some parts of the Boost library which are only made of headers and don't
+# require linking (such as Boost.Foreach).
+#
+# Default ACTION-IF-NOT-FOUND: Fail with a fatal error unless Boost couldn't be
+# found in the first place, in which case by default a notice is issued to the
+# user.  Presumably if we haven't died already it's because it's OK to not have
+# Boost, which is why only a notice is issued instead of a hard error.
+#
+# Default ACTION-IF-FOUND: define the preprocessor symbol HAVE_<HEADER-NAME> in
+# case of success (where HEADER-NAME is written LIKE_THIS, e.g.,
+# HAVE_BOOST_FOREACH_HPP).
+AC_DEFUN([BOOST_FIND_HEADER],
+[AC_REQUIRE([BOOST_REQUIRE])dnl
+if test x"$boost_cv_inc_path" = xno; then
+  m4_default([$2], [AC_MSG_NOTICE([Boost not available, not searching for $1])])
+else
+AC_LANG_PUSH([C++])dnl
+boost_save_CPPFLAGS=$CPPFLAGS
+CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
+AC_CHECK_HEADER([$1],
+  [m4_default([$3], [AC_DEFINE(AS_TR_CPP([HAVE_$1]), [1],
+                               [Define to 1 if you have <$1>])])],
+  [m4_default([$2], [AC_MSG_ERROR([cannot find $1])])])
+CPPFLAGS=$boost_save_CPPFLAGS
+AC_LANG_POP([C++])dnl
+fi
+])# BOOST_FIND_HEADER
+
+
+# BOOST_FIND_LIBS([COMPONENT-NAME], [CANDIDATE-LIB-NAMES],
+#                 [PREFERRED-RT-OPT], [HEADER-NAME], [CXX-TEST],
+#                 [CXX-PROLOGUE])
+# --------------------------------------------------------------
+# Look for the Boost library COMPONENT-NAME (e.g., `thread', for
+# libboost_thread) under the possible CANDIDATE-LIB-NAMES (e.g.,
+# "thread_win32 thread").  Check that HEADER-NAME works and check that
+# libboost_LIB-NAME can link with the code CXX-TEST.  The optional
+# argument CXX-PROLOGUE can be used to include some C++ code before
+# the `main' function.
+#
+# Invokes BOOST_FIND_HEADER([HEADER-NAME]) (see above).
+#
+# Boost libraries typically come compiled with several flavors (with different
+# runtime options) so PREFERRED-RT-OPT is the preferred suffix.  A suffix is one
+# or more of the following letters: sgdpn (in that order).  s = static
+# runtime, d = debug build, g = debug/diagnostic runtime, p = STLPort build,
+# n = (unsure) STLPort build without iostreams from STLPort (it looks like `n'
+# must always be used along with `p').  Additionally, PREFERRED-RT-OPT can
+# start with `mt-' to indicate that there is a preference for multi-thread
+# builds.  Some sample values for PREFERRED-RT-OPT: (nothing), mt, d, mt-d, gdp
+# ...  If you want to make sure you have a specific version of Boost
+# (eg, >= 1.33) you *must* invoke BOOST_REQUIRE before this macro.
+AC_DEFUN([BOOST_FIND_LIBS],
+[AC_REQUIRE([BOOST_REQUIRE])dnl
+AC_REQUIRE([_BOOST_FIND_COMPILER_TAG])dnl
+AC_REQUIRE([BOOST_STATIC])dnl
+AC_REQUIRE([_BOOST_GUESS_WHETHER_TO_USE_MT])dnl
+if test x"$boost_cv_inc_path" = xno; then
+  AC_MSG_NOTICE([Boost not available, not searching for the Boost $1 library])
+else
+dnl The else branch is huge and wasn't indented on purpose.
+AC_LANG_PUSH([C++])dnl
+AS_VAR_PUSHDEF([Boost_lib], [boost_cv_lib_$1])dnl
+AS_VAR_PUSHDEF([Boost_lib_LDFLAGS], [boost_cv_lib_$1_LDFLAGS])dnl
+AS_VAR_PUSHDEF([Boost_lib_LDPATH], [boost_cv_lib_$1_LDPATH])dnl
+AS_VAR_PUSHDEF([Boost_lib_LIBS], [boost_cv_lib_$1_LIBS])dnl
+BOOST_FIND_HEADER([$4])
+boost_save_CPPFLAGS=$CPPFLAGS
+CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
+AC_CACHE_CHECK([for the Boost $1 library], [Boost_lib],
+               [_BOOST_FIND_LIBS($@)])
+case $Boost_lib in #(
+  (no) _AC_MSG_LOG_CONFTEST
+    AC_MSG_ERROR([cannot find the flags to link with Boost $1])
+    ;;
+esac
+AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS])dnl
+AC_SUBST(AS_TR_CPP([BOOST_$1_LDPATH]), [$Boost_lib_LDPATH])dnl
+AC_SUBST([BOOST_LDPATH], [$Boost_lib_LDPATH])dnl
+AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS])dnl
+CPPFLAGS=$boost_save_CPPFLAGS
+AS_VAR_POPDEF([Boost_lib])dnl
+AS_VAR_POPDEF([Boost_lib_LDFLAGS])dnl
+AS_VAR_POPDEF([Boost_lib_LDPATH])dnl
+AS_VAR_POPDEF([Boost_lib_LIBS])dnl
+AC_LANG_POP([C++])dnl
+fi
+])
+
+
+# BOOST_FIND_LIB([LIB-NAME],
+#                [PREFERRED-RT-OPT], [HEADER-NAME], [CXX-TEST],
+#                [CXX-PROLOGUE])
+# --------------------------------------------------------------
+# Backward compatibility wrapper for BOOST_FIND_LIBS: LIB-NAME is passed as both COMPONENT-NAME and the only candidate.
+AC_DEFUN([BOOST_FIND_LIB],
+[BOOST_FIND_LIBS([$1], $@)])
+
+
+# _BOOST_FIND_LIBS([LIB-NAME], [CANDIDATE-LIB-NAMES],
+#                 [PREFERRED-RT-OPT], [HEADER-NAME], [CXX-TEST],
+#                 [CXX-PROLOGUE])
+# --------------------------------------------------------------
+# Real implementation of BOOST_FIND_LIBS: rely on these local macros:
+# Boost_lib, Boost_lib_LDFLAGS, Boost_lib_LDPATH, Boost_lib_LIBS
+#
+# The algorithm is as follows: first look for a given library name
+# according to the user's PREFERRED-RT-OPT.  For each library name, we
+# prefer to use the ones that carry the tag (toolset name).  Each
+# library is searched through the various standard paths where Boost is
+# usually installed.  If we can't find the standard variants, we try
+# to enforce -mt (for instance on MacOSX, libboost_thread.dylib
+# doesn't exist but there's -obviously- libboost_thread-mt.dylib).
+AC_DEFUN([_BOOST_FIND_LIBS],
+[Boost_lib=no
+  case "$3" in #(
+    (mt | mt-) boost_mt=-mt; boost_rtopt=;; #(
+    (mt* | mt-*) boost_mt=-mt; boost_rtopt=`expr "X$3" : 'Xmt-*\(.*\)'`;; #(
+    (*) boost_mt=; boost_rtopt=$3;;
+  esac
+  if test $enable_static_boost = yes; then
+    boost_rtopt="s$boost_rtopt"
+  fi
+  # Find the proper debug variant depending on what we've been asked to find.
+  case $boost_rtopt in #(
+    (*d*) boost_rt_d=$boost_rtopt;; #(
+    (*[[sgpn]]*) # Meant to insert the `d' between `sg' and `pn'; NOTE(review): the sed below emits \1\2 and adds no `d'
+      boost_rt_d=`echo "$boost_rtopt" | sed 's/\(s*g*\)\(p*n*\)/\1\2/'`;; #(
+    (*) boost_rt_d='-d';;
+  esac
+  # If the PREFERRED-RT-OPT are not empty, prepend a `-'.
+  test -n "$boost_rtopt" && boost_rtopt="-$boost_rtopt"
+  $boost_guess_use_mt && boost_mt=-mt
+  # Look for the absolute path of the static archive.
+  # $libext is computed by Libtool but let's make sure it's non empty.
+  test -z "$libext" &&
+    AC_MSG_ERROR([the libext variable is empty, did you invoke Libtool?])
+  boost_save_ac_objext=$ac_objext
+  # Generate the test file.
+  AC_LANG_CONFTEST([AC_LANG_PROGRAM([#include <$4>
+$6], [$5])])
+dnl Optimization hacks: compiling C++ is slow, especially with Boost.  What
+dnl we're trying to do here is guess the right combination of link flags
+dnl (LIBS / LDFLAGS) to use a given library.  This can take several
+dnl iterations before it succeeds and is thus *very* slow.  So what we do
+dnl instead is that we compile the code first (and thus get an object file,
+dnl typically conftest.o).  Then we try various combinations of link flags
+dnl until we succeed to link conftest.o in an executable.  The problem is
+dnl that the various TRY_LINK / COMPILE_IFELSE macros of Autoconf always
+dnl remove all the temporary files including conftest.o.  So the trick here
+dnl is to temporarily change the value of ac_objext so that conftest.o is
+dnl preserved across tests.  This is obviously fragile and I will burn in
+dnl hell for not respecting Autoconf's documented interfaces, but in the
+dnl mean time, it optimizes the macro by a factor of 5 to 30.
+dnl Another small optimization: the first argument of AC_COMPILE_IFELSE left
+dnl empty because the test file is generated only once above (before we
+dnl start the for loops).
+  AC_COMPILE_IFELSE([],
+    [ac_objext=do_not_rm_me_plz],
+    [AC_MSG_ERROR([cannot compile a test that uses Boost $1])])
+  ac_objext=$boost_save_ac_objext
+  boost_failed_libs=
+# Don't bother to indent the following nested for loops, only the 2
+# innermost ones matter.
+for boost_lib_ in $2; do
+for boost_tag_ in -$boost_cv_lib_tag ''; do
+for boost_ver_ in -$boost_cv_lib_version ''; do
+for boost_mt_ in $boost_mt -mt ''; do
+for boost_rtopt_ in $boost_rtopt '' -d; do
+  for boost_lib in \
+    boost_$boost_lib_$boost_tag_$boost_mt_$boost_rtopt_$boost_ver_ \
+    boost_$boost_lib_$boost_tag_$boost_rtopt_$boost_ver_ \
+    boost_$boost_lib_$boost_tag_$boost_mt_$boost_ver_ \
+    boost_$boost_lib_$boost_tag_$boost_ver_
+  do
+    # Avoid testing twice the same lib
+    case $boost_failed_libs in #(
+      (*@$boost_lib@*) continue;;
+    esac
+    # If with_boost is empty, we'll search in /lib first, which is not quite
+    # right so instead we'll try a location based on where the headers are.
+    boost_tmp_lib=$with_boost
+    test x"$with_boost" = x && boost_tmp_lib=${boost_cv_inc_path%/include}
+    for boost_ldpath in "$boost_tmp_lib/lib" '' \
+             /opt/local/lib* /usr/local/lib* /opt/lib* /usr/lib* \
+             "$with_boost" C:/Boost/lib /lib*
+    do
+      # Don't waste time with directories that don't exist.
+      if test x"$boost_ldpath" != x && test ! -e "$boost_ldpath"; then
+        continue
+      fi
+      boost_save_LDFLAGS=$LDFLAGS
+      # Are we looking for a static library?
+      case $boost_ldpath:$boost_rtopt_ in #(
+        (*?*:*s*) # Yes (Non empty boost_ldpath + s in rt opt)
+          Boost_lib_LIBS="$boost_ldpath/lib$boost_lib.$libext"
+          test -e "$Boost_lib_LIBS" || continue;; #(
+        (*) # No: use -lboost_foo to find the shared library.
+          Boost_lib_LIBS="-l$boost_lib";;
+      esac
+      boost_save_LIBS=$LIBS
+      LIBS="$Boost_lib_LIBS $LIBS"
+      test x"$boost_ldpath" != x && LDFLAGS="$LDFLAGS -L$boost_ldpath"
+dnl First argument of AC_LINK_IFELSE left empty because the test file is
+dnl generated only once above (before we start the for loops).
+      _BOOST_AC_LINK_IFELSE([],
+                            [Boost_lib=yes], [Boost_lib=no])
+      ac_objext=$boost_save_ac_objext
+      LDFLAGS=$boost_save_LDFLAGS
+      LIBS=$boost_save_LIBS
+      if test x"$Boost_lib" = xyes; then
+        # Check or use the cached result of whether or not using -R or
+        # -rpath makes sense.  Some implementations of ld, such as for
+        # Mac OSX, require -rpath but -R is the flag known to work on
+        # other systems.  https://github.com/tsuna/boost.m4/issues/19
+        AC_CACHE_VAL([boost_cv_rpath_link_ldflag],
+          [case $boost_ldpath in
+           '') # Nothing to do.
+             boost_cv_rpath_link_ldflag=
+             boost_rpath_link_ldflag_found=yes;;
+           *)
+            for boost_cv_rpath_link_ldflag in -Wl,-R, -Wl,-rpath,; do
+              LDFLAGS="$boost_save_LDFLAGS -L$boost_ldpath $boost_cv_rpath_link_ldflag$boost_ldpath"
+              LIBS="$boost_save_LIBS $Boost_lib_LIBS"
+              _BOOST_AC_LINK_IFELSE([],
+                [boost_rpath_link_ldflag_found=yes
+                break],
+                [boost_rpath_link_ldflag_found=no])
+            done
+            ;;
+          esac
+          AS_IF([test "x$boost_rpath_link_ldflag_found" != "xyes"],
+            [AC_MSG_ERROR([Unable to determine whether to use -R or -rpath])])
+          LDFLAGS=$boost_save_LDFLAGS
+          LIBS=$boost_save_LIBS
+          ])
+        test x"$boost_ldpath" != x &&
+          Boost_lib_LDFLAGS="-L$boost_ldpath $boost_cv_rpath_link_ldflag$boost_ldpath"
+        Boost_lib_LDPATH="$boost_ldpath"
+        break 7
+      else
+        boost_failed_libs="$boost_failed_libs@$boost_lib@"
+      fi
+    done
+  done
+done
+done
+done
+done
+done # boost_lib_
+rm -f conftest.$ac_objext
+])
+
+
+
+# --------------------------------------- #
+# Checks for the various Boost libraries. #
+# --------------------------------------- #
+
+# List of boost libraries: http://www.boost.org/libs/libraries.htm
+# The page http://beta.boost.org/doc/libs is useful: it gives the first release
+# version of each library (among other things).
+
+# BOOST_DEFUN(LIBRARY, CODE)
+# --------------------------
+# Define BOOST_<LIBRARY-UPPERCASE> as a macro that runs CODE with BOOST_Library temporarily defined to LIBRARY.
+#
+# Use indir to avoid the warning on underquoted macro name given to AC_DEFUN.
+m4_define([BOOST_DEFUN],
+[m4_indir([AC_DEFUN],
+          m4_toupper([BOOST_$1]),
+[m4_pushdef([BOOST_Library], [$1])dnl
+$2
+m4_popdef([BOOST_Library])dnl
+])
+])
+
+# BOOST_ARRAY()
+# -------------
+# Look for Boost.Array (only the header boost/array.hpp is checked).
+BOOST_DEFUN([Array],
+[BOOST_FIND_HEADER([boost/array.hpp])])
+
+
+# BOOST_ASIO()
+# ------------
+# Look for Boost.Asio (new in Boost 1.35): requires BOOST_SYSTEM, then checks boost/asio.hpp.
+BOOST_DEFUN([Asio],
+[AC_REQUIRE([BOOST_SYSTEM])dnl
+BOOST_FIND_HEADER([boost/asio.hpp])])
+
+
+# BOOST_BIND()
+# ------------
+# Look for Boost.Bind (only the header boost/bind.hpp is checked).
+BOOST_DEFUN([Bind],
+[BOOST_FIND_HEADER([boost/bind.hpp])])
+
+
+# BOOST_CHRONO([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.Chrono, linking the test against Boost.System when Boost is >= 1.35.
+BOOST_DEFUN([Chrono],
+[# Do we have to check for Boost.System?  This link-time dependency was
+# added as of 1.35.0.  If we have a version <1.35, we must not attempt to
+# find Boost.System as it didn't exist by then.
+if test $boost_major_version -ge 135; then
+  BOOST_SYSTEM([$1])
+fi # end of the Boost.System check.
+boost_filesystem_save_LIBS=$LIBS
+boost_filesystem_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+BOOST_FIND_LIB([chrono], [$1],
+                [boost/chrono.hpp],
+                [boost::chrono::thread_clock d;])
+if test $enable_static_boost = yes && test $boost_major_version -ge 135; then
+  BOOST_CHRONO_LIBS="$BOOST_CHRONO_LIBS $BOOST_SYSTEM_LIBS"
+fi # static Boost: the Chrono link line now also lists the Boost.System libs
+LIBS=$boost_filesystem_save_LIBS
+LDFLAGS=$boost_filesystem_save_LDFLAGS
+])# BOOST_CHRONO
+
+
+# BOOST_CONVERSION()
+# ------------------
+# Look for Boost.Conversion (cast / lexical_cast); both headers are checked.
+BOOST_DEFUN([Conversion],
+[BOOST_FIND_HEADER([boost/cast.hpp])
+BOOST_FIND_HEADER([boost/lexical_cast.hpp])
+])# BOOST_CONVERSION
+
+
+# BOOST_CRC()
+# -----------
+# Look for Boost.CRC (only the header boost/crc.hpp is checked).
+BOOST_DEFUN([CRC],
+[BOOST_FIND_HEADER([boost/crc.hpp])
+])# BOOST_CRC
+
+
+# BOOST_DATE_TIME([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for Boost.Date_Time.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIBS above.
+BOOST_DEFUN([Date_Time],
+[BOOST_FIND_LIB([date_time], [$1],
+                [boost/date_time/posix_time/posix_time.hpp],
+                [boost::posix_time::ptime t;])
+])# BOOST_DATE_TIME
+
+
+# BOOST_FILESYSTEM([PREFERRED-RT-OPT])
+# ------------------------------------
+# Look for Boost.Filesystem.  For the documentation of PREFERRED-RT-OPT, see
+# the documentation of BOOST_FIND_LIBS above.
+# Do not check for boost/filesystem.hpp because this file was introduced in
+# 1.34.
+BOOST_DEFUN([Filesystem],
+[# Do we have to check for Boost.System?  This link-time dependency was
+# added as of 1.35.0.  If we have a version <1.35, we must not attempt to
+# find Boost.System as it didn't exist by then.
+if test $boost_major_version -ge 135; then
+  BOOST_SYSTEM([$1])
+fi # end of the Boost.System check.
+boost_filesystem_save_LIBS=$LIBS
+boost_filesystem_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+BOOST_FIND_LIB([filesystem], [$1],
+                [boost/filesystem/path.hpp], [boost::filesystem::path p;])
+if test $enable_static_boost = yes && test $boost_major_version -ge 135; then
+  BOOST_FILESYSTEM_LIBS="$BOOST_FILESYSTEM_LIBS $BOOST_SYSTEM_LIBS"
+fi # static Boost: the Filesystem link line now also lists the Boost.System libs
+LIBS=$boost_filesystem_save_LIBS
+LDFLAGS=$boost_filesystem_save_LDFLAGS
+])# BOOST_FILESYSTEM
+
+
+# BOOST_FLYWEIGHT()
+# -----------------
+# Look for Boost.Flyweight; BOOST_FLYWEIGHT_LIBS is substituted with the detected pthread flag.
+BOOST_DEFUN([Flyweight],
+[dnl There's a hidden dependency on pthreads.
+AC_REQUIRE([_BOOST_PTHREAD_FLAG])dnl
+BOOST_FIND_HEADER([boost/flyweight.hpp])
+AC_SUBST([BOOST_FLYWEIGHT_LIBS], [$boost_cv_pthread_flag])
+])
+
+
+# BOOST_FOREACH()
+# ---------------
+# Look for Boost.Foreach (only the header boost/foreach.hpp is checked).
+BOOST_DEFUN([Foreach],
+[BOOST_FIND_HEADER([boost/foreach.hpp])])
+
+
+# BOOST_FORMAT()
+# --------------
+# Look for Boost.Format (only the header boost/format.hpp is checked).
+# Note: we can't check for boost/format/format_fwd.hpp because the header isn't
+# standalone.  It can't be compiled because it triggers the following error:
+# boost/format/detail/config_macros.hpp:88: error: 'locale' in namespace 'std'
+#                                                  does not name a type
+BOOST_DEFUN([Format],
+[BOOST_FIND_HEADER([boost/format.hpp])])
+
+
+# BOOST_FUNCTION()
+# ----------------
+# Look for Boost.Function (only the header boost/function.hpp is checked).
+BOOST_DEFUN([Function],
+[BOOST_FIND_HEADER([boost/function.hpp])])
+
+
+# BOOST_GEOMETRY()
+# ----------------
+# Look for Boost.Geometry (new since 1.47.0); only boost/geometry.hpp is checked.
+BOOST_DEFUN([Geometry],
+[BOOST_FIND_HEADER([boost/geometry.hpp])
+])# BOOST_GEOMETRY
+
+
+# BOOST_GRAPH([PREFERRED-RT-OPT])
+# -------------------------------
+# Look for Boost.Graph.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIBS above.
+BOOST_DEFUN([Graph],
+[BOOST_FIND_LIB([graph], [$1],
+                [boost/graph/adjacency_list.hpp], [boost::adjacency_list<> g;])
+])# BOOST_GRAPH
+
+
+# BOOST_IOSTREAMS([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for Boost.IOStreams.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIBS above.
+BOOST_DEFUN([IOStreams],
+[BOOST_FIND_LIB([iostreams], [$1],
+                [boost/iostreams/device/file_descriptor.hpp],
+                [boost::iostreams::file_descriptor fd; fd.close();])
+])# BOOST_IOSTREAMS
+
+
+# BOOST_HASH()
+# ------------
+# Look for Boost.Functional/Hash (only the header boost/functional/hash.hpp is checked).
+BOOST_DEFUN([Hash],
+[BOOST_FIND_HEADER([boost/functional/hash.hpp])])
+
+
+# BOOST_LAMBDA()
+# --------------
+# Look for Boost.Lambda (only the header boost/lambda/lambda.hpp is checked).
+BOOST_DEFUN([Lambda],
+[BOOST_FIND_HEADER([boost/lambda/lambda.hpp])])
+
+
+# BOOST_LOG([PREFERRED-RT-OPT])
+# -----------------------------
+# Look for Boost.Log.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIBS above.
+BOOST_DEFUN([Log],
+[BOOST_FIND_LIB([log], [$1],
+    [boost/log/core/core.hpp],
+    [boost::log::attribute a; a.get_value();])
+])# BOOST_LOG
+
+
+# BOOST_LOG_SETUP([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for the Boost.Log setup library (requires BOOST_LOG first).  For the
+# documentation of PREFERRED-RT-OPT, see the documentation of BOOST_FIND_LIBS above.
+BOOST_DEFUN([Log_Setup],
+[AC_REQUIRE([BOOST_LOG])dnl
+BOOST_FIND_LIB([log_setup], [$1],
+    [boost/log/utility/setup/from_settings.hpp],
+    [boost::log::basic_settings<char> bs; bs.empty();])
+])# BOOST_LOG_SETUP
+
+
+# BOOST_MATH()
+# ------------
+# Look for Boost.Math
+# TODO: This library isn't header-only but it comes in multiple different
+# flavors that don't play well with BOOST_FIND_LIB (e.g., libboost_math_c99,
+# libboost_math_c99f, libboost_math_c99l, libboost_math_tr1,
+# libboost_math_tr1f, libboost_math_tr1l).  This macro must be fixed to do the
+# right thing anyway.
+BOOST_DEFUN([Math],
+[BOOST_FIND_HEADER([boost/math/special_functions.hpp])])
+
+
+# BOOST_MPI([PREFERRED-RT-OPT])
+# -------------------------------
+# Look for Boost MPI.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.  Uses MPICXX variable if it is
+# set, otherwise tries CXX
+#
+BOOST_DEFUN([MPI],
+[boost_save_CXX=${CXX}
+boost_save_CXXCPP=${CXXCPP}
+if test x"${MPICXX}" != x; then
+  CXX=${MPICXX}
+  CXXCPP="${MPICXX} -E"
+fi
+BOOST_FIND_LIB([mpi], [$1],
+               [boost/mpi.hpp],
+               [int argc = 0;
+                char **argv = 0;
+                boost::mpi::environment env(argc,argv);])
+CXX=${boost_save_CXX}
+CXXCPP=${boost_save_CXXCPP}
+])# BOOST_MPI
+
+
+# BOOST_MULTIARRAY()
+# ------------------
+# Look for Boost.MultiArray
+BOOST_DEFUN([MultiArray],
+[BOOST_FIND_HEADER([boost/multi_array.hpp])])
+
+
+# BOOST_NUMERIC_UBLAS()
+# --------------------------
+# Look for Boost.NumericUblas (Basic Linear Algebra)
+BOOST_DEFUN([Numeric_Ublas],
+[BOOST_FIND_HEADER([boost/numeric/ublas/vector.hpp])
+])# BOOST_NUMERIC_UBLAS
+
+
+# BOOST_NUMERIC_CONVERSION()
+# --------------------------
+# Look for Boost.NumericConversion (policy-based numeric conversion)
+BOOST_DEFUN([Numeric_Conversion],
+[BOOST_FIND_HEADER([boost/numeric/conversion/converter.hpp])
+])# BOOST_NUMERIC_CONVERSION
+
+
+# BOOST_OPTIONAL()
+# ----------------
+# Look for Boost.Optional
+BOOST_DEFUN([Optional],
+[BOOST_FIND_HEADER([boost/optional.hpp])])
+
+
+# BOOST_PREPROCESSOR()
+# --------------------
+# Look for Boost.Preprocessor
+BOOST_DEFUN([Preprocessor],
+[BOOST_FIND_HEADER([boost/preprocessor/repeat.hpp])])
+
+
+# BOOST_UNORDERED()
+# -----------------
+# Look for Boost.Unordered
+BOOST_DEFUN([Unordered],
+[BOOST_FIND_HEADER([boost/unordered_map.hpp])])
+
+
+# BOOST_UUID()
+# ------------
+# Look for Boost.Uuid
+BOOST_DEFUN([Uuid],
+[BOOST_FIND_HEADER([boost/uuid/uuid.hpp])])
+
+
+# BOOST_PROGRAM_OPTIONS([PREFERRED-RT-OPT])
+# -----------------------------------------
+# Look for Boost.Program_options.  For the documentation of PREFERRED-RT-OPT,
+# see the documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Program_Options],
+[BOOST_FIND_LIB([program_options], [$1],
+                [boost/program_options.hpp],
+                [boost::program_options::options_description d("test");])
+])# BOOST_PROGRAM_OPTIONS
+
+
+
+# _BOOST_PYTHON_CONFIG(VARIABLE, FLAG)
+# ------------------------------------
+# Save VARIABLE, and define it via `python-config --FLAG`.
+# Substitute BOOST_PYTHON_VARIABLE.
+m4_define([_BOOST_PYTHON_CONFIG],
+[AC_SUBST([BOOST_PYTHON_$1],
+          [`python-config --$2 2>/dev/null`])dnl
+boost_python_save_$1=$$1
+$1="$$1 $BOOST_PYTHON_$1"])
+
+
+# BOOST_PYTHON([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.Python.  For the documentation of PREFERRED-RT-OPT,
+# see the documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Python],
+[_BOOST_PYTHON_CONFIG([CPPFLAGS], [includes])
+_BOOST_PYTHON_CONFIG([LDFLAGS],   [ldflags])
+_BOOST_PYTHON_CONFIG([LIBS],      [libs])
+m4_pattern_allow([^BOOST_PYTHON_MODULE$])dnl
+BOOST_FIND_LIBS([python], [python python3], [$1],
+                [boost/python.hpp],
+                [], [BOOST_PYTHON_MODULE(empty) {}])
+CPPFLAGS=$boost_python_save_CPPFLAGS
+LDFLAGS=$boost_python_save_LDFLAGS
+LIBS=$boost_python_save_LIBS
+])# BOOST_PYTHON
+
+
+# BOOST_REF()
+# -----------
+# Look for Boost.Ref
+BOOST_DEFUN([Ref],
+[BOOST_FIND_HEADER([boost/ref.hpp])])
+
+
+# BOOST_REGEX([PREFERRED-RT-OPT])
+# -------------------------------
+# Look for Boost.Regex.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Regex],
+[BOOST_FIND_LIB([regex], [$1],
+                [boost/regex.hpp],
+                [boost::regex exp("*"); boost::regex_match("foo", exp);])
+])# BOOST_REGEX
+
+
+# BOOST_SERIALIZATION([PREFERRED-RT-OPT])
+# ---------------------------------------
+# Look for Boost.Serialization.  For the documentation of PREFERRED-RT-OPT, see
+# the documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Serialization],
+[BOOST_FIND_LIB([serialization], [$1],
+                [boost/archive/text_oarchive.hpp],
+                [std::ostream* o = 0; // Cheap way to get an ostream...
+                boost::archive::text_oarchive t(*o);])
+])# BOOST_SERIALIZATION
+
+
+# BOOST_SIGNALS([PREFERRED-RT-OPT])
+# ---------------------------------
+# Look for Boost.Signals.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Signals],
+[BOOST_FIND_LIB([signals], [$1],
+                [boost/signal.hpp],
+                [boost::signal<void ()> s;])
+])# BOOST_SIGNALS
+
+
+# BOOST_SIGNALS2()
+# ----------------
+# Look for Boost.Signals2 (new since 1.39.0).
+BOOST_DEFUN([Signals2],
+[BOOST_FIND_HEADER([boost/signals2.hpp])
+])# BOOST_SIGNALS2
+
+
+# BOOST_SMART_PTR()
+# -----------------
+# Look for Boost.SmartPtr
+BOOST_DEFUN([Smart_Ptr],
+[BOOST_FIND_HEADER([boost/scoped_ptr.hpp])
+BOOST_FIND_HEADER([boost/shared_ptr.hpp])
+])
+
+
+# BOOST_STATICASSERT()
+# --------------------
+# Look for Boost.StaticAssert
+BOOST_DEFUN([StaticAssert],
+[BOOST_FIND_HEADER([boost/static_assert.hpp])])
+
+
+# BOOST_STRING_ALGO()
+# -------------------
+# Look for Boost.StringAlgo
+BOOST_DEFUN([String_Algo],
+[BOOST_FIND_HEADER([boost/algorithm/string.hpp])
+])
+
+
+# BOOST_SYSTEM([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.System.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.  This library was introduced in Boost
+# 1.35.0.
+BOOST_DEFUN([System],
+[BOOST_FIND_LIB([system], [$1],
+                [boost/system/error_code.hpp],
+                [boost::system::error_code e; e.clear();])
+])# BOOST_SYSTEM
+
+
+# BOOST_TEST([PREFERRED-RT-OPT])
+# ------------------------------
+# Look for Boost.Test.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Test],
+[m4_pattern_allow([^BOOST_CHECK$])dnl
+BOOST_FIND_LIB([unit_test_framework], [$1],
+               [boost/test/unit_test.hpp], [BOOST_CHECK(2 == 2);],
+               [using boost::unit_test::test_suite;
+               test_suite* init_unit_test_suite(int argc, char ** argv)
+               { return NULL; }])
+])# BOOST_TEST
+
+
+# BOOST_THREAD([PREFERRED-RT-OPT])
+# ---------------------------------
+# Look for Boost.Thread.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Thread],
+[dnl Having the pthread flag is required at least on GCC3 where
+dnl boost/thread.hpp would complain if we try to compile without
+dnl -pthread on GNU/Linux.
+AC_REQUIRE([_BOOST_PTHREAD_FLAG])dnl
+boost_thread_save_LIBS=$LIBS
+boost_thread_save_LDFLAGS=$LDFLAGS
+boost_thread_save_CPPFLAGS=$CPPFLAGS
+# Link-time dependency from thread to system was added as of 1.49.0.
+if test $boost_major_version -ge 149; then
+BOOST_SYSTEM([$1])
+fi # end of the Boost.System check.
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+CPPFLAGS="$CPPFLAGS $boost_cv_pthread_flag"
+
+# When compiling for the Windows platform, the threads library is named
+# differently.
+case $host_os in
+  (*mingw*) boost_thread_lib_ext=_win32;;
+esac
+BOOST_FIND_LIBS([thread], [thread$boost_thread_lib_ext],
+                [$1],
+                [boost/thread.hpp], [boost::thread t; boost::mutex m;])
+# Add the Boost.System and pthread flags to whatever was found for thread.
+BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $BOOST_SYSTEM_LIBS $boost_cv_pthread_flag"
+BOOST_THREAD_LDFLAGS="$BOOST_THREAD_LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+BOOST_CPPFLAGS="$BOOST_CPPFLAGS $boost_cv_pthread_flag"
+LIBS=$boost_thread_save_LIBS
+LDFLAGS=$boost_thread_save_LDFLAGS
+CPPFLAGS=$boost_thread_save_CPPFLAGS
+])# BOOST_THREAD
+
+AU_ALIAS([BOOST_THREADS], [BOOST_THREAD])
+
+
+# BOOST_TOKENIZER()
+# -----------------
+# Look for Boost.Tokenizer
+BOOST_DEFUN([Tokenizer],
+[BOOST_FIND_HEADER([boost/tokenizer.hpp])])
+
+
+# BOOST_TRIBOOL()
+# ---------------
+# Look for Boost.Tribool
+BOOST_DEFUN([Tribool],
+[BOOST_FIND_HEADER([boost/logic/tribool_fwd.hpp])
+BOOST_FIND_HEADER([boost/logic/tribool.hpp])
+])
+
+
+# BOOST_TUPLE()
+# -------------
+# Look for Boost.Tuple
+BOOST_DEFUN([Tuple],
+[BOOST_FIND_HEADER([boost/tuple/tuple.hpp])])
+
+
+# BOOST_TYPETRAITS()
+# --------------------
+# Look for Boost.TypeTraits
+BOOST_DEFUN([TypeTraits],
+[BOOST_FIND_HEADER([boost/type_traits.hpp])])
+
+
+# BOOST_UTILITY()
+# ---------------
+# Look for Boost.Utility (noncopyable, result_of, base-from-member idiom,
+# etc.)
+BOOST_DEFUN([Utility],
+[BOOST_FIND_HEADER([boost/utility.hpp])])
+
+
+# BOOST_VARIANT()
+# ---------------
+# Look for Boost.Variant.
+BOOST_DEFUN([Variant],
+[BOOST_FIND_HEADER([boost/variant/variant_fwd.hpp])
+BOOST_FIND_HEADER([boost/variant.hpp])])
+
+
+# BOOST_POINTER_CONTAINER()
+# ------------------------
+# Look for Boost.PointerContainer
+BOOST_DEFUN([Pointer_Container],
+[BOOST_FIND_HEADER([boost/ptr_container/ptr_deque.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_list.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_vector.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_array.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_set.hpp])
+BOOST_FIND_HEADER([boost/ptr_container/ptr_map.hpp])
+])# BOOST_POINTER_CONTAINER
+
+
+# BOOST_WAVE([PREFERRED-RT-OPT])
+# ------------------------------
+# NOTE: If you intend to use Wave/Spirit with thread support, make sure you
+# call BOOST_THREAD first.
+# Look for Boost.Wave.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+BOOST_DEFUN([Wave],
+[AC_REQUIRE([BOOST_FILESYSTEM])dnl
+AC_REQUIRE([BOOST_DATE_TIME])dnl
+boost_wave_save_LIBS=$LIBS
+boost_wave_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_((FILE)?SYSTEM|DATE_TIME|THREAD)_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS \
+$BOOST_THREAD_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS \
+$BOOST_DATE_TIME_LDFLAGS $BOOST_THREAD_LDFLAGS"
+BOOST_FIND_LIB([wave], [$1],
+                [boost/wave.hpp],
+                [boost::wave::token_id id; get_token_name(id);])
+LIBS=$boost_wave_save_LIBS
+LDFLAGS=$boost_wave_save_LDFLAGS
+])# BOOST_WAVE
+
+
+# BOOST_XPRESSIVE()
+# -----------------
+# Look for Boost.Xpressive (new since 1.36.0).
+BOOST_DEFUN([Xpressive],
+[BOOST_FIND_HEADER([boost/xpressive/xpressive.hpp])])
+
+
+# ----------------- #
+# Internal helpers. #
+# ----------------- #
+
+
+# _BOOST_PTHREAD_FLAG()
+# ---------------------
+# Internal helper for BOOST_THREAD.  Computes boost_cv_pthread_flag
+# which must be used in CPPFLAGS and LIBS.
+#
+# Yes, we *need* to put the -pthread thing in CPPFLAGS because with GCC3,
+# boost/thread.hpp will trigger a #error if -pthread isn't used:
+#   boost/config/requires_threads.hpp:47:5: #error "Compiler threading support
+#   is not turned on. Please set the correct command line options for
+#   threading: -pthread (Linux), -pthreads (Solaris) or -mthreads (Mingw32)"
+#
+# Based on ACX_PTHREAD: http://autoconf-archive.cryp.to/acx_pthread.html
+AC_DEFUN([_BOOST_PTHREAD_FLAG],
+[AC_REQUIRE([AC_PROG_CXX])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_LANG_PUSH([C++])dnl
+AC_CACHE_CHECK([for the flags needed to use pthreads], [boost_cv_pthread_flag],
+[ boost_cv_pthread_flag=
+  # The ordering *is* (sometimes) important.  Some notes on the
+  # individual items follow:
+  # (none): in case threads are in libc; should be tried before -Kthread and
+  #       other compiler flags to prevent continual compiler warnings
+  # -lpthreads: AIX (must check this before -lpthread)
+  # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+  # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+  # -llthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+  # -pthread: GNU Linux/GCC (kernel threads), BSD/GCC (userland threads)
+  # -pthreads: Solaris/GCC
+  # -mthreads: MinGW32/GCC, Lynx/GCC
+  # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+  #      doesn't hurt to check since this sometimes defines pthreads too;
+  #      also defines -D_REENTRANT)
+  #      ... -mt is also the pthreads flag for HP/aCC
+  # -lpthread: GNU Linux, etc.
+  # --thread-safe: KAI C++
+  case $host_os in #(
+    *solaris*)
+      # On Solaris (at least, for some versions), libc contains stubbed
+      # (non-functional) versions of the pthreads routines, so link-based
+      # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
+      # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
+      # a function called by this macro, so we could check for that, but
+      # who knows whether they'll stub that too in a future libc.)  So,
+      # we'll just look for -pthreads and -lpthread first:
+      boost_pthread_flags="-pthreads -lpthread -mt -pthread";; #(
+    *)
+      boost_pthread_flags="-lpthreads -Kthread -kthread -llthread -pthread \
+                           -pthreads -mthreads -lpthread --thread-safe -mt";;
+  esac
+  # Generate the test file.
+  AC_LANG_CONFTEST([AC_LANG_PROGRAM([#include <pthread.h>],
+    [pthread_t th; pthread_join(th, 0);
+    pthread_attr_init(0); pthread_cleanup_push(0, 0);
+    pthread_create(0,0,0,0); pthread_cleanup_pop(0);])])
+  for boost_pthread_flag in '' $boost_pthread_flags; do
+    boost_pthread_ok=false
+dnl Re-use the test file already generated.
+    boost_pthreads__save_LIBS=$LIBS
+    LIBS="$LIBS $boost_pthread_flag"
+    AC_LINK_IFELSE([],
+      [if grep ".*$boost_pthread_flag" conftest.err >/dev/null; then
+         echo "This flag seems to have triggered warnings" >&AS_MESSAGE_LOG_FD
+       else
+         boost_pthread_ok=:; boost_cv_pthread_flag=$boost_pthread_flag
+       fi])
+    LIBS=$boost_pthreads__save_LIBS
+    $boost_pthread_ok && break
+  done
+])
+AC_LANG_POP([C++])dnl
+])# _BOOST_PTHREAD_FLAG
+
+
+# _BOOST_gcc_test(MAJOR, MINOR)
+# -----------------------------
+# Internal helper for _BOOST_FIND_COMPILER_TAG.
+m4_define([_BOOST_gcc_test],
+["defined __GNUC__ && __GNUC__ == $1 && __GNUC_MINOR__ == $2 && !defined __ICC @ gcc$1$2"])dnl
+
+# _BOOST_mingw_test(MAJOR, MINOR)
+# -----------------------------
+# Internal helper for _BOOST_FIND_COMPILER_TAG.
+m4_define([_BOOST_mingw_test],
+["defined __GNUC__ && __GNUC__ == $1 && __GNUC_MINOR__ == $2 && !defined __ICC && \
+  (defined WIN32 || defined WINNT || defined _WIN32 || defined __WIN32 \
+         || defined __WIN32__ || defined __WINNT || defined __WINNT__) @ mgw$1$2"])dnl
+
+
+# _BOOST_FIND_COMPILER_TAG()
+# --------------------------
+# Internal.  When Boost is installed without --layout=system, each library
+# filename will hold a suffix that encodes the compiler used during the
+# build.  The Boost build system seems to call this a `tag'.
+AC_DEFUN([_BOOST_FIND_COMPILER_TAG],
+[AC_REQUIRE([AC_PROG_CXX])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag],
+[boost_cv_lib_tag=unknown
+if test x$boost_cv_inc_path != xno; then
+  AC_LANG_PUSH([C++])dnl
+  # The following tests are mostly inspired by boost/config/auto_link.hpp
+  # The list is sorted to most recent/common to oldest compiler (in order
+  # to increase the likelihood of finding the right compiler with the
+  # least number of compilation attempts).
+  # Beware that some tests are sensitive to the order (for instance, we must
+  # look for MinGW before looking for GCC3).
+  # I used one compilation test per compiler with a #error to recognize
+  # each compiler so that it works even when cross-compiling (let me know
+  # if you know a better approach).
+  # Known missing tags (known from Boost's tools/build/v2/tools/common.jam):
+  #   como, edg, kcc, bck, mp, sw, tru, xlc
+  # I'm not sure about my test for `il' (be careful: Intel's ICC pre-defines
+  # the same defines as GCC's).
+  for i in \
+    _BOOST_mingw_test(4,8) \
+    _BOOST_gcc_test(4, 8) \
+    _BOOST_mingw_test(4,7) \
+    _BOOST_gcc_test(4, 7) \
+    _BOOST_mingw_test(4,6) \
+    _BOOST_gcc_test(4, 6) \
+    _BOOST_mingw_test(4,5) \
+    _BOOST_gcc_test(4, 5) \
+    _BOOST_mingw_test(4,4) \
+    _BOOST_gcc_test(4, 4) \
+    _BOOST_mingw_test(4,3) \
+    _BOOST_gcc_test(4, 3) \
+    _BOOST_mingw_test(4,2) \
+    _BOOST_gcc_test(4, 2) \
+    _BOOST_mingw_test(4,1) \
+    _BOOST_gcc_test(4, 1) \
+    _BOOST_mingw_test(4,0) \
+    _BOOST_gcc_test(4, 0) \
+    "defined __GNUC__ && __GNUC__ == 3 && !defined __ICC \
+     && (defined WIN32 || defined WINNT || defined _WIN32 || defined __WIN32 \
+         || defined __WIN32__ || defined __WINNT || defined __WINNT__) @ mgw" \
+    _BOOST_gcc_test(3, 4) \
+    _BOOST_gcc_test(3, 3) \
+    "defined _MSC_VER && _MSC_VER >= 1500 @ vc90" \
+    "defined _MSC_VER && _MSC_VER == 1400 @ vc80" \
+    _BOOST_gcc_test(3, 2) \
+    "defined _MSC_VER && _MSC_VER == 1310 @ vc71" \
+    _BOOST_gcc_test(3, 1) \
+    _BOOST_gcc_test(3, 0) \
+    "defined __BORLANDC__ @ bcb" \
+    "defined __ICC && (defined __unix || defined __unix__) @ il" \
+    "defined __ICL @ iw" \
+    "defined _MSC_VER && _MSC_VER == 1300 @ vc7" \
+    _BOOST_gcc_test(2, 95) \
+    "defined __MWERKS__ && __MWERKS__ <= 0x32FF @ cw9" \
+    "defined _MSC_VER && _MSC_VER < 1300 && !defined UNDER_CE @ vc6" \
+    "defined _MSC_VER && _MSC_VER < 1300 && defined UNDER_CE @ evc4" \
+    "defined __MWERKS__ && __MWERKS__ <= 0x31FF @ cw8"
+  do
+    boost_tag_test=`expr "X$i" : 'X\([[^@]]*\) @ '`
+    boost_tag=`expr "X$i" : 'X[[^@]]* @ \(.*\)'`
+    AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#if $boost_tag_test
+/* OK */
+#else
+# error $boost_tag_test
+#endif
+]])], [boost_cv_lib_tag=$boost_tag; break], [])
+  done
+AC_LANG_POP([C++])dnl
+  case $boost_cv_lib_tag in #(
+    # Some newer (>= 1.35?) versions of Boost seem to only use "gcc" as opposed
+    # to "gcc41" for instance.
+    *-gcc | *'-gcc ') :;; #(  Don't re-add -gcc: it's already in there.
+    gcc*)
+      boost_tag_x=
+      case $host_os in #(
+        darwin*)
+          if test $boost_major_version -ge 136; then
+            # The `x' added in r46793 of Boost.
+            boost_tag_x=x
+          fi;;
+      esac
+      # We can specify multiple tags in this variable because it's used by
+      # BOOST_FIND_LIB that does a `for tag in -$boost_cv_lib_tag' ...
+      boost_cv_lib_tag="$boost_tag_x$boost_cv_lib_tag -${boost_tag_x}gcc"
+      ;; #(
+    unknown)
+      AC_MSG_WARN([[could not figure out which toolset name to use for $CXX]])
+      boost_cv_lib_tag=
+      ;;
+  esac
+fi])dnl end of AC_CACHE_CHECK
+])# _BOOST_FIND_COMPILER_TAG
+
+
+# _BOOST_GUESS_WHETHER_TO_USE_MT()
+# --------------------------------
+# Compile a small test to try to guess whether we should favor MT (Multi
+# Thread) flavors of Boost.  Sets boost_guess_use_mt accordingly.
+AC_DEFUN([_BOOST_GUESS_WHETHER_TO_USE_MT],
+[# Check whether we had better use `mt' even though we weren't asked to.
+AC_LANG_PUSH([C++])dnl
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#if defined _REENTRANT || defined _MT || defined __MT__
+/* use -mt */
+#else
+# error MT not needed
+#endif
+]])], [boost_guess_use_mt=:], [boost_guess_use_mt=false])
+AC_LANG_POP([C++])dnl
+])
+
+# _BOOST_AC_LINK_IFELSE(PROGRAM, [ACTION-IF-TRUE], [ACTION-IF-FALSE])
+# -------------------------------------------------------------------
+# Fork of _AC_LINK_IFELSE that preserves conftest.o across calls.  Fragile,
+# will break when Autoconf changes its internals.  Requires that you manually
+# rm -f conftest.$ac_objext in between two really different tests, otherwise
+# you will try to link a conftest.o left behind by a previous test.
+# Used to aggressively optimize BOOST_FIND_LIB (see the big comment in this
+# macro).
+#
+# Don't use "break" in the actions, as it would short-circuit some code
+# this macro runs after the actions.
+m4_define([_BOOST_AC_LINK_IFELSE],
+[m4_ifvaln([$1], [AC_LANG_CONFTEST([$1])])dnl
+rm -f conftest$ac_exeext
+boost_save_ac_ext=$ac_ext
+boost_use_source=:
+# If we already have a .o, re-use it.  We change $ac_ext so that $ac_link
+# tries to link the existing object file instead of compiling from source.
+test -f conftest.$ac_objext && ac_ext=$ac_objext && boost_use_source=false &&
+  _AS_ECHO_LOG([re-using the existing conftest.$ac_objext])
+AS_IF([_AC_DO_STDERR($ac_link) && {
+         test -z "$ac_[]_AC_LANG_ABBREV[]_werror_flag" ||
+         test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+         test "$cross_compiling" = yes ||
+         $as_executable_p conftest$ac_exeext
+dnl FIXME: use AS_TEST_X instead when 2.61 is widespread enough.
+       }],
+      [$2],
+      [if $boost_use_source; then
+         _AC_MSG_LOG_CONFTEST
+       fi
+       $3])
+ac_objext=$boost_save_ac_objext
+ac_ext=$boost_save_ac_ext
+dnl Delete also the IPA/IPO (Inter Procedural Analysis/Optimization)
+dnl information created by the PGI compiler (conftest_ipa8_conftest.oo),
+dnl as it would interfere with the next link command.
+rm -f core conftest.err conftest_ipa8_conftest.oo \
+      conftest$ac_exeext m4_ifval([$1], [conftest.$ac_ext])[]dnl
+])# _BOOST_AC_LINK_IFELSE
+
+# Local Variables:
+# mode: autoconf
+# End:
diff --git a/man/M6502_delete.3 b/man/M6502_delete.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_delete.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_disassemble.3 b/man/M6502_disassemble.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_disassemble.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_dump.3 b/man/M6502_dump.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_dump.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_getCallback.3 b/man/M6502_getCallback.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_getCallback.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_getVector.3 b/man/M6502_getVector.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_getVector.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_irq.3 b/man/M6502_irq.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_irq.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_new.3 b/man/M6502_new.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_new.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_nmi.3 b/man/M6502_nmi.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_nmi.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_reset.3 b/man/M6502_reset.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_reset.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_run.3 b/man/M6502_run.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_run.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_setCallback.3 b/man/M6502_setCallback.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_setCallback.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_setMode.3 b/man/M6502_setMode.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_setMode.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/M6502_setVector.3 b/man/M6502_setVector.3
new file mode 100644
index 0000000..4bd1ff4
--- /dev/null
+++ b/man/M6502_setVector.3
@@ -0,0 +1 @@
+.so man3/lib6502.3
diff --git a/man/lib6502.3 b/man/lib6502.3
new file mode 100644
index 0000000..4551a3d
--- /dev/null
+++ b/man/lib6502.3
@@ -0,0 +1,555 @@
+.\" Copyright (c) 2005 Ian Piumarta
+.\" Copyright (c) 2014 Steven Flintham
+.\" 
+.\" Permission is hereby granted, free of charge, to any person
+.\" obtaining a copy of this software and associated documentation
+.\" files (the 'Software'), to deal in the Software without
+.\" restriction, including without limitation the rights to use, copy,
+.\" modify, merge, publish, distribute, and/or sell copies of the
+.\" Software, and to permit persons to whom the Software is furnished
+.\" to do so, provided that the above copyright notice(s) and this
+.\" permission notice appear in all copies of the Software and that
+.\" both the above copyright notice(s) and this permission notice
+.\" appear in supporting documentation.
+.\" 
+.\" THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+.\"
+.Dd June 7, 2014
+.Dt LIB6502 3 LOCAL
+.Os ""
+.\" ----------------------------------------------------------------
+.Sh NAME
+.\"
+.Nm lib6502
+.Nd 6502 microprocessor emulator
+.\" ----------------------------------------------------------------
+.Sh SYNOPSIS
+.\"
+.In stdint.h
+.In lib6502.h
+.Ft M6502 *
+.Fn M6502_new "M6502_Registers *registers" "M6502_Memory memory" "M6502_Callbacks *callbacks"
+.Ft void
+.Fn M6502_reset "M6502 *mpu"
+.Ft void
+.Fn M6502_nmi "M6502 *mpu"
+.Ft void
+.Fn M6502_irq "M6502 *mpu"
+.Ft uint16_t
+.Fn M6502_getVector "M6502 *mpu" "vector"
+.Ft uint16_t
+.Fn M6502_setVector "M6502 *mpu" "vector" "uint16_t address"
+.Ft M6502_Callback
+.Fn M6502_getCallback "M6502 *mpu" "type" "uint16_t address"
+.Ft M6502_Callback
+.Fn M6502_setCallback "M6502 *mpu" "type" "uint16_t address" "M6502_Callback callback"
+.Ft void
+.Fn M6502_run "M6502 *mpu"
+.Ft int
+.Fn M6502_disassemble "M6502 *mpu" "uint16_t address" "char buffer[64]"
+.Ft void
+.Fn M6502_dump "M6502 *mpu" "char buffer[64]"
+.Ft void
+.Fn M6502_delete "M6502 *mpu"
+.Ft void
+.Fn M6502_setMode "M6502 *mpu" "M6502_Mode mode" "int arg"
+.\" ----------------------------------------------------------------
+.Sh DESCRIPTION
+.\"
+.Fn M6502_new
+creates an instance of a 6502 microprocessor.
+.Fn M6502_reset ,
+.Fn M6502_nmi
+and 
+.Fn M6502_irq
+place it into the states associated with the hardware signals for
+reset, non-maskable interrupt and interrupt request, respectively.
+The macros
+.Fn M6502_getVector
+and
+.Fn M6502_setVector
+read and write the vectors through which the processor jumps in
+response to the above signals.  The macros
+.Fn M6502_getCallback
+and
+.Fn M6502_setCallback
+read and write client-supplied functions that intercept accesses to
+memory.
+.Fn M6502_run
+begins emulated execution.
+.Fn M6502_dump
+and
+.Fn M6502_disassemble
+create human-readable representations of processor or memory state.
+.Fn M6502_delete
+frees all resources associated with a processor instance.  
+.Fn M6502_setMode
+specifies the emulation mode to use for a processor instance.  Each of
+these functions and macros is described in more detail below.
+.Pp
+.Fn M6502_new
+returns a pointer to a
+.Fa M6502
+structure containing at least the following members:
+.Bd -literal
+struct _M6502
+{
+    M6502_Registers  *registers;   /* processor state */
+    uint8_t          *memory;      /* memory image */
+    M6502_Callbacks  *callbacks;   /* r/w/x/illegal callbacks */
+};
+.Ed
+.Pp
+These members are initialised according to the supplied
+.Fa registers ,
+.Fa memory
+and
+.Fa callbacks
+arguments.  If a given argument is NULL, the corresponding member is
+initialised automatically with a suitable (non-NULL) value.
+.Pp
+The members of
+.Fa M6502
+are as follows:
+.Bl -tag -width ".Fa callbacks"
+.It Fa registers
+the processor state, containing all registers and condition codes.
+.It Fa memory
+a block of at least 64 kilobytes of storage containing the processor's
+memory.  (An array type
+.Vt M6502_Memory ,
+suitable for defining values to pass as the
+.Fa memory
+argument, is defined in the
+.In lib6502.h
+include file.)
+.It Fa callbacks
+a structure mapping processor memory accesses to client callback
+functions.
+.El
+.Pp
+Access to the contents of the
+.Fa registers
+and
+.Fa memory
+members can be made directly.
+The
+.Fa registers
+member is a
+.Vt M6502_Registers
+containing the following members:
+.Bd -literal
+struct _M6502_Registers
+{
+    uint8_t   a;  /* accumulator */
+    uint8_t   x;  /* X index register */
+    uint8_t   y;  /* Y index register */
+    uint8_t   p;  /* processor status register */
+    uint8_t   s;  /* stack pointer */
+    uint16_t pc;  /* program counter */
+};
+.Ed
+.Pp
+The
+.Fa memory
+member is an array of
+.Vt unsigned char
+and can be indexed directly.  In addition, two convenience macros
+.Fn M6502_getVector
+and
+.Fn M6502_setVector
+provide access to the reset and interrupt vectors within
+.Fa memory .
+.Fn M6502_getVector
+returns the address stored in the named
+.Fa vector
+which must be precisely one of the following:
+.Bl  -tag -width ".Dv RST" -offset indent
+.It Dv RST
+the reset vector.
+.It Dv NMI
+the non-maskable interrupt vector.
+.It Dv IRQ
+the interrupt request vector.
+.El
+.Pp
+.Fn M6502_setVector
+stores its
+.Fa address
+argument in the named
+.Fa vector
+and returns the new value.
+.Pp
+The
+.Fa callbacks
+member contains an opaque structure mapping processor memory accesses
+to client callback functions.  Whenever the processor performs an
+access for which a corresponding entry exists in the
+.Fa callbacks
+structure, the emulator suspends execution and invokes the callback to
+complete the operation.  Each callback function should have a
+signature equivalent to:
+.Bd -ragged -offset indent
+int
+.Va callback
+(M6502 *mpu, uint16_t address, uint8_t data);
+.Ed
+.Pp
+The macros
+.Fn M6502_getCallback
+and
+.Fn M6502_setCallback
+read and write entries in the
+.Fa callbacks
+structure.  These macros identify a unique memory access operation
+from the specified
+.Fa address
+on which it operates and
+.Fa type
+of access involved.  The
+.Fa type
+argument must be one of the following:
+.Bl -tag -width ".Dv write"
+.It Dv read
+the
+.Fa callback
+is invoked when the processor attempts to read from the
+given address.  The emulator passes the effective address of the
+operation to the callback in its
+.Fa address
+argument.  (The
+.Fa data
+argument is undefined.)  The value returned by the callback will be
+used by the emulator as the result of the read operation.
+.It Dv write
+the
+.Fa callback
+is invoked when the processor attempts to write to the
+given address.  The emulator passes the effective address of the
+operation to the callback in its
+.Fa address
+argument and the byte being written in the
+.Fa data
+argument.  The emulator will not perform the write operation before
+invoking the callback; if the write should complete, the callback must
+modify the processor's
+.Fa memory
+explicitly.  The value returned from the callback is ignored.
+.It Dv call
+the
+.Fa callback
+is invoked when the processor attempts to transfer control to the
+given address by any instruction other than a relative branch.  The
+emulator passes the destination address to the callback in its
+.Fa address
+argument and the instruction that initiated the control transfer in
+its
+.Fa data
+argument (one of JMP, JSR, BRK, RTS or RTI).  If the callback returns
+zero (the callback refuses to handle the operation) the emulator will
+allow the operation to complete as normal.  If the callback returns a
+non-zero address (indicating that the callback has handled the
+operation internally) the emulator will transfer control to that
+address.
+.It Dv illegal_instruction
+the
+.Fa callback
+is invoked when the processor attempts to execute the illegal instruction
+whose opcode is the given "address".  The emulator passes the address of the
+instruction to the callback in its
+.Fa address
+argument and the instruction itself in the
+.Fa data
+argument.  If the callback returns a non-zero address the 
+emulator will transfer control to that address, otherwise execution will 
+continue at the next instruction.
+.El
+.Pp
+.Fn M6502_getCallback
+returns zero if there is no callback associated with the given
+.Fa type
+and
+.Fa address .
+Passing zero as the
+.Fa callback
+argument of
+.Fn M6502_setCallback
+removes any callback that might have been associated with
+.Fa type
+and
+.Fa address .
+.Pp
+.Fn M6502_run
+emulates processor execution in the given
+.Fa mpu
+by repeatedly fetching the instruction addressed by
+.Fa pc
+and dispatching to it.  This function normally never returns.
+.Pp
+.Fn M6502_dump
+writes a (NUL-terminated) symbolic representation of the processor's
+internal state into the supplied
+.Fa buffer .
+Typical output resembles:
+.Bd -literal -offset indent
+PC=1010 SP=01FE A=0A X=5B Y=00 P=D1 NV-B---C
+.Ed
+.Pp
+.Fn M6502_disassemble
+writes a (NUL-terminated) symbolic representation of the instruction
+in the processor's memory at the given
+.Fa address
+into the supplied
+.Fa buffer .
+It returns the size (in bytes) of the instruction.  (In other words,
+the amount by which
+.Fa address
+should be incremented to arrive at the next instruction.)
+Typical output resembles:
+.Bd -literal -offset indent
+1009 cpx #5B
+.Ed
+.Pp
+(The
+.Fa buffer
+arguments are oversized to allow for future expansion.)
+.Pp
+.Fn M6502_delete
+frees the resources associated with the given
+.Fa mpu .
+Any members that were allocated implicitly (passed as NULL to
+.Fn M6502_new )
+are deallocated.  Members that were initialised from non-NULL
+arguments are not deallocated.
+.Pp
+.Fn M6502_setMode
+is a lib6502-jit extension which sets the emulation mode to use for the
+instance to
+.Fa mode ,
+which must be precisely one of the following:
+.Bl  -tag -width ".Dv RST" -offset indent
+.It Dv M6502_ModeInterpreted
+6502 code will be interpreted, much as in lib6502 itself.
+.It Dv M6502_ModeCompiled
+6502 code will always be compiled to host code before executing.  This can result
+in jerky execution as emulation halts during compilation.  Self-modifying code
+will work correctly, but if this happens a lot the repeated re-compilations
+will result in very slow execution.
+.It Dv M6502_ModeHybrid
+6502 code will be compiled to host code but the interpreter will be used to
+continue execution during compilation.  Execution will be smooth and relatively
+fast but performance of repeatedly executed code will vary (in theory, improve)
+over time.  Repeated self-modification by code will cause re-compilations but
+performance will still be reasonable as the interpreter will continue execution;
+the main downside is that CPU will be taken up by the compilation.  (On a
+machine with two or more idle cores, this is wasteful but should not
+significantly harm performance, as one core will run the interpreter while the
+other handles the compilation.)  This is the default mode.
+.El
+.Pp
+.Fa arg
+is the maximum number of 6502 instructions to be compiled into a single unit
+of code when hybrid or compiled mode is selected; it is ignored in interpreted
+mode.  Specifying 0 will give a reasonable default value.
+.Pp
+.\" ----------------------------------------------------------------
+.Sh IMPLEMENTATION NOTES
+.\" 
+You can share the
+.Fa memory
+and
+.Fa callbacks
+members of
+.Vt M6502
+between multiple instances to simulate multiprocessor hardware.
+.\" ----------------------------------------------------------------
+.Sh RETURN VALUES
+.\" 
+.Fn M6502_new
+returns a pointer to a
+.Vt M6502
+structure.
+.Fn M6502_getVector
+and
+.Fn M6502_setVector
+return the contents of the given
+.Fa vector .
+.Fn M6502_getCallback
+and
+.Fn M6502_setCallback
+return the
+.Vt M6502_Callback
+function associated with the given
+.Fa address
+and access
+.Fa type .
+.Fn M6502_disassemble
+returns the size (in bytes) of the instruction at the given
+.Fa address .
+.Fn M6502_reset ,
+.Fn M6502_nmi ,
+.Fn M6502_irq ,
+.Fn M6502_run ,
+.Fn M6502_dump ,
+.Fn M6502_delete
+and
+.Fn M6502_setMode
+don't return anything (unless you forgot to include
+.In lib6502.h ) .
+.\" ----------------------------------------------------------------
+.Sh EXAMPLES
+.\" 
+The following program creates a 6502 processor, sets up callbacks for
+printing characters and halting after a BRK instruction, stores a
+program into memory that prints the alphabet, disassembles the program
+on stdout, and then executes the program.
+.Bd -literal -offset indent -compact
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "lib6502.h"
+
+#define WRCH    0xFFEE
+
+int wrch(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  int pc;
+  putchar(mpu->registers->a);
+  pc  = mpu->memory[++mpu->registers->s + 0x100];
+  pc |= mpu->memory[++mpu->registers->s + 0x100] << 8;
+  return pc + 1;  /* JSR pushes next insn addr - 1 */
+}
+
+int done(M6502 *mpu, uint16_t address, uint8_t data)
+{
+  char buffer[64];
+  M6502_dump(mpu, buffer);
+  printf("\\nBRK instruction\\n%s\\n", buffer);
+  exit(0);
+}
+
+int main(int argc, char **argv)
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  unsigned  pc  = 0x1000;
+
+  mpu->callbacks->call[WRCH] = wrch;     /* write character */
+  mpu->callbacks->call[0000] = done;     /* reached after BRK */
+
+# define gen1(X)        (mpu->memory[pc++] = (uint8_t)(X))
+# define gen2(X,Y)      gen1(X); gen1(Y)
+# define gen3(X,Y,Z)    gen1(X); gen2(Y,Z)
+
+  gen2(0xA2, 'A'     );  /* LDX #'A'   */
+  gen1(0x8A          );  /* TXA        */
+  gen3(0x20,0xEE,0xFF);  /* JSR FFEE   */
+  gen1(0xE8          );  /* INX        */
+  gen2(0xE0, 'Z'+1   );  /* CPX #'Z'+1 */
+  gen2(0xD0, -9      );  /* BNE 1002   */
+  gen2(0xA9, '\\n'    );  /* LDA #'\\n'  */
+  gen3(0x20,0xEE,0xFF);  /* JSR FFEE   */
+  gen2(0x00,0x00     );  /* BRK        */
+
+  {
+    uint16_t ip = 0x1000;
+    while (ip < pc)
+      {
+        char insn[64];
+        ip += M6502_disassemble(mpu, ip, insn);
+        printf("%04X %s\\n", ip, insn);
+      }
+  }
+
+  M6502_setVector(mpu, RST, 0x1000);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);
+
+  return 0;
+}
+.Ed
+.\" ----------------------------------------------------------------
+.Sh DIAGNOSTICS
+.\" 
+If
+.Fn M6502_new
+cannot allocate sufficient memory it prints "out of memory" to stderr
+and exits with a non-zero status.
+.Pp
+If
+.Fn M6502_run
+encounters an illegal or undefined instruction, it prints "undefined
+instruction" and the processor's state to stderr, then exits with a
+non-zero status.
+.\" ----------------------------------------------------------------
+.Sh COMPATIBILITY
+.\" 
+M6502 is a generic name. The initial letter is mandated by C naming
+conventions and chosen in deference to MOS Technology, the original
+designers of the processor.  To the best of my knowledge the 'M'
+prefix was never stamped on a physical 6502.
+.Pp
+The emulator implements the CMOS version of the processor (NMOS bugs
+in effective address calculations involving page boundaries are
+corrected).  lib6502 does not tolerate the execution of undefined
+instructions (which were all no-ops in the first-generation CMOS
+hardware); lib6502-jit treats them as no-ops.  It would be nice to
+support the several alternative instruction sets (model-specific
+undocumented instructions in NMOS models, and various documented
+extensions in the later CMOS models) but there are currently no plans
+to do so.
+.Pp
+The emulated 6502 will run much faster than real hardware on any
+modern computer.  The fastest 6502 hardware available at the time of
+writing has a clock speed of 14 MHz.  On a 2 GHz PowerPC, the emulated
+6502 runs at almost 300 MHz (in interpreted mode).
+.\" ----------------------------------------------------------------
+.Sh SEE ALSO
+.\" 
+.Xr run6502 1
+.Pp
+For development tools, documentation and source code:
+.Pa http://6502.org
+.\" ----------------------------------------------------------------
+.Sh AUTHORS
+.\" 
+The original lib6502 software and manual pages were written by Ian Piumarta.
+Additional changes to create lib6502-jit were made by Steven Flintham.
+.Pp
+The software is provided as-is, with absolutely no warranty, in the
+hope that you will enjoy and benefit from it.  You may use (entirely
+at your own risk) and redistribute it under the terms of a very
+liberal license that does not seek to restrict your rights in any way
+(unlike certain so-called 'open source' licenses that significantly
+limit your freedom in the name of 'free' software that is, ultimately,
+anything but free).  See the file COPYING for details.
+.\" ----------------------------------------------------------------
+.Sh BUGS
+.\" 
+.Fn M6502_getVector
+and
+.Fn M6502_setVector
+evaluate their arguments more than once.
+.Pp
+The out-of-memory condition and attempted execution of
+illegal/undefined instructions should not be fatal errors.
+.Pp
+There is no way to limit the duration of execution within
+.Fn M6502_run
+to a certain number of instructions or cycles.
+.Pp
+The emulator should support some means of implicit interrupt
+generation, either by polling or in response to (Unix) signals.
+.Pp
+The
+.Sx COMPATIBILITY
+section in this manual page has been diverted from its legitimate
+purpose.
+.Pp
+The plural of 'callback' really ought to be 'callsback'.
+.Pp
+Please send bug reports (and feature requests) to:
+lib6502-jit@lemma.co.uk.
diff --git a/man/run6502.1 b/man/run6502.1
new file mode 100644
index 0000000..98f761f
--- /dev/null
+++ b/man/run6502.1
@@ -0,0 +1,396 @@
+.\" Copyright (c) 2005 Ian Piumarta
+.\" Copyright (c) 2014 Steven Flintham
+.\" 
+.\" Permission is hereby granted, free of charge, to any person
+.\" obtaining a copy of this software and associated documentation
+.\" files (the 'Software'), to deal in the Software without
+.\" restriction, including without limitation the rights to use, copy,
+.\" modify, merge, publish, distribute, and/or sell copies of the
+.\" Software, and to permit persons to whom the Software is furnished
+.\" to do so, provided that the above copyright notice(s) and this
+.\" permission notice appear in all copies of the Software and that
+.\" both the above copyright notice(s) and this permission notice
+.\" appear in supporting documentation.
+.\" 
+.\" THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+.\"
+.Dd October 31, 2005
+.Dt RUN6502 1 LOCAL
+.Os ""
+.\" ----------------------------------------------------------------
+.Sh NAME
+.\" 
+.Nm run6502
+.Nd execute a 6502 microprocessor program
+.\" ----------------------------------------------------------------
+.Sh SYNOPSIS
+.\" 
+.Nm run6502
+.Op Ar option ...
+.Nm run6502
+.Op Ar option ...
+.Fl B
+.Op Ar
+.\" ----------------------------------------------------------------
+.Sh DESCRIPTION
+The
+.Nm run6502
+command emulates the execution of a 6502 microprocessor.  It creates a
+memory image from the contents of one or more files on the command
+line and then simulates a power-on hardware reset to begin execution.
+.Pp
+In its first form,
+.Nm run6502
+emulates an embedded 6502 processor with 64 kilobytes of RAM, no
+memory-mapped hardware, and no input-output capabilities.  Limited
+interaction with the machine is possible only through the
+.Fl G , M
+and
+.Fl P
+options.
+.Pp
+In its second form (with the
+.Fl B
+option)
+.Nm run6502
+provides minimal emulation of Acorn 'BBC Model B' hardware with 32
+kilobytes of RAM, 16 kilobytes of paged language ROMs, and 16
+kilobytes of operating system ROM.  A few MOS calls are intercepted to
+provide keyboard input and screen output via stdin and stdout.
+Switching between the sixteen paged read-only memory banks is also
+supported by the usual memory-mapped control register.  Any
+.Ar file
+arguments after the
+.Fl B
+are loaded into successive paged ROM banks (starting at 15 and working
+down towards 0) before execution begins.
+.\" ----------------------------------------------------------------
+.Ss Options
+.\" 
+.Bl -tag -width indent
+.It Fl B
+enable minimal Acorn 'BBC Model B' hardware emulation:
+.Bl -bullet
+.It
+the contents of memory between addresses 0x8000 and 0xBFFF are copied
+into paged ROM number 0;
+.It
+memory between 0x8000 and 0xBFFF becomes bank-switchable between
+sixteen different ROM images;
+.It
+the memory-mapped pages ('FRED', 'JIM' and 'SHEILA') between 0xFC00
+and 0xFEFF are initialised to harmless values;
+.It
+the upper half of the address space is write-protected; and
+.It
+callbacks are installed on several OS entry points to provide
+input-output via stdin and stdout.
+.El
+.Pp
+Any remaining non-option arguments on the command line will name files
+to be loaded successively into paged ROMs, starting at 15 and working
+downwards towards 0.
+.It Fl d Ar addr Ar end
+dump memory from the address
+.Ar addr
+(given in hexadecimal) up to (but not including)
+.Ar end .
+The
+.Ar end
+argument is either an absolute address or a relative address specified
+as a '+' character followed by the number (in hexadecimal) of bytes to
+dump.  In other words, the following two options dump the same region
+of memory:
+.Bd -ragged -offset indent
+.Fl d
+8000  C000
+.Ed
+.Bd -ragged -offset indent -compact
+.Fl d
+8000 +4000
+.Ed
+.Pp
+The format of the dump cannot currently be modified and consists of
+the current address followed by one, two or three hexadecimal bytes,
+and a symbolic representation of the instruction at that address.
+.It Fl G Ar addr
+arrange that subroutine calls to
+.Ar addr
+will behave as if there were an implementation of
+.Xr getchar 3
+at that address, reading a character from stdin and returning it in
+the accumulator.
+.It Fl h
+print a summary of the available options and then exit.
+.It Fl I Ar addr
+set the IRQ (interrupt request) vector (the address to which the
+processor will transfer control upon execution of a BRK instruction).
+Setting this address to zero will cause execution to halt (and the
+emulator to exit) when a BRK instruction is encountered.
+.It Fl i Ar addr Ar file
+Load
+.Ar file
+into the memory image at the address
+.Ar addr
+(in hexadecimal), skipping over any initial '#!' interpreter line.
+.It Fl l Ar addr Ar file
+Load
+.Ar file
+into the memory image at the address
+.Ar addr
+(in hexadecimal).
+.It Fl M Ar addrio
+arrange that memory reads from address
+.Ar addrio
+will return the next character on stdin (blocking if necessary), and
+memory writes to
+.Ar addrio
+will send the value written to stdout.
+.It Fl mc
+use compiled emulation mode.  All code is compiled into host machine
+code.  This can make the emulation very jerky as execution halts
+while compiling.
+.It Fl mh
+use hybrid emulation mode.  Code is compiled into
+host machine code, but while this is happening an interpreter allows
+execution to continue.  This is the default mode.
+.It Fl mi
+use interpreted emulation mode. All code is interpreted.
+.It Fl mx Ar count
+in compiled and hybrid emulation modes, set the maximum number of
+6502 instructions which are translated as a unit to
+.Ar count .
+This has no effect in interpreted mode. A reasonable default is
+chosen if this is not specified.
+.It Fl N Ar addr
+set the NMI (non-maskable interrupt) vector to
+.Ar addr .
+.It Fl P Ar addr
+arrange that subroutine calls to
+.Ar addr
+will behave as if there were an implementation of
+.Xr putchar 3
+at that address, writing the contents of the accumulator to stdout.
+.It Fl R Ar addr
+set the RST (hardware reset) vector.  The processor will transfer
+control to this address when emulated execution begins.
+.It Fl s Ar addr Ar end Ar file
+save the contents of memory from the address
+.Ar addr
+up to
+.Ar end
+(exclusive) to the given
+.Ar file .
+As with the
+.Fl d
+option,
+.Ar end
+can be absolute or '+' followed by a byte count.
+.It Fl v
+print version information and then exit.
+.It Fl X Ar addr
+arrange that any transfer of control to the address
+.Ar addr
+will cause an immediate exit with zero exit status.
+.It Fl x
+exit immediately.  (Useful after
+.Fl d
+or when
+.Nm run6502
+is being used as a trivial 'image editor', with several
+.Fl l
+options followed by
+.Fl s
+and
+.Fl x . )
+.It Ar
+following a
+.Fl B
+option, load one or more ROM image
+files
+into successive paged ROM slots.  Other than the paging aspect, this
+is equivalent to:
+.Bd -ragged -offset indent
+.Fl l Ar 8000 Ar image
+.Ed
+.El
+.\" ----------------------------------------------------------------
+.Sh EXAMPLES
+.\" 
+.Ss A Very Simple Program
+The
+.Xr perl 1
+command can be used to create a binary file from hexadecimal input:
+.Bd -literal
+    echo a2418a20eeffe8e05bd0f7a90a20eeff00 |
+    perl -e 'print pack "H*",<STDIN>' > temp.img
+.Ed
+.Pp
+The file can be loaded and executed with:
+.Bd -literal
+    run6502 -l 1000 temp.img -R 1000 -P FFEE -X 0
+.Ed
+.Pp
+The contents of the file can be inspected symbolically with:
+.Bd -literal
+    run6502 -l 1000 temp.img -d 1000 +12
+.Ed
+.Pp
+The options passed to
+.Nm run6502
+in the above examples have the following effects:
+.Bl -tag -width offset
+.It \-l 1000 temp.img
+loads the file
+.Pa temp.img
+into memory at address 0x1000.
+.It \-R 1000
+sets the reset vector (the address of first instruction to be executed
+after 'power on') to 0x1000.
+.It \-P FFEE
+arranges for calls to address 0xFFEE to behave as if there were an
+implementation of
+.Xr putchar 3
+at that address.
+.It \-X 0
+arranges for transfers of control to address 0 to exit from the
+emulator.  This works in the above example because the final 'BRK'
+instruction causes an implicit subroutine call through an
+uninitialised interrupt vector to location 0.  To see this
+instruction...
+.It \-d 1000 +12
+disassembles 18 bytes of memory at address 0x1000.
+.El
+.Ss Standalone Images
+The
+.Fl i
+option is designed for use in the 'interpreter command' appearing on
+the first line of an executable script.  Adding the line
+.Bd -literal
+    #!run6502 -R 1000 -P FFEE -X 0 -i 1000
+.Ed
+.Pp
+(with no leading spaces and a single trailing newline character)
+to the
+.Pa temp.img
+file from the first example turns it into a script.  If the file is
+made executable with
+.Bd -literal
+    chmod +x temp.img
+.Ed
+.Pp
+it can be run like a standalone program:
+.Bd -literal
+    ./temp.img
+.Ed
+.Ss A Very Complex Program
+Consider a pair of files named
+.Pa os1.2
+and
+.Pa basic2
+containing (legally-acquired, of course) ROM images of Acorn MOS 1.2
+and BBC Basic 2.  The following command loads each of the images into
+memory at the appropriate address, cleans up the regions of memory
+containing memory-mapped i/o on the BBC computer, saves a snapshot of
+the entire memory to the file
+.Pa image 
+and then exits:
+.Bd -literal
+    run6502 -l C000 os1.2 -l 8000 basic2 -B -s 0 +10000 image -x
+.Ed
+.Pp
+Running the generated image with
+.Bd -literal
+    run6502 image
+.Ed
+.Pp
+will cold-start the emulated hardware, run the OS for a while, and
+then drop into the language ROM.  Basic programs can then be entered,
+edited and run from the terminal.
+.Pp
+More details are given in the
+.Pa README
+file available in the
+.Pa examples
+directory of the distribution.
+.Ss Exercises
+Create a standalone image (one that can be run as a program, with
+a '#!' interpreter line at the beginning) that contains Basic2 and
+OS1.2 (as described above).  This image should be no larger than 32K
+(memory below 0x8000, which would be full of zeroes, should not appear
+in the image file).
+.\" ----------------------------------------------------------------
+.Sh DIAGNOSTICS
+.\" 
+If nothing goes wrong, none.  Otherwise lots.  They should be
+self-explanatory.  I'm too lazy to enumerate them.
+.\" ----------------------------------------------------------------
+.Sh COMPATIBILITY
+.\" 
+See
+.Xr lib6502 3
+for a discussion of the emulated instruction set.
+.\" ----------------------------------------------------------------
+.Sh SEE ALSO
+.\" 
+.Xr lib6502 3
+.Pp
+The file
+.Pa examples/README
+in the lib6502 distribution.  (Depending on your system this may be
+installed in
+.Pa /usr/doc/lib6502 ,
+.Pa /usr/local/doc/lib6502 ,
+.Pa /usr/share/doc/lib6502 ,
+or similar.)
+.Pp
+.Pa http://piumarta.com/software/lib6502
+for updates and documentation to lib6502.
+.Pp
+.Pa https://github.com/ZornsLemma/lib6502-jit
+for updates and documentation to lib6502-jit.
+.Pp
+.Pa http://6502.org
+for lots of 6502-related resources.
+.\" ----------------------------------------------------------------
+.Sh AUTHORS
+.\" 
+The original lib6502 software and manual pages were written by Ian Piumarta.
+Additional changes to create lib6502-jit were made by Steven Flintham.
+.Pp
+The software is provided as-is, with absolutely no warranty, in the
+hope that you will enjoy and benefit from it.  You may use (entirely
+at your own risk) and redistribute it under the terms of a very
+liberal license that does not seek to restrict your rights in any way
+(unlike certain so-called 'open source' licenses that significantly
+limit your freedom in the name of 'free' software that is, ultimately,
+anything but free).  See the file COPYING for details.
+.\" ----------------------------------------------------------------
+.Sh BUGS
+.\" 
+.Bl -bullet
+.It
+Options must appear one at a time.
+.It
+Any attempt (in a load or save operation) to transfer data beyond
+0xFFFF is silently truncated at the end of memory.
+.It
+There is no way to specify the slot into which a ROM image should be
+loaded, other than implicitly according to the order of arguments on
+the command line.
+.It
+Execution can only be started via the emulated power-up reset.  There
+is no support for 'warm-starting' execution in an image at an
+arbitrary address.
+.It
+Even though the emulator fully supports them, there is no way to
+artificially generate a hardware interrupt request, non-maskable
+interrupt, or reset condition.  If you need these, read
+.Xr lib6502 3
+and write your own shell.
+.It
+The Acorn 'BBC Model B' hardware emulation is totally lame.
+.El
+.Pp
+Please send bug reports (and feature requests) to:
+lib6502-jit@lemma.co.uk.
diff --git a/run6502.c b/run6502.c
new file mode 100644
index 0000000..2e3731a
--- /dev/null
+++ b/run6502.c
@@ -0,0 +1,599 @@
+/* run6502.c -- 6502 emulator shell			-*- C -*- */
+
+/* Copyright (c) 2005 Ian Piumarta
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+/* Last edited: 2005-11-02 01:18:58 by piumarta on margaux.local
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+#include <sys/wait.h>
+#include <time.h>
+
+#include "config.h"
+#include "lib6502.h"
+
+#undef VERSION	/* drop any earlier definition (presumably from config.h) before redefining it */
+#define VERSION	PACKAGE_NAME " " PACKAGE_VERSION " " PACKAGE_COPYRIGHT	/* banner printed by usage() and doVersion() */
+
+typedef uint8_t  byte;	/* one byte of 6502 memory or an 8-bit register */
+typedef uint16_t word;	/* a 16-bit 6502 address */
+
+static char *program= 0;	/* program name shown in the usage() text */
+
+static M6502_Mode mode= M6502_ModeHybrid;	/* emulation mode; changed by doMode() */
+static int max_insns= 0; /* 0 = default; see the -mx option in usage() */
+
+static byte bank[0x10][0x4000];	/* sixteen 16K paged ROM images, copied in and out by doBtraps()/bankSelect() */
+
+static uint64_t system_time_base;	/* pseudo_system_time() value captured in doBtraps(); OSWORD 1 reports time relative to it */
+
+
+void fail(const char *fmt, ...)	/* print a printf-style message to stderr, then exit with status 1 */
+{
+  va_list ap;
+  fflush(stdout);		/* push out pending stdout text before writing to stderr */
+  va_start(ap, fmt);
+  vfprintf(stderr, fmt, ap);
+  va_end(ap);
+  fprintf(stderr, "\n");	/* terminate the message with a newline */
+  exit(1);
+}
+
+
+void pfail(const char *msg)	/* report msg together with the current errno text, then exit with status 1 */
+{
+  fflush(stdout);	/* push out pending stdout text before writing to stderr */
+  perror(msg);
+  exit(1);
+}
+
+
+#define rts	/* leave a callback by popping a return address off the 6502 stack; needs 'mpu' in scope */	\
+  {								\
+    word pc;							\
+    pc  = mpu->memory[++mpu->registers->s + 0x100];	/* low byte, from the stack page at 0x100 */	\
+    pc |= mpu->memory[++mpu->registers->s + 0x100] << 8;	/* high byte */	\
+    return pc + 1;	/* the stacked address is one less than the address to resume at */	\
+  }
+
+
+uint64_t pseudo_system_time(void)
+{
+  /* Monotonic clock reading expressed in hundredths of a second;
+   * exits through pfail() if the clock cannot be read. */
+  struct timespec now;
+  if (-1 == clock_gettime(CLOCK_MONOTONIC, &now))
+    pfail("clock_gettime() failed");
+  long centis= now.tv_nsec / 10000000;
+  return ((uint64_t) now.tv_sec) * 100 + centis;
+}
+
+int osword(M6502 *mpu, word address, byte data)	/* call callback for OSWORD: A selects the function, XY addresses its parameter block */
+{
+  byte *params= mpu->memory + mpu->registers->x + (mpu->registers->y << 8);	/* parameter block at X + 256*Y */
+
+  switch (mpu->registers->a)
+    {
+    case 0x00: /* input line */
+      /* On entry: XY+0,1=>string area,
+       *	   XY+2=maximum line length,
+       *	   XY+3=minimum acceptable ASCII value,
+       *	   XY+4=maximum acceptable ASCII value.
+       * On exit:  Y is the line length (excluding CR),
+       *	   C is set if Escape terminated input.
+       */
+      {
+	word  offset= params[0] + (params[1] << 8);	/* little-endian address of the string area */
+	byte *buffer= mpu->memory + offset;
+	byte  length= params[2], minVal= params[3], maxVal= params[4], b= 0;
+	if (!fgets((char *) buffer, length, stdin))	/* nothing read (end of input or error): end the emulation */
+	  {
+	    putchar('\n');
+	    exit(0);
+	  }
+	for (b= 0;  b < length;  ++b)	/* find the first character outside [minVal, maxVal], or the newline */
+	  if ((buffer[b] < minVal) || (buffer[b] > maxVal) || ('\n' == buffer[b]))
+	    break;
+	buffer[b]= 13;			/* terminate the line with CR */
+	mpu->registers->y= b;		/* line length, excluding the CR */
+	mpu->registers->p &= 0xFE;	/* clear bit 0 of P (the C flag of the header comment): Escape is never reported */
+	break;
+      }
+
+    case 0x01: /* read system time */
+      /* On exit: XY+0..4=>5 byte time in hundredths of a second
+       */
+      {
+	uint64_t system_time= pseudo_system_time() - system_time_base;	/* time elapsed since system_time_base was captured */
+	int i;
+	for (i= 0;  i < 5;  ++i)	/* store five bytes, least significant first */
+	{
+	  params[i]= system_time & 0xFF;
+	  system_time>>= 8;
+	}
+	break;
+      }
+
+    case 0x05: /* read I/O processor memory */
+      /* On entry: XY+0..3=>address to read from
+       * On exit:  XY+4   =>the byte read
+       */
+    {
+	word addr= params[0] + (params[1] << 8);	/* only the low 16 bits of the address are used */
+	params[4]= mpu->memory[addr];
+	break;
+    }
+
+    default:	/* unsupported function: report the processor state and abort */
+      {
+	char state[64];
+	M6502_dump(mpu, state);
+	fflush(stdout);
+	fprintf(stderr, "\nOSWORD %s\n", state);
+	fail("ABORT");
+      }
+      break;	/* not reached: fail() exits */
+    }
+  
+  rts;	/* return to the 6502 code that made the call */
+}
+
+
+int osbyte(M6502 *mpu, word address, byte data)	/* call callback for OSBYTE: A selects the function, results go back in X and Y */
+{
+  switch (mpu->registers->a)
+    {
+    case 0x7A:	/* perform keyboard scan */
+      mpu->registers->x= 0x00;
+      break;
+
+    case 0x7E:	/* acknowledge detection of escape condition */
+      return 1;	/* NOTE(review): lib6502(3) says a non-zero return is the address to continue at; confirm that 1 is intended */
+      break;
+
+    case 0x82:	/* read machine higher order address */
+      mpu->registers->y= 0x00;
+      mpu->registers->x= 0x00;
+      break;
+
+    case 0x83:	/* read top of OS ram address (OSHWM) */
+      mpu->registers->y= 0x0E;	/* high byte: the reported address is 0x0E00 */
+      mpu->registers->x= 0x00;	/* low byte */
+      break;
+
+    case 0x84:	/* read bottom of display ram address */
+      mpu->registers->y= 0x80;	/* high byte: the reported address is 0x8000 */
+      mpu->registers->x= 0x00;	/* low byte */
+      break;
+
+    case 0x89:	/* motor control */
+      break;	/* accepted and ignored */
+
+    case 0xDA:	/* read/write number of items in vdu queue (stored at 0x026A) */
+      return 0;	/* zero: per lib6502(3) the emulator lets the call complete as normal */
+      break;
+
+    default:	/* unsupported function: report the processor state and abort */
+      {
+	char state[64];
+	M6502_dump(mpu, state);
+	fflush(stdout);
+	fprintf(stderr, "\nOSBYTE %s\n", state);
+	fail("ABORT");
+      }
+      break;	/* not reached: fail() exits */
+    }
+
+  rts;	/* return to the 6502 code that made the call */
+}
+
+
+int oscli(M6502 *mpu, word address, byte data)	/* call callback for OSCLI: pass the CR-terminated command line at XY to system(3) */
+{
+  byte *params= mpu->memory + mpu->registers->x + (mpu->registers->y << 8);	/* command text at X + 256*Y */
+  byte *limit= mpu->memory + 0x10000;	/* one past the end of the 64K emulated memory */
+  char  command[1024], *ptr= command;
+  while ((params < limit) && (('*' == *params) || (' ' == *params)))	/* skip leading '*'s and spaces */
+    ++params;
+  while ((params < limit) && (13 != *params) && (ptr < command + sizeof(command) - 1))
+    *ptr++= *params++;	/* copy up to the CR; never read past memory or overflow command[] (long lines are truncated) */
+  *ptr= '\0';
+  int ret= system(command);
+  if ((ret == -1) || (WIFEXITED(ret) && (WEXITSTATUS(ret) == 127)))	/* no child created, or the shell could not run the command */
+    {
+      fflush(stdout);
+      fprintf(stderr, "\nsystem() failed\n");
+    }
+  rts;	/* return to the 6502 code that made the call */
+}
+
+
+int oswrch(M6502 *mpu, word address, byte data)
+{
+  /* Call callback for OSWRCH: write the character held in A to stdout.
+   * Code 0x0C is replaced by the ANSI "erase display, cursor home"
+   * sequence; every other value is written unchanged.  The stream is
+   * flushed after each character.
+   */
+  const byte ch= mpu->registers->a;
+  if (0x0C == ch)
+    fputs("\033[2J\033[H", stdout);
+  else
+    putchar(ch);
+  fflush(stdout);
+  rts;
+}
+
+
+static int writeROM(M6502 *mpu, word address, byte value)	/* write callback that stores nothing, so the address behaves as read-only */
+{
+  return 0;	/* memory is left untouched; lib6502(3) says the value returned by a write callback is ignored */
+}
+
+
+static int bankSelect(M6502 *mpu, word address, byte value)	/* write callback: page ROM bank (value & 0x0F) into 0x8000-0xBFFF */
+{
+  memcpy(mpu->memory + 0x8000, bank[value & 0x0F], 0x4000);	/* copy the 16K bank image over the paged ROM area */
+  return 0;
+}
+
+
+static int doBtraps(int argc, char **argv, M6502 *mpu)	/* -B: set up the minimal Acorn 'BBC Model B' environment */
+{
+  unsigned addr;
+
+  /* Acorn Model B ROM and memory-mapped IO */
+
+  for (addr= 0x8000;  addr <= 0xFBFF;  ++addr)  mpu->callbacks->write[addr]= writeROM;	/* discard writes to 0x8000-0xFBFF */
+  for (addr= 0xFC00;  addr <= 0xFEFF;  ++addr)  mpu->memory[addr]= 0xFF;	/* fill the three memory-mapped I/O pages with 0xFF */
+  for (addr= 0xFE30;  addr <= 0xFE33;  ++addr)  mpu->callbacks->write[addr]= bankSelect;	/* writes to these four addresses select the paged ROM bank */
+  for (addr= 0xFE40;  addr <= 0xFE4F;  ++addr)  mpu->memory[addr]= 0x00;	/* these sixteen I/O bytes are zeroed instead */
+  for (addr= 0xFF00;  addr <= 0xFFFF;  ++addr)  mpu->callbacks->write[addr]= writeROM;	/* discard writes to the top page as well */
+
+  /* anything already loaded at 0x8000 appears in bank 0 */
+
+  memcpy(bank[0x00], mpu->memory + 0x8000, 0x4000);
+
+  /* fake a few interesting OS calls */
+
+# define trap(vec, addr, func)   mpu->callbacks->call[addr]= (func)	/* vec is accepted but unused; only the call callback at addr is installed */
+  trap(0x020C, 0xFFF1, osword);
+  trap(0x020A, 0xFFF4, osbyte);
+//trap(0x0208, 0xFFF7, oscli );	/* enable this to send '*COMMAND's to system(3) :-) */
+  trap(0x020E, 0xFFEE, oswrch);
+  trap(0x020E, 0xE0A4, oswrch);	/* NVWRCH */
+#undef trap
+
+  system_time_base= pseudo_system_time();	/* OSWORD 1 reports time elapsed since this point */
+
+  return 0;	/* presumably the number of extra argv entries consumed (doLoad returns 2, doSave 3) -- confirm against main() */
+}
+
+
+static void usage(int status)	/* print the option summary (to stdout if status is 0, else stderr) and exit(status) */
+{
+  FILE *stream= status ? stderr : stdout;
+  fprintf(stream, VERSION"\n");
+  fprintf(stream, "please send bug reports to: %s\n", PACKAGE_BUGREPORT);
+  fprintf(stream, "\n");
+  fprintf(stream, "usage: %s [option ...]\n", program);
+  fprintf(stream, "       %s [option ...] -B [image ...]\n", program);
+  fprintf(stream, "  -B                -- minimal Acorn 'BBC Model B' compatibility\n");
+  fprintf(stream, "  -d addr last      -- dump memory between addr and last\n");
+  fprintf(stream, "  -G addr           -- emulate getchar(3) at addr\n");
+  fprintf(stream, "  -h                -- help (print this message)\n");
+  fprintf(stream, "  -I addr           -- set IRQ vector\n");
+  fprintf(stream, "  -i addr file      -- load file at addr, skipping any initial '#!' line\n");
+  fprintf(stream, "  -l addr file      -- load file at addr\n");
+  fprintf(stream, "  -M addr           -- emulate memory-mapped stdio at addr\n");
+  fprintf(stream, "  -mc               -- use compiled emulation mode\n");
+  fprintf(stream, "  -mh               -- use hybrid emulation mode (default)\n");
+  fprintf(stream, "  -mi               -- use interpreted emulation mode\n");
+  fprintf(stream, "  -mx count         -- maximum instructions to JIT (-mc/-mh)\n");
+  fprintf(stream, "  -N addr           -- set NMI vector\n");
+  fprintf(stream, "  -P addr           -- emulate putchar(3) at addr\n");
+  fprintf(stream, "  -R addr           -- set RST vector\n");
+  fprintf(stream, "  -s addr last file -- save memory from addr to last in file\n");
+  fprintf(stream, "  -v                -- print version number then exit\n");
+  fprintf(stream, "  -X addr           -- terminate emulation if PC reaches addr\n");
+  fprintf(stream, "  -x                -- exit without further ado\n");
+  fprintf(stream, "  image             -- '-l 8000 image' in available ROM slot\n");
+  fprintf(stream, "\n'last' can be an address (non-inclusive) or '+size' (in bytes)\n");	/* blank separator line folded into this string */
+  exit(status);
+}
+
+
+static int doHelp(int argc, char **argv, M6502 *mpu)	/* -h: print the usage summary on stdout and exit with status 0 */
+{
+  usage(0);
+  return 0;	/* not reached: usage() exits */
+}
+
+
+static int doVersion(int argc, char **argv, M6502 *mpu)	/* -v: print the VERSION banner on stdout and exit with status 0 */
+{
+  puts(VERSION);
+  exit(0);
+  return 0;	/* not reached */
+}
+
+
+static unsigned long htol(char *hex)	/* parse hex as a hexadecimal number; calls fail() unless the whole, non-empty string is a number */
+{
+  char *end;
+  unsigned long l= strtoul(hex, &end, 16);	/* unsigned conversion to match the return type */
+  if ((end == hex) || *end) fail("bad hex number: %s", hex);	/* end == hex: no digits at all (e.g. an empty argument) */
+  return l;
+}
+
+
+static int loadInterpreter(M6502 *mpu, word start, const char *path)	/* 1 if loaded; 0 if unreadable or not a "#!" file */
+{
+  FILE   *file= fopen(path, "r");
+  int     count= 0;
+  byte   *memory= mpu->memory + start;
+  size_t  max= 0x10000 - start;	/* never read past address 0xFFFF */
+  int     ok= 0;
+
+  if (!file) return 0;
+  if (('#' == fgetc(file)) && ('!' == fgetc(file)))	/* only files that begin with "#!" are accepted */
+    {
+      while (fgetc(file) >= ' ')	/* discard the rest of the first line; stops at a control character or EOF */
+        ;
+      while ((count= fread(memory, 1, max, file)) > 0)
+        { memory += count;  max -= count; }
+      ok= 1;
+    }
+  fclose(file);	/* reached on every path once the file is open, so a rejected file no longer leaks its handle */
+  return ok;
+}
+
+
+static int save(M6502 *mpu, word address, unsigned length, const char *path)	/* 1 if the (clamped) range was written in full, else 0 */
+{
+  FILE  *file= 0;
+  size_t count= 0, max= 0x10000 - address;
+  if (length > max) length= max;	/* like load(): never run past address 0xFFFF */
+  if (!(file= fopen(path, "wb"))) return 0;	/* "wb": memory is raw bytes, not text */
+  while (length && (count= fwrite(mpu->memory + address, 1, length, file)))
+    {
+      address += count;
+      length -= count;
+    }
+  if (fclose(file)) return 0;	/* fclose flushes buffered data, so a write error can surface here */
+  return !length;	/* bytes left over mean fwrite stopped early */
+}
+
+
+static int load(M6502 *mpu, word address, const char *path)	/* 1 if the file could be opened, else 0 */
+{
+  FILE  *file= fopen(path, "rb");	/* "rb": images are raw bytes, so no text-mode translation */
+  size_t max= 0x10000 - address;	/* room left between address and the top of memory */
+  size_t count= 0;
+  if (!file)
+    return 0;
+  while ((count= fread(mpu->memory + address, 1, max, file)) > 0)
+    {
+      address += count;
+      max -= count;	/* anything in the file beyond this limit is silently ignored */
+    }
+  fclose(file);
+  return 1;
+}
+
+
+static int doLoadInterpreter(int argc, char **argv, M6502 *mpu)	/* -i addr file */
+{
+  if (argc < 3) usage(1);	/* both operands are required */
+  if (!loadInterpreter(mpu, htol(argv[1]), argv[2])) pfail(argv[2]);
+  return 2;	/* operands consumed; main() advances argv by this amount */
+}
+
+
+static int doLoad(int argc, char **argv, M6502 *mpu)	/* -l addr file */
+{
+  if (argc < 3) usage(1);	/* both operands are required */
+  if (!load(mpu, htol(argv[1]), argv[2])) pfail(argv[2]);
+  return 2;	/* operands consumed; main() advances argv by this amount */
+}
+
+
+static int doSave(int argc, char **argv, M6502 *mpu)	/* -s addr size file */
+{
+  if (argc < 4) usage(1);
+  if (!save(mpu, htol(argv[1]), htol(argv[2]), argv[3])) pfail(argv[3]);	/* NOTE(review): argv[2] is handed to save() as a byte count; unlike -d, a plain end address is not converted */
+  return 3;	/* operands consumed; main() advances argv by this amount */
+}
+
+
+static int doMode(M6502_Mode m)	/* -mc / -mh / -mi */
+{
+  mode= m;	/* only recorded here; main() passes it to M6502_setMode() */
+  return 0;	/* these options take no operands */
+}
+
+
+static int doMaxInsns(int argc, char **argv, M6502 *mpu)	/* -mx count */
+{
+  if (argc < 2) usage(1);
+  char *end= 0;
+  unsigned long l= strtoul(argv[1], &end, 10);	/* decimal, unlike the hex operands parsed by htol() */
+  if ((end == argv[1]) || *end) fail("bad number: %s", argv[1]);	/* end == argv[1]: no digits, e.g. an empty operand */
+  max_insns= l;	/* only recorded here; main() passes it to M6502_setMode() */
+  return 1;
+}
+
+
+#define doVEC(VEC)					\
+  static int do##VEC(int argc, char **argv, M6502 *mpu)	\
+    {							\
+      unsigned addr= 0;					\
+      if (argc < 2) usage(1);				\
+      addr= htol(argv[1]);				\
+      M6502_setVector(mpu, VEC, addr);			\
+      return 1;						\
+    }
+
+doVEC(IRQ);	/* defines doIRQ(), the handler for -I addr */
+doVEC(NMI);	/* defines doNMI(), the handler for -N addr */
+doVEC(RST);	/* defines doRST(), the handler for -R addr */
+
+#undef doVEC	/* the generator macro is only needed for the three definitions above */
+
+
+static int gTrap(M6502 *mpu, word addr, byte data)	{ mpu->registers->a= getchar();  rts; }	/* -G hook: accumulator <- getchar(); rts is a macro defined elsewhere in this file */
+static int pTrap(M6502 *mpu, word addr, byte data)	{ putchar(mpu->registers->a);  rts; }	/* -P hook: putchar(accumulator) */
+
+static int doGtrap(int argc, char **argv, M6502 *mpu)	/* -G addr */
+{
+  unsigned addr;
+  if (argc < 2) usage(1);
+  addr= htol(argv[1]);	/* NOTE(review): not range-checked against 0xFFFF before use */
+  M6502_setCallback(mpu, call, addr, gTrap);	/* install gTrap as the 'call' callback for addr */
+  return 1;	/* one operand consumed */
+}
+
+static int doPtrap(int argc, char **argv, M6502 *mpu)	/* -P addr */
+{
+  unsigned addr;
+  if (argc < 2) usage(1);
+  addr= htol(argv[1]);	/* NOTE(review): not range-checked against 0xFFFF before use */
+  M6502_setCallback(mpu, call, addr, pTrap);	/* install pTrap as the 'call' callback for addr */
+  return 1;	/* one operand consumed */
+}
+
+
+static int mTrapRead(M6502 *mpu, word addr, byte data)	{ return getchar(); }	/* -M read hook: returns the next stdin character (EOF included) */
+static int mTrapWrite(M6502 *mpu, word addr, byte data)	{ return putchar(data); }	/* -M write hook: sends the written byte to stdout */
+
+static int doMtrap(int argc, char **argv, M6502 *mpu)	/* -M addr */
+{
+  unsigned addr= 0;
+  if (argc < 2) usage(1);
+  addr= htol(argv[1]);	/* NOTE(review): not range-checked against 0xFFFF before use */
+  M6502_setCallback(mpu, read,  addr, mTrapRead);	/* same address gets both a read hook ... */
+  M6502_setCallback(mpu, write, addr, mTrapWrite);	/* ... and a write hook */
+  return 1;	/* one operand consumed */
+}
+
+
+static int xTrap(M6502 *mpu, word addr, byte data)	{ exit(0);  return 0; }	/* -X hook: ends the process with status 0; the return is never reached */
+
+static int doXtrap(int argc, char **argv, M6502 *mpu)	/* -X addr */
+{
+  unsigned addr= 0;
+  if (argc < 2) usage(1);
+  addr= htol(argv[1]);	/* NOTE(review): not range-checked against 0xFFFF before use */
+  M6502_setCallback(mpu, call, addr, xTrap);	/* install xTrap as the 'call' callback for addr */
+  return 1;	/* one operand consumed */
+}
+
+
+static int doDisassemble(int argc, char **argv, M6502 *mpu)	/* -d addr last */
+{
+  unsigned addr= 0, last= 0;
+  if (argc < 3) usage(1);
+  addr= htol(argv[1]);
+  last= ('+' == *argv[2]) ? addr + htol(1 + argv[2]) : htol(argv[2]);	/* "+size" is relative to addr, anything else is an end address */
+  while (addr < last)	/* last itself is excluded */
+    {
+      char insn[64];
+      int  i= 0, size= M6502_disassemble(mpu, addr, insn);	/* size is used below as the instruction's length in bytes */
+      printf("%04X ", addr);
+      while (i++ < size)  printf("%02X", mpu->memory[addr + i - 1]);	/* the instruction's bytes in hex */
+      while (i++ < 4)     printf("  ");	/* pad the hex column; i continues from the loop above */
+      putchar(' ');
+      i= 0;
+      while (i++ < size)  putchar(isgraph(mpu->memory[addr + i - 1]) ? mpu->memory[addr + i - 1] : ' ');	/* the same bytes as characters, blanks for non-graphic ones */
+      while (i++ < 4)     putchar(' ');	/* pad the character column */
+      printf(" %s\n", insn);
+      addr += size;	/* NOTE(review): a size of 0 would loop forever; not guarded here */
+    }
+  return 2;	/* two operands consumed */
+}
+
+
+int main(int argc, char **argv)
+{
+  M6502 *mpu= M6502_new(0, 0, 0);	/* NOTE(review): the result is used without a NULL check */
+  int bTraps= 0;
+
+  program= argv[0];
+
+  if ((2 == argc) && ('-' != *argv[1]))	/* exactly one argument and it is not an option: run it as an image */
+    {
+      if ((!loadInterpreter(mpu, 0, argv[1])) && (!load(mpu, 0, argv[1])))	/* try the "#!" form first, then a raw load at 0 */
+	pfail(argv[1]);
+      doBtraps(0, 0, mpu);
+    }
+  else
+    while (++argv, --argc > 0)
+      {
+	int n= 0;	/* operands consumed by the current option */
+	if      (!strcmp(*argv, "-B"))  bTraps= 1;
+	else if (!strcmp(*argv, "-d"))	n= doDisassemble(argc, argv, mpu);
+	else if (!strcmp(*argv, "-G"))	n= doGtrap(argc, argv, mpu);
+	else if (!strcmp(*argv, "-h"))	n= doHelp(argc, argv, mpu);
+	else if (!strcmp(*argv, "-i"))	n= doLoadInterpreter(argc, argv, mpu);	/* NOTE(review): accepted here but not listed in the usage text */
+	else if (!strcmp(*argv, "-I"))	n= doIRQ(argc, argv, mpu);
+	else if (!strcmp(*argv, "-l"))	n= doLoad(argc, argv, mpu);
+	else if (!strcmp(*argv, "-M"))	n= doMtrap(argc, argv, mpu);
+	else if (!strcmp(*argv, "-mc")) n= doMode(M6502_ModeCompiled);
+	else if (!strcmp(*argv, "-mh")) n= doMode(M6502_ModeHybrid);
+	else if (!strcmp(*argv, "-mi")) n= doMode(M6502_ModeInterpreted);
+	else if (!strcmp(*argv, "-mx")) n= doMaxInsns(argc, argv, mpu);
+	else if (!strcmp(*argv, "-N"))	n= doNMI(argc, argv, mpu);
+	else if (!strcmp(*argv, "-P"))	n= doPtrap(argc, argv, mpu);
+	else if (!strcmp(*argv, "-R"))	n= doRST(argc, argv, mpu);
+	else if (!strcmp(*argv, "-s"))	n= doSave(argc, argv, mpu);
+	else if (!strcmp(*argv, "-v"))	n= doVersion(argc, argv, mpu);
+	else if (!strcmp(*argv, "-X"))	n= doXtrap(argc, argv, mpu);
+	else if (!strcmp(*argv, "-x"))	exit(0);
+	else if ('-' == **argv)		usage(1);	/* any other option is an error */
+	else
+	  {
+	    /* doBtraps() left 0x8000+0x4000 in bank 0, so load */
+	    /* additional images starting at 15 and work down */
+	    static int bankSel= 0x0F;
+	    if (!bTraps)			usage(1);	/* bare image names are only valid after -B */
+	    if (bankSel < 0)			fail("too many images");
+	    if (!load(mpu, 0x8000, argv[0]))	pfail(argv[0]);
+	    memcpy(bank[bankSel--],
+		   0x8000 + mpu->memory,
+		   0x4000);
+	    n= 0;
+	  }
+	argc -= n;	/* skip the operands the handler reported as consumed */
+	argv += n;
+      }
+
+  M6502_setMode(mpu, mode, max_insns);	/* applies whatever -mc/-mh/-mi/-mx recorded */
+
+  if (bTraps)
+    doBtraps(0, 0, mpu);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);
+
+  return 0;
+}
diff --git a/test/addr-wrap-1.mst b/test/addr-wrap-1.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/addr-wrap-1.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/addr-wrap-1.xa b/test/addr-wrap-1.xa
new file mode 100644
index 0000000..a49e9d4
--- /dev/null
+++ b/test/addr-wrap-1.xa
@@ -0,0 +1,25 @@
+#include "config.xa"
+
+	LDA #1 ; plant a 1 at three zero page addresses
+	STA $00
+	STA $05
+	STA $0A
+	LDY #$80 ; Y runs from $80 up to $FF
+	CLC
+	LDA #0 ; running total
+LOOP
+	ADC $FF80,Y ; $FF80+Y is past $FFFF, so the effective address should wrap to $0000-$007F
+	INY
+	BNE LOOP
+	CMP #3 ; the three planted bytes (assumes the rest of $00-$7F reads as zero)
+	BNE FAIL
+
+SUCCESS
+	LDA #'Y'
+	JSR OSWRCH
+	JMP QUIT
+
+FAIL
+	LDA #'N'
+	JSR OSWRCH
+	JMP QUIT
diff --git a/test/basic-callback.c b/test/basic-callback.c
new file mode 100644
index 0000000..d2ffb27
--- /dev/null
+++ b/test/basic-callback.c
@@ -0,0 +1,122 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+int done(M6502 *mpu, uint16_t address, uint8_t data)	/* hook that reports the CPU state and ends the test */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\nBRK instruction: address %04X opcode %02X\n%s\n", address, data, state);
+  exit(0);	/* never returns, hence no return statement */
+}
+
+int call(M6502 *mpu, uint16_t address, uint8_t data)	/* 'call' hook: log the event and the CPU state */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\ncall: address %04X opcode %02X\n%s\n", address, data, state);
+  return 0;
+}
+
+int rd(M6502 *mpu, uint16_t address, uint8_t data)	/* 'read' hook: log the access; 0 is the value handed back */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\nrd: address %04X opcode %02X\n%s\n", address, data, state);
+  return 0;
+}
+
+int wr(M6502 *mpu, uint16_t address, uint8_t data)	/* 'write' hook: log the access and the CPU state */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\nwr: address %04X opcode %02X\n%s\n", address, data, state);
+  return 0;
+}
+
+int ill(M6502 *mpu, uint16_t address, uint8_t data)	/* illegal-opcode hook: log opcode, memory byte and CPU state */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\nill: address %04X opcode %02X memory %02X\n%s\n", address, data, mpu->memory[address], state);
+  return 0;
+}
+
+int main(int argc, char *argv[])
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+
+  unsigned  pc  = 0x1000;	/* emit cursor used by the gen macros below */
+
+  M6502_setCallback(mpu, call,                     0, done);
+  M6502_setCallback(mpu, call,                0x2000, call);
+  M6502_setCallback(mpu, call,                0x3000, call);
+  M6502_setCallback(mpu, call,                0x4000, call);
+  M6502_setCallback(mpu, read,                0x5000, rd  );
+  M6502_setCallback(mpu, write,               0x5000, wr  );
+  M6502_setCallback(mpu, illegal_instruction,   0x13, ill );
+  M6502_setCallback(mpu, illegal_instruction,   0x44, ill );
+  M6502_setCallback(mpu, illegal_instruction,   0x5c, ill );
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))
+# define gen2(X,Y)	gen1(X); gen1(Y)
+# define gen3(X,Y,Z)	gen1(X); gen2(Y,Z)
+
+  gen1(0x13          );
+  gen1(0x44          );
+  gen1(0x13          ); // not executed, 0x44 is a two-byte illegal instruction
+  gen1(0x5C          );
+  gen1(0x13          ); // not executed, 0x5C is a three-byte illegal instruction
+  gen1(0x13          ); // not executed, 0x5C is a three-byte illegal instruction
+  gen3(0x20,0x00,0x20); // JSR &2000
+  gen3(0xad,0x00,0x50); // LDA &5000
+  gen2(0x64,0x70     ); // STZ &70
+  gen2(0xa9,0x50     ); // LDA #&50
+  gen2(0x85,0x71     ); // STA &71
+  gen2(0xb2,0x70     ); // LDA (&70)
+  gen2(0x92,0x70     ); // STA (&70)
+  gen2(0x00,0x00     ); // BRK
+
+  pc = 0x2000;
+  gen3(0x8d,0x00,0x50); // STA &5000
+  gen3(0x4c,0x00,0x30); // JMP &3000
+
+  pc = 0x3000;
+  gen2(0xa9,0x00     ); // LDA #0
+  gen3(0x8d,0x76,0x32); // STA &3276
+  gen2(0xa9,0x40     ); // LDA #&40
+  gen3(0x8d,0x77,0x32); // STA &3277
+  gen3(0x6c,0x76,0x32); // JMP (&3276)
+
+  pc = 0x4000;
+  gen1(0x60          ); // RTS
+
+  M6502_setVector(mpu, RST, 0x1000);	/* execution starts at the code generated above */
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/test/basic-callback.mst b/test/basic-callback.mst
new file mode 100644
index 0000000..2c713d3
--- /dev/null
+++ b/test/basic-callback.mst
@@ -0,0 +1,33 @@
+
+ill: address 1000 opcode 13 memory 13
+PC=1001 SP=0100 A=00 X=00 Y=00 P=04 -----I--
+
+ill: address 1001 opcode 44 memory 44
+PC=1003 SP=0100 A=00 X=00 Y=00 P=04 -----I--
+
+ill: address 1003 opcode 5C memory 5C
+PC=1006 SP=0100 A=00 X=00 Y=00 P=04 -----I--
+
+call: address 2000 opcode 20
+PC=1009 SP=01FE A=00 X=00 Y=00 P=04 -----I--
+
+wr: address 5000 opcode 00
+PC=1009 SP=01FE A=00 X=00 Y=00 P=04 -----I--
+
+call: address 3000 opcode 4C
+PC=3000 SP=01FE A=00 X=00 Y=00 P=04 -----I--
+
+call: address 4000 opcode 6C
+PC=4000 SP=01FE A=40 X=00 Y=00 P=04 -----I--
+
+rd: address 5000 opcode 00
+PC=4000 SP=01FE A=40 X=00 Y=00 P=04 -----I--
+
+rd: address 5000 opcode 00
+PC=4000 SP=01FE A=40 X=00 Y=00 P=04 -----I--
+
+wr: address 5000 opcode 00
+PC=4000 SP=01FE A=40 X=00 Y=00 P=04 -----I--
+
+BRK instruction: address 1016 opcode 00
+PC=1018 SP=01FD A=00 X=00 Y=00 P=06 -----IZ-
diff --git a/test/call-illegal-callback-modify-code.c b/test/call-illegal-callback-modify-code.c
new file mode 100644
index 0000000..bf5ec76
--- /dev/null
+++ b/test/call-illegal-callback-modify-code.c
@@ -0,0 +1,121 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+static uint16_t call_modify1_addr;	/* set in main() to the operand byte of an LDA #; call() adds 1 to it */
+static uint16_t call_modify2_addr;	/* likewise; call() adds 2 to it */
+static uint16_t ill_modify1_addr;	/* likewise; ill() adds 1 to it */
+static uint16_t ill_modify2_addr;	/* likewise; ill() adds 2 to it */
+
+int done(M6502 *mpu, uint16_t address, uint8_t data)	/* hook that reports the CPU state and ends the test */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\nBRK instruction: address %04X opcode %02X\n%s\n", address, data, state);
+  exit(0);	/* never returns, hence no return statement */
+}
+
+int call(M6502 *mpu, uint16_t address, uint8_t data)	/* 'call' hook: log, then patch two LDA operands in the 6502 code */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\ncall: address %04X opcode %02X\n%s\n", address, data, state);
+  mpu->memory[call_modify2_addr] += 2;	/* the two targets are distinct bytes, so the order is irrelevant */
+  mpu->memory[call_modify1_addr] += 1;
+  return 0;
+}
+
+int ill(M6502 *mpu, uint16_t address, uint8_t data)	/* illegal-opcode hook: log, then patch two LDA operands in the 6502 code */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\nill: address %04X opcode %02X memory %02X\n%s\n", address, data, mpu->memory[address], state);
+  mpu->memory[ill_modify2_addr] += 2;	/* the two targets are distinct bytes, so the order is irrelevant */
+  mpu->memory[ill_modify1_addr] += 1;
+  return 0;
+}
+
+int oswrch(M6502 *mpu, uint16_t address, uint8_t data)	/* 'call' hook installed at 0xffee by main() */
+{
+  putchar(mpu->registers->a);	/* print the character held in the accumulator */
+  mpu->memory[0xffee] = 0x60; // RTS
+  return 0;
+}
+
+int main(int argc, char *argv[])
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+
+  unsigned  pc  = 0x1000;	/* emit cursor used by the gen macros below */
+
+  M6502_setCallback(mpu, call,                     0, done  );
+  M6502_setCallback(mpu, call,                0x2000, call  );
+  M6502_setCallback(mpu, illegal_instruction,   0x13, ill   );
+  M6502_setCallback(mpu, call,                0xffee, oswrch);
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))
+# define gen2(X,Y)	gen1(X); gen1(Y)
+# define gen3(X,Y,Z)	gen1(X); gen2(Y,Z)
+
+  gen3(0x20,0x00,0x30); // JSR &3000
+  gen1(0x13          ); // ill &13
+  gen3(0x20,0x00,0x30); // JSR &3000
+  gen1(0x13          ); // ill &13
+  gen3(0x20,0x00,0x30); // JSR &3000
+  gen3(0x20,0x00,0x20); // JSR &2000
+  gen3(0x20,0x00,0x30); // JSR &3000
+  gen3(0x20,0x00,0x20); // JSR &2000
+  gen3(0x20,0x00,0x30); // JSR &3000
+  gen2(0x00,0x00     ); // BRK
+
+  pc = 0x2000;
+  gen1(0x60          ); // RTS
+
+  pc = 0x3000;	/* routine that prints five characters and a newline */
+  gen2(0xa9,'C'      ); // LDA #'C'
+  gen3(0x20,0xee,0xff); // JSR &FFEE
+  call_modify1_addr = pc + 1;	/* pc + 1 is the operand byte of the LDA emitted next */
+  gen2(0xa9,'A'      ); // LDA #'A'
+  gen3(0x20,0xee,0xff); // JSR &FFEE
+  call_modify2_addr = pc + 1;
+  gen2(0xa9,'A'      ); // LDA #'A'
+  gen3(0x20,0xee,0xff); // JSR &FFEE
+  ill_modify1_addr = pc + 1;
+  gen2(0xa9,'A'      ); // LDA #'A'
+  gen3(0x20,0xee,0xff); // JSR &FFEE
+  ill_modify2_addr = pc + 1;
+  gen2(0xa9,'A'      ); // LDA #'A'
+  gen3(0x20,0xee,0xff); // JSR &FFEE
+  gen2(0xa9,'\n'     ); // LDA #'\n'
+  gen3(0x20,0xee,0xff); // JSR &FFEE
+  gen1(0x60          ); // RTS
+
+  M6502_setVector(mpu, RST, 0x1000);	/* execution starts at the code generated above */
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/test/call-illegal-callback-modify-code.mst b/test/call-illegal-callback-modify-code.mst
new file mode 100644
index 0000000..cc5acff
--- /dev/null
+++ b/test/call-illegal-callback-modify-code.mst
@@ -0,0 +1,20 @@
+CAAAA
+
+ill: address 1003 opcode 13 memory 13
+PC=1004 SP=0100 A=0A X=00 Y=00 P=04 -----I--
+CAABC
+
+ill: address 1007 opcode 13 memory 13
+PC=1008 SP=0100 A=0A X=00 Y=00 P=04 -----I--
+CAACE
+
+call: address 2000 opcode 20
+PC=100E SP=01FE A=0A X=00 Y=00 P=04 -----I--
+CBCCE
+
+call: address 2000 opcode 20
+PC=1014 SP=01FE A=0A X=00 Y=00 P=04 -----I--
+CCECE
+
+BRK instruction: address 1017 opcode 00
+PC=1019 SP=01FD A=0A X=00 Y=00 P=04 -----I--
diff --git a/test/config.xa b/test/config.xa
new file mode 100644
index 0000000..a7e0560
--- /dev/null
+++ b/test/config.xa
@@ -0,0 +1,4 @@
+OSWRCH = $FFEE
+QUIT = $F000
+
+*= $1E00
diff --git a/test/interleave.mst b/test/interleave.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/interleave.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/interleave.xa b/test/interleave.xa
new file mode 100644
index 0000000..8fb5ee0
--- /dev/null
+++ b/test/interleave.xa
@@ -0,0 +1,38 @@
+#include "config.xa"
+
+	JSR SETX10 ; enters at the first LDX and expects the later ones to be skipped
+	CPX #10
+	BNE FAIL
+	JSR SETX30
+	CPX #30
+	BNE FAIL
+	JSR SETX20 ; enters in the middle of the byte stream below
+	CPX #20
+	BNE FAIL
+	JSR SETX30
+	CPX #30
+	BNE FAIL
+	JSR SETX10
+	CPX #10
+	BNE FAIL
+	JSR SETX20
+	CPX #20
+	BNE FAIL
+
+SUCCESS
+	LDA #'Y'
+	JSR OSWRCH
+	JMP QUIT
+
+FAIL
+	LDA #'N'
+	JSR OSWRCH
+	JMP QUIT
+
+; example taken from http://www.6502.org/tutorials/6502opcodes.html
+SETX10	LDX #10
+	.byte $2C ; opcode of BIT abs, which takes the following two-byte LDX as its operand
+SETX20	LDX #20
+	.byte $2C ; same trick again, hiding LDX #30 from the two entry points above
+SETX30	LDX #30
+	RTS
diff --git a/test/irq-nmi.c b/test/irq-nmi.c
new file mode 100644
index 0000000..ae95352
--- /dev/null
+++ b/test/irq-nmi.c
@@ -0,0 +1,116 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+int brk(M6502 *mpu, uint16_t address, uint8_t data)	/* hook that reports the CPU state and ends the test */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\nBRK: address %04X opcode %02X\n%s\n", address, data, state);
+  exit(0);	/* never returns, hence no return statement */
+}
+
+int ill(M6502 *mpu, uint16_t address, uint8_t data)	/* illegal-opcode hook: log, then raise the interrupt the opcode stands for */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\nill: address %04X opcode %02X memory %02X\n%s\n", address, data, mpu->memory[address], state);
+  switch (data)
+  {
+    case 0x03:	/* this test uses opcode 0x03 to request an NMI */
+      M6502_nmi(mpu);
+      break;
+    case 0x13:	/* ... and opcode 0x13 to request an IRQ */
+      M6502_irq(mpu);
+      break;
+  }
+  return 0;
+}
+
+int oswrch(M6502 *mpu, uint16_t address, uint8_t data)	/* 'call' hook installed at 0xffee by main() */
+{
+  putchar(mpu->registers->a);	/* print the character held in the accumulator */
+  mpu->memory[0xffee] = 0x60; // RTS
+  return 0;
+}
+
+
+int main(int argc, char *argv[])
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+
+  unsigned  pc  = 0x1000;	/* emit cursor used by the gen macros below */
+
+  /* 0x3000 is the IRQ/BRK vector, but call callbacks don't trigger on
+   * interrupts, so this is only called on BRK.
+   */
+  M6502_setCallback(mpu, call,                0x3000, brk   );
+
+  M6502_setCallback(mpu, illegal_instruction,   0x03, ill   );
+  M6502_setCallback(mpu, illegal_instruction,   0x13, ill   );
+  M6502_setCallback(mpu, call,                0xffee, oswrch);
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))
+# define gen2(X,Y)	gen1(X); gen1(Y)
+# define gen3(X,Y,Z)	gen1(X); gen2(Y,Z)
+
+  gen1(0x58          ); // CLI
+  gen2(0xa9,'A'      ); // LDA #'A'
+  gen3(0x20,0xee,0xff); // JSR &ffee
+  gen1(0x03          ); // NMI
+  gen2(0xa9,'B'      ); // LDA #'B'
+  gen3(0x20,0xee,0xff); // JSR &ffee
+  gen1(0x13          ); // IRQ
+  gen2(0xa9,'C'      ); // LDA #'C'
+  gen3(0x20,0xee,0xff); // JSR &ffee
+  gen1(0x78          ); // SEI
+  gen1(0x13          ); // IRQ (ignored)
+  gen1(0x03          ); // NMI
+  gen1(0x13          ); // IRQ (ignored)
+  gen2(0xa9,'D'      ); // LDA #'D'
+  gen3(0x20,0xee,0xff); // JSR &ffee
+  gen1(0x58          ); // CLI
+  gen1(0x13          ); // IRQ
+  gen2(0x00,0x00     ); // BRK
+
+  pc = 0x2000;	/* NMI handler (see the NMI vector below) */
+  gen2(0xa9,'N'      ); // LDA #'N'
+  gen3(0x20,0xee,0xff); // JSR &ffee
+  gen1(0x40          ); // RTI
+
+  pc = 0x3000;	/* IRQ/BRK handler (see the IRQ vector below) */
+  gen2(0xa9,'I'      ); // LDA #'I'
+  gen3(0x20,0xee,0xff); // JSR &ffee
+  gen1(0x40          ); // RTI
+
+  M6502_setVector(mpu, RST, 0x1000);
+  M6502_setVector(mpu, NMI, 0x2000);
+  M6502_setVector(mpu, IRQ, 0x3000);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/test/irq-nmi.mst b/test/irq-nmi.mst
new file mode 100644
index 0000000..bf7d32b
--- /dev/null
+++ b/test/irq-nmi.mst
@@ -0,0 +1,21 @@
+A
+ill: address 1006 opcode 03 memory 03
+PC=1007 SP=0100 A=41 X=00 Y=00 P=00 --------
+NB
+ill: address 100C opcode 13 memory 13
+PC=100D SP=0100 A=42 X=00 Y=00 P=00 --------
+IC
+ill: address 1013 opcode 13 memory 13
+PC=1014 SP=0100 A=43 X=00 Y=00 P=04 -----I--
+
+ill: address 1014 opcode 03 memory 03
+PC=1015 SP=0100 A=43 X=00 Y=00 P=04 -----I--
+N
+ill: address 1015 opcode 13 memory 13
+PC=1016 SP=0100 A=4E X=00 Y=00 P=04 -----I--
+D
+ill: address 101C opcode 13 memory 13
+PC=101D SP=0100 A=44 X=00 Y=00 P=00 --------
+I
+BRK: address 101D opcode 00
+PC=101F SP=01FD A=49 X=00 Y=00 P=04 -----I--
diff --git a/test/pc-wrap-1.mst b/test/pc-wrap-1.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/pc-wrap-1.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/pc-wrap-1.xa b/test/pc-wrap-1.xa
new file mode 100644
index 0000000..c703803
--- /dev/null
+++ b/test/pc-wrap-1.xa
@@ -0,0 +1,28 @@
+#include "config.xa"
+
+; It's not important this is self-modifying code, this is just the easiest way
+; to get code at the relevant addresses without fighting with the assembler and
+; the fact run6502 will clobber the top of memory to set up various vectors.
+
+	LDA #$A9 ; LDA #n
+	STA $FFFE ; first LDA occupies the last two bytes of memory
+	STA $00 ; second LDA sits at $0000, reached only if the PC wraps
+	LDA #'N'
+	STA $FFFF ; operand of the first LDA
+	LDA #'Y'
+	STA $01 ; operand of the second LDA, so Y is what should get printed
+
+	LDA #$20 ; JSR abs
+	STA $02
+	LDA #$EE
+	STA $03
+	LDA #$FF
+	STA $04
+	LDA #$4C ; JMP abs
+	STA $05
+	LDA #<QUIT
+	STA $06
+	LDA #>QUIT
+	STA $07
+
+	JMP $FFFE ; run the planted code, starting just below the top of memory
diff --git a/test/pc-wrap-2.mst b/test/pc-wrap-2.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/pc-wrap-2.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/pc-wrap-2.xa b/test/pc-wrap-2.xa
new file mode 100644
index 0000000..c70763f
--- /dev/null
+++ b/test/pc-wrap-2.xa
@@ -0,0 +1,28 @@
+#include "config.xa"
+
+; It's not important this is self-modifying code, this is just the easiest way
+; to get code at the relevant addresses without fighting with the assembler and
+; the fact run6502 will clobber the top of memory to set up various vectors.
+
+	LDA #$A9 ; LDA #n
+	STA $FFFD ; first LDA occupies $FFFD and $FFFE
+	STA $FFFF ; second LDA has its opcode in the very last byte of memory
+	LDA #'N'
+	STA $FFFE ; operand of the first LDA
+	LDA #'Y'
+	STA $00 ; operand of the second LDA, fetched across the wrap from $FFFF to $0000
+
+	LDA #$20 ; JSR abs
+	STA $01
+	LDA #$EE
+	STA $02
+	LDA #$FF
+	STA $03
+	LDA #$4C ; JMP abs
+	STA $04
+	LDA #<QUIT
+	STA $05
+	LDA #>QUIT
+	STA $06
+
+	JMP $FFFD ; run the planted code, starting just below the top of memory
diff --git a/test/run-c-tests.py b/test/run-c-tests.py
new file mode 100755
index 0000000..b4a628e
--- /dev/null
+++ b/test/run-c-tests.py
@@ -0,0 +1,33 @@
+#!/usr/bin/python
+
+from __future__ import print_function
+import subprocess
+
+tests = [
+    'basic-callback',
+    'call-illegal-callback-modify-code',
+    'irq-nmi',
+    'setjmp-trick',
+    'stack-code-brk',
+    'stack-code-jsr',
+    'write-callback-modify-code'
+]
+
+test_args = [
+    '-mi',
+    '-mh',
+    '-mc -mx 1',
+    '-mc'
+]
+        
+print('1..', len(tests) * len(test_args), sep='')  # TAP plan line
+i = 1
+for test_arg in test_args:
+    for test in tests:
+        try:  # a test binary that crashes or is missing must yield "not ok", not abort the TAP stream
+            result = subprocess.check_output(['test/' + test] + test_arg.split())
+        except (subprocess.CalledProcessError, OSError):
+            result = None  # never equal to the expected bytes
+        expected_result = open('test/' + test + '.mst', 'rb').read()
+        print('ok' if result == expected_result else 'not ok', i, test, test_arg)
+        i += 1
diff --git a/test/run-c-tests.sh b/test/run-c-tests.sh
new file mode 100755
index 0000000..7c60f3c
--- /dev/null
+++ b/test/run-c-tests.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+python test/run-c-tests.py
diff --git a/test/run-run6502-tests.py b/test/run-run6502-tests.py
new file mode 100755
index 0000000..378989e
--- /dev/null
+++ b/test/run-run6502-tests.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+
+from __future__ import print_function
+import glob
+import os
+import subprocess
+
+os.chdir('test')
+
+# It's quite likely the "xa" assembler is not installed; don't generate
+# scary test failures if that's the case.
+xa_installed = True
+try:
+    result = subprocess.check_output(['xa', '--version'])
+    if b'xa65' not in result:  # some other program called xa
+        xa_installed = False
+except (OSError, subprocess.CalledProcessError):  # xa missing or it rejected --version; Ctrl-C is no longer swallowed
+    xa_installed = False
+
+# By default we skip slow tests (those with names starting z-) in '-mc'
+# modes.
+skip_slow_mc = (os.getenv('RUN_SLOW_TESTS', '0') == '0')
+
+# Since we didn't have to hard-code the test names in the Makefile.am, we
+# use wildcards here.
+tests = sorted([t for t in glob.glob('*.xa') if t != 'config.xa'])
+
+test_args = [
+    '-mi',
+    '-mh',
+    '-mc -mx 1',
+    '-mc'
+]
+        
+print('1..', len(tests) * len(test_args), sep='')  # TAP plan line
+i = 0
+for test_arg in test_args:
+    for test in tests:
+        i += 1
+        basename = test[0:-3]  # strip the '.xa' suffix
+
+        if not xa_installed:
+            print('ok', i, '# skipped (xa not installed):', test, test_arg)
+            continue
+
+        if skip_slow_mc and basename[0:2] == 'z-' and test_arg[0:3] == '-mc':
+            print('ok', i, '# skipped (slow -mc):', test, test_arg)
+            continue
+
+        xa_out = basename + '.mc'
+        try:  # an assembler or emulator failure must yield "not ok", not abort the TAP stream
+            subprocess.check_call(['xa', '-o', xa_out, test])
+            result = subprocess.check_output(
+                ['../run6502', '-l', '1e00', xa_out, '-R', '1e00', '-G', 'ffe0',
+                 '-P', 'ffee', '-X', 'f000'] + test_arg.split())
+        except (subprocess.CalledProcessError, OSError):
+            result = None  # never equal to the expected bytes
+        expected_result = open(basename + '.mst', 'rb').read()
+        print('ok' if result == expected_result else 'not ok', i, test, test_arg)
diff --git a/test/run-run6502-tests.sh b/test/run-run6502-tests.sh
new file mode 100755
index 0000000..c0e21dd
--- /dev/null
+++ b/test/run-run6502-tests.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+python test/run-run6502-tests.py
diff --git a/test/setjmp-trick.c b/test/setjmp-trick.c
new file mode 100644
index 0000000..f363d2e
--- /dev/null
+++ b/test/setjmp-trick.c
@@ -0,0 +1,125 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <setjmp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+static jmp_buf env;
+
+int done(M6502 *mpu, uint16_t address, uint8_t data)	/* hook that reports the CPU state and jumps back to main() */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\nBRK instruction: address %04X opcode %02X\n%s\n", address, data, state);
+  longjmp(env, 1);	/* main() treats the value 1 as its signal to stop */
+  exit(0);	/* not reached */
+}
+
+int call(M6502 *mpu, uint16_t address, uint8_t data)	/* 'call' hook: log, then leave the emulator via longjmp */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\ncall: address %04X opcode %02X\n%s\n", address, data, state);
+  mpu->registers->pc = address;	/* stored before jumping out, so the next M6502_run() has this PC */
+  longjmp(env, 2);
+  return 0;	/* not reached */
+}
+
+int ill(M6502 *mpu, uint16_t address, uint8_t data)	/* illegal-opcode hook: log, then leave the emulator via longjmp */
+{
+  char state[64];	/* text form of the registers, filled in by M6502_dump_masked() */
+  M6502_dump_masked(mpu, state);
+  printf("\nill: address %04X opcode %02X memory %02X\n%s\n", address, data, mpu->memory[address], state);
+  longjmp(env, 3);
+  return 0;	/* not reached */
+}
+
+int main(int argc, char *argv[])
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+
+  unsigned  pc  = 0x1000;	/* emit cursor used by the gen macros below */
+
+  /* Read and write callbacks don't provide the correct, up-to-date CPU state
+   * in the M6502 object, so this trick is a non-starter with them.
+   */
+
+  M6502_setCallback(mpu, call,                     0, done);
+  M6502_setCallback(mpu, call,                0x2000, call);
+  M6502_setCallback(mpu, call,                0x3000, call);
+  M6502_setCallback(mpu, call,                0x4000, call);
+  M6502_setCallback(mpu, illegal_instruction,   0x13, ill );
+  M6502_setCallback(mpu, illegal_instruction,   0x44, ill );
+  M6502_setCallback(mpu, illegal_instruction,   0x5c, ill );
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))
+# define gen2(X,Y)	gen1(X); gen1(Y)
+# define gen3(X,Y,Z)	gen1(X); gen2(Y,Z)
+
+  gen1(0x13          );
+  gen1(0x44          );
+  gen1(0x13          ); // not executed, 0x44 is a two-byte illegal instruction
+  gen1(0x5C          );
+  gen1(0x13          ); // not executed, 0x5C is a three-byte illegal instruction
+  gen1(0x13          ); // not executed, 0x5C is a three-byte illegal instruction
+  gen3(0x20,0x00,0x20); // JSR &2000
+  gen3(0xad,0x00,0x50); // LDA &5000
+  gen2(0x00,0x00     ); // BRK
+
+  pc = 0x2000;
+  gen3(0x8d,0x00,0x50); // STA &5000
+  gen3(0x4c,0x00,0x30); // JMP &3000
+
+  pc = 0x3000;
+  gen2(0xa9,0x00     ); // LDA #0
+  gen3(0x8d,0x76,0x32); // STA &3276
+  gen2(0xa9,0x40     ); // LDA #&40
+  gen3(0x8d,0x77,0x32); // STA &3277
+  gen3(0x6c,0x76,0x32); // JMP (&3276)
+
+  pc = 0x4000;
+  gen1(0x60          ); // RTS
+
+  M6502_setVector(mpu, RST, 0x1000);
+
+  M6502_reset(mpu);
+  while (1)	/* re-enter the emulator each time a callback longjmp()s out of it */
+  {
+    volatile int result = setjmp(env);	/* NOTE(review): ISO C only defines setjmp() in a few contexts (e.g. a controlling expression); storing its value is widespread but not strictly conforming */
+    if (result == 0)
+    {
+    	M6502_run(mpu);
+    }
+    else
+    {
+      printf("\nsetjmp() returned %d\n", result);
+      if (result == 1)	/* 1 is what done() passes to longjmp() */
+      {
+	break;
+      }
+    }
+  }
+  M6502_delete(mpu);
+
+  return 0;
+}
diff --git a/test/setjmp-trick.mst b/test/setjmp-trick.mst
new file mode 100644
index 0000000..ac0bcd5
--- /dev/null
+++ b/test/setjmp-trick.mst
@@ -0,0 +1,35 @@
+
+ill: address 1000 opcode 13 memory 13
+PC=1001 SP=0100 A=00 X=00 Y=00 P=04 -----I--
+
+setjmp() returned 3
+
+ill: address 1001 opcode 44 memory 44
+PC=1003 SP=0100 A=00 X=00 Y=00 P=04 -----I--
+
+setjmp() returned 3
+
+ill: address 1003 opcode 5C memory 5C
+PC=1006 SP=0100 A=00 X=00 Y=00 P=04 -----I--
+
+setjmp() returned 3
+
+call: address 2000 opcode 20
+PC=1009 SP=01FE A=00 X=00 Y=00 P=04 -----I--
+
+setjmp() returned 2
+
+call: address 3000 opcode 4C
+PC=3000 SP=01FE A=00 X=00 Y=00 P=04 -----I--
+
+setjmp() returned 2
+
+call: address 4000 opcode 6C
+PC=4000 SP=01FE A=40 X=00 Y=00 P=04 -----I--
+
+setjmp() returned 2
+
+BRK instruction: address 100C opcode 00
+PC=100E SP=01FD A=00 X=00 Y=00 P=06 -----IZ-
+
+setjmp() returned 1
diff --git a/test/stack-code-brk.c b/test/stack-code-brk.c
new file mode 100644
index 0000000..8ac2b75
--- /dev/null
+++ b/test/stack-code-brk.c
@@ -0,0 +1,108 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+int done(M6502 *mpu, uint16_t address, uint8_t data) /* call callback registered on &F000: ends the test */
+{
+  exit(0); /* none of the callback arguments are used */
+}
+
+int oswrch(M6502 *mpu, uint16_t address, uint8_t data) /* call callback registered on &FFEE: print A */
+{
+  putchar(mpu->registers->a); // write the emulated accumulator to stdout as one character
+  mpu->memory[0xffee] = 0x60; // plant an RTS at &FFEE, presumably so the JSR that got here returns
+  return 0; // NOTE(review): presumably 0 means "carry on at the called address" -- confirm in lib6502 docs
+}
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))	/* emit one byte at pc, then advance pc */
+# define gen2(X,Y)	(gen1(X), gen1(Y))	/* single expression, so safe in an unbraced if/else */
+# define gen3(X,Y,Z)	(gen1(X), gen2(Y,Z))	/* comma operator keeps the bytes in X, Y, Z order */
+
+int main(int argc, char *argv[]) /* assembles and runs a 6502 program that executes bytes BRK pushed on the stack */
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+
+  unsigned  pc  = 0x1000; /* assembly cursor used by gen1/gen2/gen3 */
+  unsigned saved_pc;
+
+  M6502_setCallback(mpu, call,  0xf000, done  ); /* JMP &F000 ends the test */
+  M6502_setCallback(mpu, call,  0xffee, oswrch); /* JSR &FFEE prints A */
+
+  gen2(0xa2, 0xff      ); // LDX #&FF
+  gen1(0x9a            ); // TXS
+  gen2(0xa9, 'A'       ); // LDA #'A'
+
+  // LDA #'B' is 0xa9, 0x42. So if we execute a BRK at 0x42a7, it will
+  // push 0x42, 0xa9 and the flags onto the stack. Since the stack grows
+  // downwards those bytes will be in the right order for execution. We'll
+  // additionally push an LDX immediate opcode so we can "execute" the flags
+  // value. We can nearly force the flags to be whatever we like using PLP,
+  // although the BRK will set the B and X bits in the stacked value. We
+  // demonstrate this by explicitly masking off those bits in the values we
+  // force into the flags.
+  enum {
+    flagX= (1<<5),	/* unused   	 */
+    flagB= (1<<4) 	/* irq from brk  */
+  };
+  uint8_t mask = ~(flagX | flagB);
+  gen2(0xa0, '0' & mask); // LDY #('0' with B/X masked off)
+  gen1(0x5a            ); // PHY
+  gen1(0x28            ); // PLP
+  gen3(0x4c, 0xa7, 0x42); // JMP &42A7
+  pc = 0x42a7; // place the BRK so that the stacked return address bytes are &42, &A9
+  gen2(0x00, 0x00      ); // BRK
+  saved_pc = pc; // &42A9: assembly resumes here once the handler has been emitted
+  pc = 0x0; // BRK handler at &0000 (presumably where the unset BRK vector points -- confirm)
+  gen2(0xa9, 0xa2      ); // LDA #<LDX # opcode>
+  gen1(0x48            ); // PHA
+  gen3(0x4c, 0xfc, 0x01); // JMP &01FC: runs LDX #<flags>, LDA #'B' from the stack, then falls into &0200
+  pc = 0x200; // first address after the stacked bytes at &01FC-&01FF
+  gen3(0x20, 0xee, 0xff); // JSR &FFEE
+  gen1(0x8a            ); // TXA
+  gen3(0x20, 0xee, 0xff); // JSR &FFEE
+  gen1(0x68            ); // PLA (drops the LDX opcode the handler pushed)
+  gen1(0x40            ); // RTI
+  pc = saved_pc; // continue assembling at the address RTI returns to
+
+  // Let's do the same thing again, but this time code has already been
+  // executed from that address on the stack, so we're verifying the change
+  // is picked up. We do LDA #'C' this time, so we execute the BRK from
+  // 0x43a7.
+  gen2(0xa0, '1' & mask); // LDY #('1' with B/X masked off)
+  gen1(0x5a            ); // PHY
+  gen1(0x28            ); // PLP
+  gen3(0x4c, 0xa7, 0x43); // JMP &43A7
+  pc = 0x43a7;
+  gen2(0x00, 0x00      ); // BRK
+
+  gen3(0x4c, 0x00, 0xf0); // JMP &F000 (quit)
+
+  M6502_setVector(mpu, RST, 0x1000);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/test/stack-code-brk.mst b/test/stack-code-brk.mst
new file mode 100644
index 0000000..467dbb8
--- /dev/null
+++ b/test/stack-code-brk.mst
@@ -0,0 +1 @@
+B0C1
\ No newline at end of file
diff --git a/test/stack-code-jsr.c b/test/stack-code-jsr.c
new file mode 100644
index 0000000..5cac6bf
--- /dev/null
+++ b/test/stack-code-jsr.c
@@ -0,0 +1,90 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+int done(M6502 *mpu, uint16_t address, uint8_t data) /* call callback registered on address 0: report and exit */
+{
+  char buffer[64]; /* same size as M6502_dump_masked()'s char buffer[64] parameter */
+  M6502_dump_masked(mpu, buffer); /* register dump with the B and X flag bits masked off */
+  printf("\nBRK instruction: address %04X opcode %02X\n%s\n", address, data, buffer);
+  exit(0); /* the test ends here; main() never sees M6502_run() return */
+}
+
+int oswrch(M6502 *mpu, uint16_t address, uint8_t data) /* call callback registered on &FFEE: print A */
+{
+  putchar(mpu->registers->a); // write the emulated accumulator to stdout as one character
+  mpu->memory[0xffee] = 0x60; // plant an RTS at &FFEE, presumably so the JSR that got here returns
+  return 0; // NOTE(review): presumably 0 means "carry on at the called address" -- confirm in lib6502 docs
+}
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))	/* emit one byte at pc, then advance pc */
+# define gen2(X,Y)	(gen1(X), gen1(Y))	/* single expression, so safe in an unbraced if/else */
+# define gen3(X,Y,Z)	(gen1(X), gen2(Y,Z))	/* comma operator keeps the bytes in X, Y, Z order */
+
+int main(int argc, char *argv[]) /* assembles and runs a 6502 program that executes a JSR return address as code */
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+
+  unsigned  pc  = 0x1000; /* assembly cursor used by gen1/gen2/gen3 */
+  unsigned saved_pc;
+
+  M6502_setCallback(mpu, call,       0, done  ); /* reached via the final BRK; prints the dump and exits */
+  M6502_setCallback(mpu, call,  0xffee, oswrch); /* JSR &FFEE prints A */
+
+  gen2(0xa2, 0xff      ); // LDX #&FF
+  gen1(0x9a            ); // TXS
+  gen2(0xa9, 'A'       ); // LDA #'A'
+
+  // LDA #'B' is 0xa9, 0x42. So if we execute a JSR at 0x42a7, it will
+  // push 0x42 and then 0xa9 onto the stack. Since the stack grows downwards
+  // those bytes will be in the right order for execution.
+  gen3(0x4c, 0xa7, 0x42); // JMP &42A7
+  pc = 0x42a7; // place the JSR so that the stacked return address bytes are &42, &A9
+  gen3(0x20, 0x00, 0x30); // JSR &3000
+  saved_pc = pc; // &42AA: assembly resumes here once the subroutine has been emitted
+  pc = 0x3000;
+  gen3(0x4c, 0xfe, 0x01); // JMP &01FE: runs the stacked bytes as LDA #'B', then falls into &0200
+  pc = 0x200; // first address after the stacked bytes at &01FE-&01FF
+  gen3(0x20, 0xee, 0xff); // JSR &FFEE
+  gen1(0x60            ); // RTS (uses the same stacked bytes as the return address)
+  pc = saved_pc; // continue assembling at the address RTS returns to
+
+  // Let's do the same thing again, but this time code has already been
+  // executed from that address on the stack, so we're verifying the change
+  // is picked up. We do LDA #'C' this time, so we execute the JSR from
+  // 0x43a7.
+  gen3(0x4c, 0xa7, 0x43); // JMP &43A7
+  pc = 0x43a7;
+  gen3(0x20, 0x00, 0x30); // JSR &3000
+
+  gen2(0x00, 0x00      ); // BRK
+
+  M6502_setVector(mpu, RST, 0x1000);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/test/stack-code-jsr.mst b/test/stack-code-jsr.mst
new file mode 100644
index 0000000..62ee1a3
--- /dev/null
+++ b/test/stack-code-jsr.mst
@@ -0,0 +1,3 @@
+BC
+BRK instruction: address 43AA opcode 00
+PC=43AC SP=01FC A=43 X=FF Y=00 P=04 -----I--
diff --git a/test/test-utils.c b/test/test-utils.c
new file mode 100644
index 0000000..b17819c
--- /dev/null
+++ b/test/test-utils.c
@@ -0,0 +1,106 @@
+/* test-utils.c -- utility functions for C test programs */
+
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+/* Some of this code is copy-and-pasted from run6502.c, but there's not enough
+ * of it for me to want to complicate things even slightly by trying to share
+ * it, especially since this is test code and somewhat distinct. 
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lib6502.h"
+
+static const char *program= 0;
+static M6502_Mode mode= M6502_ModeHybrid;
+static int max_insns= 0; /* default */
+
+enum {
+  flagX= (1<<5),	/* unused   	 */
+  flagB= (1<<4) 	/* irq from brk  */
+};
+
+void fail(const char *fmt, ...) /* printf-style error message to stderr, then exit(1); does not return */
+{
+  va_list ap;
+  fflush(stdout); /* flush stdout before the error is written to stderr */
+  va_start(ap, fmt);
+  vfprintf(stderr, fmt, ap);
+  va_end(ap);
+  fprintf(stderr, "\n"); /* terminate the message line */
+  exit(1);
+}
+
+static void usage(int status) /* print the option summary to stderr, then exit(status) */
+{
+  fprintf(stderr,
+          "usage: %s [option ...]\n"
+          "  -h        -- help (print this message)\n"
+          "  -mc       -- use compiled emulation mode\n"
+          "  -mh       -- use hybrid emulation mode (default)\n"
+          "  -mi       -- use interpreted emulation mode\n"
+          "  -mx count -- maximum instructions to JIT (-mc/-mh)\n", program);
+  exit(status);
+}
+
+static int doMode(M6502_Mode m) /* remember the emulation mode selected on the command line */
+{
+  mode= m; /* applied later by parse_args() through M6502_setMode() */
+  return 0; /* no extra arguments consumed */
+}
+
+static int doMaxInsns(int argc, char **argv, M6502 *mpu) /* handle "-mx count"; argv[0] is "-mx", mpu is unused */
+{
+  if (argc < 2) usage(1); /* nothing follows -mx */
+  char *end;
+  long l= strtol(argv[1], &end, 10); /* strtol() returns long; keep its sign */
+  if (end == argv[1] || *end || l != (int) l) fail("bad number: %s", argv[1]); /* empty, trailing junk, or does not fit in int */
+  max_insns= (int) l;
+  return 1; /* one extra argument (the count) was consumed */
+}
+
+void parse_args(int argc, char *argv[], M6502 *mpu)
+{   /* Walk the command line, then apply the selected mode and limit to mpu. */
+    program= argv[0];
+    for (++argv, --argc; argc > 0; ++argv, --argc)
+    {
+        const char *opt= *argv;
+        int extra= 0; /* arguments consumed in addition to the option itself */
+        if      (strcmp(opt, "-h")  == 0) usage(0);
+        else if (strcmp(opt, "-mc") == 0) extra= doMode(M6502_ModeCompiled);
+        else if (strcmp(opt, "-mh") == 0) extra= doMode(M6502_ModeHybrid);
+        else if (strcmp(opt, "-mi") == 0) extra= doMode(M6502_ModeInterpreted);
+        else if (strcmp(opt, "-mx") == 0) extra= doMaxInsns(argc, argv, mpu);
+        else                              usage(1);
+        argc -= extra, argv += extra;
+    }
+    /* With no options given this passes the file-level defaults for mode and max_insns. */
+    M6502_setMode(mpu, mode, max_insns);
+}
+
+void M6502_dump_masked(M6502 *mpu, char buffer[64])
+{   /* M6502_dump() into buffer while the X and B bits of P are temporarily cleared. */
+    const uint8_t saved_flags = mpu->registers->p;
+    mpu->registers->p &= ~(flagX | flagB);
+    M6502_dump(mpu, buffer);
+    mpu->registers->p = saved_flags; /* put the real flags back */
+}
diff --git a/test/test-utils.h b/test/test-utils.h
new file mode 100644
index 0000000..5b15dd7
--- /dev/null
+++ b/test/test-utils.h
@@ -0,0 +1,30 @@
+/* test-utils.h -- utility functions for C test programs */
+
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef PARSEARGS_H
+#define PARSEARGS_H
+
+#include "lib6502.h"
+/* Parse the test-program options (-h, -mc, -mh, -mi, -mx count) and call M6502_setMode() on mpu. */
+void parse_args(int argc, char *argv[], M6502 *mpu);
+/* Fill buffer via M6502_dump() with the B and X flag bits of P temporarily cleared. */
+void M6502_dump_masked(M6502 *mpu, char buffer[64]);
+
+#endif
diff --git a/test/trivial-test.mst b/test/trivial-test.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/trivial-test.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/trivial-test.xa b/test/trivial-test.xa
new file mode 100644
index 0000000..1448a22
--- /dev/null
+++ b/test/trivial-test.xa
@@ -0,0 +1,5 @@
+#include "config.xa"
+
+	LDA #'Y'
+	JSR OSWRCH
+	JMP QUIT
diff --git a/test/write-callback-modify-code.c b/test/write-callback-modify-code.c
new file mode 100644
index 0000000..cb35317
--- /dev/null
+++ b/test/write-callback-modify-code.c
@@ -0,0 +1,100 @@
+/* Copyright (c) 2005 Ian Piumarta
+ * Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib6502.h"
+#include "test-utils.h"
+
+int done(M6502 *mpu, uint16_t address, uint8_t data) /* call callback registered on address 0: report and exit */
+{
+  char buffer[64]; /* same size as M6502_dump_masked()'s char buffer[64] parameter */
+  M6502_dump_masked(mpu, buffer); /* register dump with the B and X flag bits masked off */
+  printf("\nBRK instruction: address %04X opcode %02X\n%s\n", address, data, buffer);
+  exit(0); /* the test ends here; main() never sees M6502_run() return */
+}
+
+int oswrch(M6502 *mpu, uint16_t address, uint8_t data) /* call callback registered on &FFEE: print A */
+{
+  putchar(mpu->registers->a); // write the emulated accumulator to stdout as one character
+  mpu->memory[0xffee] = 0x60; // plant an RTS at &FFEE, presumably so the JSR that got here returns
+  return 0; // NOTE(review): presumably 0 means "carry on at the called address" -- confirm in lib6502 docs
+}
+
+# define gen1(X)	(mpu->memory[pc++]= (uint8_t)(X))	/* emit one byte at pc, then advance pc */
+# define gen2(X,Y)	(gen1(X), gen1(Y))	/* single expression, so safe in an unbraced if/else */
+# define gen3(X,Y,Z)	(gen1(X), gen2(Y,Z))	/* comma operator keeps the bytes in X, Y, Z order */
+
+int wr(M6502 *mpu, uint16_t address, uint8_t data) /* write callback registered on &0042: regenerates the code at &6000 */
+{
+    if (address != 0x42) /* only registered for &42, so any other address is a test failure */
+    {
+    	abort();
+    }
+
+    unsigned pc = 0x6000; /* the gen macros write through this local cursor */
+    gen2(0xa9, data);       // LDA #data (the byte the emulated code just stored)
+    gen3(0x4c, 0x00, 0x20); // JMP &2000
+    return 0; /* NOTE(review): meaning of a write callback's return value is not visible here -- confirm */
+}
+
+int main(int argc, char *argv[]) /* assembles and runs a program whose stores to &42 make wr() rewrite code at &6000 */
+{
+  M6502    *mpu = M6502_new(0, 0, 0);
+  parse_args(argc, argv, mpu);
+
+  unsigned  pc  = 0x1000; /* assembly cursor used by gen1/gen2/gen3 */
+
+  M6502_setCallback(mpu, call,      0, done); /* reached via the final BRK; prints the dump and exits */
+  M6502_setCallback(mpu, call, 0xffee, oswrch); /* JSR &FFEE prints A */
+  M6502_setCallback(mpu, write,  0x42, wr  ); /* every store to &42 regenerates "LDA #data; JMP &2000" at &6000 */
+
+  gen2(0xa9, '>'       ); // LDA #'>'
+  gen3(0x20, 0xee, 0xff); // JSR &FFEE
+  gen2(0xa2, 'A'       ); // LDX #'A'
+  gen3(0x8e, 0x42, 0x00); // STX &0042 (absolute addressing)
+  gen3(0x20, 0x00, 0x60); // JSR &6000 (code just rewritten by wr)
+  gen1(0xe8            ); // INX
+  gen2(0xe0, 'Z'+1     ); // CPX #('Z'+1)
+  gen2(0x90, 0xf5      ); // BCC to STX (-11 from &1012 = &1007)
+  // Same loop again, this time reaching &42 through zero page,Y addressing.
+  gen2(0xa0, 0x05      ); // LDY #&05
+  gen2(0xa9, '>'       ); // LDA #'>'
+  gen3(0x20, 0xee, 0xff); // JSR &FFEE
+  gen2(0xa2, 'A'       ); // LDX #'A'
+  gen2(0x96, 0x42-0x05 ); // STX (&42-&05),Y i.e. &3D+Y = &42
+  gen3(0x20, 0x00, 0x60); // JSR &6000
+  gen1(0xe8            ); // INX
+  gen2(0xe0, 'Z'+1     ); // CPX #('Z'+1)
+  gen2(0x90, 0xf6      ); // BCC to STX (-10 from &1025 = &101B)
+
+  gen2(0x00, 0x00      ); // BRK
+
+  pc = 0x2000; // target of the JMP that wr() generates at &6000
+  gen3(0x20, 0xee, 0xff); // JSR &FFEE
+  gen1(0x60            ); // RTS
+
+  M6502_setVector(mpu, RST, 0x1000);
+
+  M6502_reset(mpu);
+  M6502_run(mpu);
+  M6502_delete(mpu);	/* We never reach here, but what the hey. */
+
+  return 0;
+}
diff --git a/test/write-callback-modify-code.mst b/test/write-callback-modify-code.mst
new file mode 100644
index 0000000..65de187
--- /dev/null
+++ b/test/write-callback-modify-code.mst
@@ -0,0 +1,3 @@
+>ABCDEFGHIJKLMNOPQRSTUVWXYZ>ABCDEFGHIJKLMNOPQRSTUVWXYZ
+BRK instruction: address 1025 opcode 00
+PC=1027 SP=01FD A=5A X=5B Y=05 P=07 -----IZC
diff --git a/test/z-self-modify-1.mst b/test/z-self-modify-1.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/z-self-modify-1.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/z-self-modify-1.xa b/test/z-self-modify-1.xa
new file mode 100644
index 0000000..cdf31e2
--- /dev/null
+++ b/test/z-self-modify-1.xa
@@ -0,0 +1,94 @@
+; This test attempts to confirm that in hybrid mode, the JITted code is
+; discarded correctly if it's modified by the interpreter.
+
+#include "config.xa"
+
+COUNT1 = $71
+COUNT2 = $72
+COUNT3 = $73
+
+; We loop lots to get as much chance of a problem occurring as possible.
+	STZ COUNT1
+LOOP1
+	LDY #0
+LOOP2
+	LDX #0
+LOOP3
+
+; The heart of the test. We LDA #n, then CMP <address of n>. If the two don't
+; match we have a problem.
+LDAOP
+	LDA #3
+	CMP LDAOP+1
+	BNE FAIL
+
+; We now modify the LDA operand...
+	INC LDAOP+1
+
+; ... and occupy as much of the interpreter's time as possible while the JIT
+; thread picks up the modified version (if it's not working from the snapshot).
+; In reality we probably go round multiple times before the JIT completes.
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+
+; And round and round we go.
+	DEX
+	BNE LOOP3
+	DEY
+	BNE LOOP2
+	DEC COUNT1
+	BNE LOOP1
+
+OK
+	LDA #'Y'
+	JSR OSWRCH
+	JMP QUIT
+FAIL
+	LDA #'N'
+	JSR OSWRCH
+	JMP QUIT
diff --git a/test/z-self-modify-2.mst b/test/z-self-modify-2.mst
new file mode 100644
index 0000000..24de910
--- /dev/null
+++ b/test/z-self-modify-2.mst
@@ -0,0 +1 @@
+Y
\ No newline at end of file
diff --git a/test/z-self-modify-2.xa b/test/z-self-modify-2.xa
new file mode 100644
index 0000000..81d21c4
--- /dev/null
+++ b/test/z-self-modify-2.xa
@@ -0,0 +1,125 @@
+; This test attempts to confirm that a subtle potential bug in the hybrid JIT
+; implementation is not present.
+;
+; The potential problem is as follows:
+; - we decide to JIT some code
+; - we take a snapshot of memory
+; - we kick off a JIT thread which *works off the main memory array*, not the 
+;   snapshot
+; - in the meantime the interpreter executes some code which modifies the code
+;   being JITted before it is actually jitted.
+; - we JIT the modified version of the code
+; - the interpreter then executes some code which reverts the change (A)
+; - we decide to execute the JITted function. We check memory against the memory
+;   snapshot taken when we started JITting and find no differences in any 
+;   addresses which contain code, because of the previous step marked (A).
+; - boom, our JITted code is not doing what it should.
+;
+; The fix for this problem is simply to ensure that the JIT thread works off
+; the snapshot of memory taken when we launched the JIT thread. Note that even
+; if we fail to do this, self-modifying code which doesn't "undo" itself will
+; be noticed when we use the memory snapshot to decide if the JITted code is
+; still valid.
+;
+; This test case should execute correctly in all modes (of course), but in
+; hybrid mode it should *fail* if the implementation is temporarily changed to
+; JIT from mpu->memory and not memory_snapshot. At the time of writing it does.
+
+
+
+#include "config.xa"
+
+COUNT1 = $71
+COUNT2 = $72
+COUNT3 = $73
+
+; We loop lots to get as much chance of a problem occurring as possible.
+	STZ COUNT1
+LOOP1
+	LDY #0
+LOOP2
+	LDX #0
+LOOP3
+
+; The heart of the test. We LDA #n, then CMP <address of n>. If the two don't
+; match we have a problem.
+LDAOP
+	LDA #3
+	CMP LDAOP+1
+	BNE FAIL
+
+; We now modify the LDA operand...
+	INC LDAOP+1
+
+; ... and occupy as much of the interpreter's time as possible while the JIT
+; thread picks up the modified version (if it's not working from the snapshot).
+; In reality we probably go round multiple times before the JIT completes.
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+	NOP
+
+; We now put the operand back. Since we only switch from interpreting to JITting
+; on a control transfer, we know the transition will occur at a point when we've
+; put the operand back, which is helpful.
+	DEC LDAOP+1
+
+; And round and round we go.
+	DEX
+	BNE LOOP3
+	DEY
+	BNE LOOP2
+	DEC COUNT1
+	BNE LOOP1
+
+OK
+	LDA #'Y'
+	JSR OSWRCH
+	JMP QUIT
+FAIL
+	LDA #'N'
+	JSR OSWRCH
+	JMP QUIT
diff --git a/util.cpp b/util.cpp
new file mode 100644
index 0000000..dbcecab
--- /dev/null
+++ b/util.cpp
@@ -0,0 +1,57 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#include "util.h"
+
+#include <boost/thread/thread.hpp>
+#include <iostream>
+#include <stdio.h>
+
+boost::mutex log_mutex; // held by log() while it writes to std::cerr
+
+void log(const std::string &s) // write s followed by a newline to std::cerr, under log_mutex
+{
+    boost::mutex::scoped_lock scoped_lock(log_mutex); // unlocked again when scoped_lock is destroyed
+    std::cerr << s << std::endl;
+}
+
+void die(const char *s) // print s on its own line to stderr, then abort(); does not return
+{
+  fflush(stdout); // flush pending stdout output before the message and abort()
+  fprintf(stderr, "\n%s\n", s);
+  abort();
+}
+
+std::string spaces(int n) // 4 * n spaces; n <= 0 gives "" rather than converting to a huge size and throwing
+{
+    return std::string((n > 0) ? 4 * static_cast<std::string::size_type>(n) : 0, ' ');
+}
+
+std::string apply_prefix(const std::string &prefix, const std::string &s)
+{   // prefix + s, with prefix repeated after every '\n' that is not the last character of s
+    std::string out(prefix);
+    for (std::string::const_iterator it = s.begin(); it != s.end(); ++it)
+    {
+        out.push_back(*it);
+        if (((it + 1) != s.end()) && (*it == '\n'))
+        {
+            out += prefix;
+        }
+    }
+    return out;
+}
+
diff --git a/util.h b/util.h
new file mode 100644
index 0000000..c7967c6
--- /dev/null
+++ b/util.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2014 Steven Flintham
+ * 
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the 'Software'),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, provided that the above copyright notice(s) and this
+ * permission notice appear in all copies of the Software and that both the
+ * above copyright notice(s) and this permission notice appear in supporting
+ * documentation.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS'.  USE ENTIRELY AT YOUR OWN RISK.
+ */
+
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <assert.h>
+#include <iomanip>
+#include <sstream>
+#include <stdexcept>
+
+#include <boost/thread/mutex.hpp>
+#include <boost/thread/thread.hpp>
+
+#define CANT_HAPPEN(s) /* throw std::runtime_error carrying "<file>:<line>:<s>" */ \
+    do { \
+        std::stringstream stream; \
+        stream << __FILE__ << ":" << __LINE__ << ":" << s; /* s is streamed, so it may be a << chain */ \
+        throw std::runtime_error(stream.str()); \
+    } \
+    while (false)
+
+#ifdef LOG // TRACE(s) logs s through log(), each line prefixed with file:line and the thread id
+    #define TRACE(s) \
+        do { \
+            std::stringstream prefix; \
+            prefix << __FILE__ << ":" << __LINE__ << "\t" <<  \
+                      boost::this_thread::get_id() << "\t"; \
+            std::stringstream message; \
+            message << s; /* s is streamed, so it may be a << chain */ \
+            log(apply_prefix(prefix.str(), message.str())); /* prefix repeated on every line of the message */ \
+        } \
+        while (false)
+#else // LOG not defined: TRACE(s) expands to an empty statement and s is never evaluated
+    #define TRACE(s) \
+        do { \
+        } \
+        while (false)
+#endif
+
+// assert((x) == (y)) that still "uses" x when NDEBUG is set, avoiding spurious "unused variable" warnings.
+#ifndef NDEBUG
+    #define ASSERT_EQUAL(x, y) assert((x) == (y))
+#else
+    #define ASSERT_EQUAL(x, y) /* y is not evaluated; no trailing ';' so if/else around it still parses */ \
+        do { \
+            (void) (x); \
+        } \
+        while (0)
+#endif
+
+extern boost::mutex log_mutex; // defined in util.cpp; locked by log()
+void log(const std::string &s); // write s plus a newline to std::cerr while holding log_mutex
+void die(const char *s); // print s to stderr and abort()
+// Formatting helpers:
+std::string spaces(int n); // a string of 4 * n spaces
+std::string apply_prefix(const std::string &prefix, const std::string &s); // put prefix at the start of every line of s
+
+#endif
diff --git a/valgrind.h b/valgrind.h
new file mode 100644
index 0000000..222a58e
--- /dev/null
+++ b/valgrind.h
@@ -0,0 +1,4060 @@
+/* -*- c -*-
+   ----------------------------------------------------------------
+
+   Notice that the following BSD-style license applies to this one
+   file (valgrind.h) only.  The rest of Valgrind is licensed under the
+   terms of the GNU General Public License, version 2, unless
+   otherwise indicated.  See the COPYING file in the source
+   distribution for details.
+
+   ----------------------------------------------------------------
+
+   This file is part of Valgrind, a dynamic binary instrumentation
+   framework.
+
+   Copyright (C) 2000-2011 Julian Seward.  All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. The origin of this software must not be misrepresented; you must 
+      not claim that you wrote the original software.  If you use this 
+      software in a product, an acknowledgment in the product 
+      documentation would be appreciated but is not required.
+
+   3. Altered source versions must be plainly marked as such, and must
+      not be misrepresented as being the original software.
+
+   4. The name of the author may not be used to endorse or promote 
+      products derived from this software without specific prior written 
+      permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   ----------------------------------------------------------------
+
+   Notice that the above BSD-style license applies to this one file
+   (valgrind.h) only.  The entire rest of Valgrind is licensed under
+   the terms of the GNU General Public License, version 2.  See the
+   COPYING file in the source distribution for details.
+
+   ---------------------------------------------------------------- 
+*/
+
+
+/* This file is for inclusion into client (your!) code.
+
+   You can use these macros to manipulate and query Valgrind's 
+   execution inside your own programs.
+
+   The resulting executables will still run without Valgrind, just a
+   little bit more slowly than they otherwise would, but otherwise
+   unchanged.  When not running on valgrind, each client request
+   consumes very few (eg. 7) instructions, so the resulting performance
+   loss is negligible unless you plan to execute client requests
+   millions of times per second.  Nevertheless, if that is still a
+   problem, you can compile with the NVALGRIND symbol defined (gcc
+   -DNVALGRIND) so that client requests are not even compiled in.  */
+
+#ifndef __VALGRIND_H
+#define __VALGRIND_H
+
+
+/* ------------------------------------------------------------------ */
+/* VERSION NUMBER OF VALGRIND                                         */
+/* ------------------------------------------------------------------ */
+
+/* Specify Valgrind's version number, so that user code can
+   conditionally compile based on our version number.  Note that these
+   were introduced at version 3.6 and so do not exist in version 3.5
+   or earlier.  The recommended way to use them to check for "version
+   X.Y or later" is (eg)
+
+#if defined(__VALGRIND_MAJOR__) && defined(__VALGRIND_MINOR__)   \
+    && (__VALGRIND_MAJOR__ > 3                                   \
+        || (__VALGRIND_MAJOR__ == 3 && __VALGRIND_MINOR__ >= 6))
+*/
+#define __VALGRIND_MAJOR__    3
+#define __VALGRIND_MINOR__    6
+
+
+#include <stdarg.h>
+
+/* Nb: this file might be included in a file compiled with -ansi.  So
+   we can't use C++ style "//" comments nor the "asm" keyword (instead
+   use "__asm__"). */
+
+/* Derive some tags indicating what the target platform is.  Note
+   that in this file we're using the compiler's CPP symbols for
+   identifying architectures, which are different to the ones we use
+   within the rest of Valgrind.  Note, __powerpc__ is active for both
+   32 and 64-bit PPC, whereas __powerpc64__ is only active for the
+   latter (on Linux, that is).
+
+   Misc note: how to find out what's predefined in gcc by default:
+   gcc -Wp,-dM somefile.c
+*/
+#undef PLAT_x86_darwin
+#undef PLAT_amd64_darwin
+#undef PLAT_x86_win32
+#undef PLAT_x86_linux
+#undef PLAT_amd64_linux
+#undef PLAT_ppc32_linux
+#undef PLAT_ppc64_linux
+#undef PLAT_arm_linux
+#undef PLAT_s390x_linux
+
+
+#if defined(__APPLE__) && defined(__i386__)
+#  define PLAT_x86_darwin 1
+#elif defined(__APPLE__) && defined(__x86_64__)
+#  define PLAT_amd64_darwin 1
+#elif defined(__MINGW32__) || defined(__CYGWIN32__) \
+      || (defined(_WIN32) && defined(_M_IX86))
+#  define PLAT_x86_win32 1
+#elif defined(__linux__) && defined(__i386__)
+#  define PLAT_x86_linux 1
+#elif defined(__linux__) && defined(__x86_64__)
+#  define PLAT_amd64_linux 1
+#elif defined(__linux__) && defined(__powerpc__) && !defined(__powerpc64__)
+#  define PLAT_ppc32_linux 1
+#elif defined(__linux__) && defined(__powerpc__) && defined(__powerpc64__)
+#  define PLAT_ppc64_linux 1
+#elif defined(__linux__) && defined(__arm__)
+#  define PLAT_arm_linux 1
+#elif defined(__linux__) && defined(__s390__) && defined(__s390x__)
+#  define PLAT_s390x_linux 1
+#else
+/* If we're not compiling for our target platform, don't generate
+   any inline asms.  */
+#  if !defined(NVALGRIND)
+#    define NVALGRIND 1
+#  endif
+#endif
+
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE SPECIFICS for SPECIAL INSTRUCTIONS.  There is nothing */
+/* in here of use to end-users -- skip to the next section.           */
+/* ------------------------------------------------------------------ */
+
+/*
+ * VALGRIND_DO_CLIENT_REQUEST(): a statement that invokes a Valgrind client
+ * request. Accepts both pointers and integers as arguments.
+ *
+ * VALGRIND_DO_CLIENT_REQUEST_STMT(): a statement that invokes a Valgrind
+ * client request that does not return a value.
+
+ * VALGRIND_DO_CLIENT_REQUEST_EXPR(): a C expression that invokes a Valgrind
+ * client request and whose value equals the client request result.  Accepts
+ * both pointers and integers as arguments.  Note that such calls are not
+ * necessarily pure functions -- they may have side effects.
+ */
+
+#define VALGRIND_DO_CLIENT_REQUEST(_zzq_rlval, _zzq_default,            \
+                                   _zzq_request, _zzq_arg1, _zzq_arg2,  \
+                                   _zzq_arg3, _zzq_arg4, _zzq_arg5)     \
+  do { (_zzq_rlval) = VALGRIND_DO_CLIENT_REQUEST_EXPR((_zzq_default),   \
+                        (_zzq_request), (_zzq_arg1), (_zzq_arg2),       \
+                        (_zzq_arg3), (_zzq_arg4), (_zzq_arg5)); } while (0)
+
+#define VALGRIND_DO_CLIENT_REQUEST_STMT(_zzq_request, _zzq_arg1,        \
+                           _zzq_arg2,  _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+  do { (void) VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                        \
+                    (_zzq_request), (_zzq_arg1), (_zzq_arg2),           \
+                    (_zzq_arg3), (_zzq_arg4), (_zzq_arg5)); } while (0)
+
+#if defined(NVALGRIND)
+
+/* Define NVALGRIND to completely remove the Valgrind magic sequence
+   from the compiled code (analogous to NDEBUG's effect on assert()).
+   The request then expands to (_zzq_default) alone; the request code and the five args are dropped unevaluated. */
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+      (_zzq_default)
+
+#else  /* ! NVALGRIND */
+
+/* The following defines the magic code sequences which the JITter
+   spots and handles magically.  Don't look too closely at them as
+   they will rot your brain.
+
+   The assembly code sequences for all architectures are in this one
+   file.  This is because this file must be stand-alone, and we don't
+   want to have multiple files.
+
+   For VALGRIND_DO_CLIENT_REQUEST, we must ensure that the default
+   value gets put in the return slot, so that everything works when
+   this is executed not under Valgrind.  Args are passed in a memory
+   block, and so there's no intrinsic limit to the number that could
+   be passed, but it's currently five.
+   
+   The macro args are: 
+      _zzq_rlval    result lvalue
+      _zzq_default  default value (result returned when running on real CPU)
+      _zzq_request  request code
+      _zzq_arg1..5  request params
+
+   The other two macros are used to support function wrapping, and are
+   a lot simpler.  VALGRIND_GET_NR_CONTEXT returns the value of the
+   guest's NRADDR pseudo-register and whatever other information is
+   needed to safely call the original from the wrapper: on
+   ppc64-linux, the R2 value at the divert point is also needed.  This
+   information is abstracted into a user-visible type, OrigFn.
+
+   VALGRIND_CALL_NOREDIR_* behaves the same as the following on the
+   guest, but guarantees that the branch instruction will not be
+   redirected: x86: call *%eax, amd64: call *%rax, ppc32/ppc64:
+   branch-and-link-to-r11.  VALGRIND_CALL_NOREDIR is just text, not a
+   complete inline asm, since it needs to be combined with more magic
+   inline asm stuff to be useful.
+*/
+
+/* ------------------------- x86-{linux,darwin} ---------------- */
+
+#if defined(PLAT_x86_linux)  ||  defined(PLAT_x86_darwin)  \
+    ||  (defined(PLAT_x86_win32) && defined(__GNUC__))
+
+typedef
+   struct { 
+      unsigned int nraddr; /* guest NRADDR value, as stored by VALGRIND_GET_NR_CONTEXT */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* marker sequence: four left-rotates of %edi totalling 64 bits, so %edi ends up unchanged */ \
+                     "roll $3,  %%edi ; roll $13, %%edi\n\t"      \
+                     "roll $29, %%edi ; roll $19, %%edi\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+  __extension__ /* GNU statement expression; its value is the final _zzq_result */ \
+  ({volatile unsigned int _zzq_args[6]; /* [0] = request code, [1..5] = the five args, each cast to unsigned int */ \
+    volatile unsigned int _zzq_result;                            \
+    _zzq_args[0] = (unsigned int)(_zzq_request);                  \
+    _zzq_args[1] = (unsigned int)(_zzq_arg1);                     \
+    _zzq_args[2] = (unsigned int)(_zzq_arg2);                     \
+    _zzq_args[3] = (unsigned int)(_zzq_arg3);                     \
+    _zzq_args[4] = (unsigned int)(_zzq_arg4);                     \
+    _zzq_args[5] = (unsigned int)(_zzq_arg5);                     \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %EDX = client_request ( %EAX ) */         \
+                     "xchgl %%ebx,%%ebx"                          \
+                     : "=d" (_zzq_result) /* result is read back from %edx */ \
+                     : "a" (&_zzq_args[0]), "0" (_zzq_default) /* %eax = &_zzq_args[0]; %edx is preloaded with the default */ \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_result;                                                  \
+  })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) /* _zzq_rlval is an OrigFn lvalue; its nraddr field is overwritten */ \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    volatile unsigned int __addr;                                 \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %EAX = guest_NRADDR */                    \
+                     "xchgl %%ecx,%%ecx"                          \
+                     : "=a" (__addr) /* value is taken from %eax after the sequence */ \
+                     :                                            \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_CALL_NOREDIR_EAX /* asm string fragment only, not a complete asm statement */ \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* call-noredir *%EAX */                     \
+                     "xchgl %%edx,%%edx\n\t"
+#endif /* PLAT_x86_linux || PLAT_x86_darwin || (PLAT_x86_win32 && __GNUC__) */
+
+/* ------------------------- x86-Win32 ------------------------- */
+
+#if defined(PLAT_x86_win32) && !defined(__GNUC__)
+
+typedef
+   struct { 
+      unsigned int nraddr; /* guest NRADDR value, as stored by VALGRIND_GET_NR_CONTEXT */
+   }
+   OrigFn;
+
+#if defined(_MSC_VER)
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* marker sequence: four rotates of edi totalling 64 bits, so edi ends up unchanged */ \
+                     __asm rol edi, 3  __asm rol edi, 13          \
+                     __asm rol edi, 29 __asm rol edi, 19
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR( /* casts every argument to uintptr_t and forwards to the inline helper below */ \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+    valgrind_do_client_request_expr((uintptr_t)(_zzq_default),    \
+        (uintptr_t)(_zzq_request), (uintptr_t)(_zzq_arg1),        \
+        (uintptr_t)(_zzq_arg2), (uintptr_t)(_zzq_arg3),           \
+        (uintptr_t)(_zzq_arg4), (uintptr_t)(_zzq_arg5))
+/* Issues one client request: eax = address of the 6-slot arg block, edx = default; returns edx as left by the sequence. */
+static __inline uintptr_t
+valgrind_do_client_request_expr(uintptr_t _zzq_default, uintptr_t _zzq_request,
+                                uintptr_t _zzq_arg1, uintptr_t _zzq_arg2,
+                                uintptr_t _zzq_arg3, uintptr_t _zzq_arg4,
+                                uintptr_t _zzq_arg5)
+{
+    volatile uintptr_t _zzq_args[6];   /* [0] = request code, [1..5] = the five args */
+    volatile unsigned int _zzq_result; /* NOTE(review): unsigned int, while the function returns uintptr_t */
+    _zzq_args[0] = (uintptr_t)(_zzq_request);
+    _zzq_args[1] = (uintptr_t)(_zzq_arg1);
+    _zzq_args[2] = (uintptr_t)(_zzq_arg2);
+    _zzq_args[3] = (uintptr_t)(_zzq_arg3);
+    _zzq_args[4] = (uintptr_t)(_zzq_arg4);
+    _zzq_args[5] = (uintptr_t)(_zzq_arg5);
+    __asm { __asm lea eax, _zzq_args __asm mov edx, _zzq_default
+            __SPECIAL_INSTRUCTION_PREAMBLE
+            /* %EDX = client_request ( %EAX ) */
+            __asm xchg ebx,ebx
+            __asm mov _zzq_result, edx
+    }
+    return _zzq_result; /* the value copied out of edx above */
+}
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) /* _zzq_rlval is an OrigFn lvalue; its nraddr field is overwritten */ \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    volatile unsigned int __addr;                                 \
+    __asm { __SPECIAL_INSTRUCTION_PREAMBLE                        \
+            /* %EAX = guest_NRADDR */                             \
+            __asm xchg ecx,ecx                                    \
+            __asm mov __addr, eax                                 \
+    }                                                             \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_CALL_NOREDIR_EAX ERROR /* expands to the bare identifier ERROR; presumably so any use fails to compile -- TODO confirm */
+
+#else /* !_MSC_VER */
+#error Unsupported compiler.
+#endif /* _MSC_VER */
+
+#endif /* PLAT_x86_win32 && !__GNUC__ */
+
+/* ------------------------ amd64-{linux,darwin} --------------- */
+
+#if defined(PLAT_amd64_linux)  ||  defined(PLAT_amd64_darwin)
+
+typedef
+   struct { 
+      unsigned long long int nraddr; /* guest NRADDR value, as stored by VALGRIND_GET_NR_CONTEXT */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* marker sequence: four left-rotates of %rdi totalling 128 bits, so %rdi ends up unchanged */ \
+                     "rolq $3,  %%rdi ; rolq $13, %%rdi\n\t"      \
+                     "rolq $61, %%rdi ; rolq $51, %%rdi\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+    __extension__ /* GNU statement expression; its value is the final _zzq_result */ \
+    ({ volatile unsigned long long int _zzq_args[6]; /* [0] = request code, [1..5] = the five args */ \
+    volatile unsigned long long int _zzq_result;                  \
+    _zzq_args[0] = (unsigned long long int)(_zzq_request);        \
+    _zzq_args[1] = (unsigned long long int)(_zzq_arg1);           \
+    _zzq_args[2] = (unsigned long long int)(_zzq_arg2);           \
+    _zzq_args[3] = (unsigned long long int)(_zzq_arg3);           \
+    _zzq_args[4] = (unsigned long long int)(_zzq_arg4);           \
+    _zzq_args[5] = (unsigned long long int)(_zzq_arg5);           \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %RDX = client_request ( %RAX ) */         \
+                     "xchgq %%rbx,%%rbx"                          \
+                     : "=d" (_zzq_result) /* result is read back from %rdx */ \
+                     : "a" (&_zzq_args[0]), "0" (_zzq_default) /* %rax = &_zzq_args[0]; %rdx is preloaded with the default */ \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_result;                                                  \
+    })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) /* _zzq_rlval is an OrigFn lvalue; its nraddr field is overwritten */ \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    volatile unsigned long long int __addr;                       \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %RAX = guest_NRADDR */                    \
+                     "xchgq %%rcx,%%rcx"                          \
+                     : "=a" (__addr) /* value is taken from %rax after the sequence */ \
+                     :                                            \
+                     : "cc", "memory"                             \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_CALL_NOREDIR_RAX /* asm string fragment only, not a complete asm statement */ \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* call-noredir *%RAX */                     \
+                     "xchgq %%rdx,%%rdx\n\t"
+#endif /* PLAT_amd64_linux || PLAT_amd64_darwin */
+
+/* ------------------------ ppc32-linux ------------------------ */
+
+#if defined(PLAT_ppc32_linux)
+
+typedef
+   struct { 
+      unsigned int nraddr; /* guest NRADDR value, as stored by VALGRIND_GET_NR_CONTEXT */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* marker sequence: four rlwinm instructions operating on r0 */ \
+                     "rlwinm 0,0,3,0,0  ; rlwinm 0,0,13,0,0\n\t"  \
+                     "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+                                                                  \
+    __extension__ /* GNU statement expression; its value is the final _zzq_result */ \
+  ({         unsigned int  _zzq_args[6]; /* [0] = request code, [1..5] = the five args */ \
+             unsigned int  _zzq_result;                           \
+             unsigned int* _zzq_ptr;                              \
+    _zzq_args[0] = (unsigned int)(_zzq_request);                  \
+    _zzq_args[1] = (unsigned int)(_zzq_arg1);                     \
+    _zzq_args[2] = (unsigned int)(_zzq_arg2);                     \
+    _zzq_args[3] = (unsigned int)(_zzq_arg3);                     \
+    _zzq_args[4] = (unsigned int)(_zzq_arg4);                     \
+    _zzq_args[5] = (unsigned int)(_zzq_arg5);                     \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile("mr 3,%1\n\t" /*default*/                    \
+                     "mr 4,%2\n\t" /*ptr*/                        \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1\n\t"                               \
+                     "mr %0,3"     /*result*/                     \
+                     : "=b" (_zzq_result)                         \
+                     : "b" (_zzq_default), "b" (_zzq_ptr)         \
+                     : "cc", "memory", "r3", "r4"); /* r3 and r4 are written explicitly by the mr instructions above */ \
+    _zzq_result;                                                  \
+    })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) /* _zzq_rlval is an OrigFn lvalue; its nraddr field is overwritten */ \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    unsigned int __addr;                                          \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2\n\t"                               \
+                     "mr %0,3"  /* copy r3 into the output operand */ \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory", "r3"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 /* asm string fragment only, not a complete asm statement */ \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R11 */       \
+                     "or 3,3,3\n\t"
+#endif /* PLAT_ppc32_linux */
+
+/* ------------------------ ppc64-linux ------------------------ */
+
+#if defined(PLAT_ppc64_linux)
+
+typedef
+   struct { 
+      unsigned long long int nraddr; /* guest NRADDR value, as stored by VALGRIND_GET_NR_CONTEXT */
+      unsigned long long int r2;  /* guest NRADDR_GPR2 value (the tocptr), stored by the same macro */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* marker sequence: four left-rotates of r0 totalling 128 bits, so r0 ends up unchanged */ \
+                     "rotldi 0,0,3  ; rotldi 0,0,13\n\t"          \
+                     "rotldi 0,0,61 ; rotldi 0,0,51\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+                                                                  \
+  __extension__ /* GNU statement expression; its value is the final _zzq_result */ \
+  ({         unsigned long long int  _zzq_args[6]; /* [0] = request code, [1..5] = the five args */ \
+             unsigned long long int  _zzq_result;                 \
+             unsigned long long int* _zzq_ptr;                    \
+    _zzq_args[0] = (unsigned long long int)(_zzq_request);        \
+    _zzq_args[1] = (unsigned long long int)(_zzq_arg1);           \
+    _zzq_args[2] = (unsigned long long int)(_zzq_arg2);           \
+    _zzq_args[3] = (unsigned long long int)(_zzq_arg3);           \
+    _zzq_args[4] = (unsigned long long int)(_zzq_arg4);           \
+    _zzq_args[5] = (unsigned long long int)(_zzq_arg5);           \
+    _zzq_ptr = _zzq_args;                                         \
+    __asm__ volatile("mr 3,%1\n\t" /*default*/                    \
+                     "mr 4,%2\n\t" /*ptr*/                        \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = client_request ( %R4 ) */           \
+                     "or 1,1,1\n\t"                               \
+                     "mr %0,3"     /*result*/                     \
+                     : "=b" (_zzq_result)                         \
+                     : "b" (_zzq_default), "b" (_zzq_ptr)         \
+                     : "cc", "memory", "r3", "r4"); /* r3 and r4 are written explicitly by the mr instructions above */ \
+    _zzq_result;                                                  \
+  })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) /* _zzq_rlval is an OrigFn lvalue; both nraddr and r2 are overwritten */ \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    unsigned long long int __addr; /* scratch, reused for both requests below */ \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR */                     \
+                     "or 2,2,2\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory", "r3"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* %R3 = guest_NRADDR_GPR2 */                \
+                     "or 4,4,4\n\t"                               \
+                     "mr %0,3"                                    \
+                     : "=b" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory", "r3"                       \
+                    );                                            \
+    _zzq_orig->r2 = __addr;                                       \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 /* asm string fragment only, not a complete asm statement */ \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R11 */       \
+                     "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc64_linux */
+
+/* ------------------------- arm-linux ------------------------- */
+
+#if defined(PLAT_arm_linux)
+
+typedef
+   struct { 
+      unsigned int nraddr; /* guest NRADDR value, as stored by VALGRIND_GET_NR_CONTEXT */
+   }
+   OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* marker sequence: four right-rotates of r12 totalling 64 bits, so r12 ends up unchanged */ \
+            "mov r12, r12, ror #3  ; mov r12, r12, ror #13 \n\t"  \
+            "mov r12, r12, ror #29 ; mov r12, r12, ror #19 \n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                          \
+        _zzq_default, _zzq_request,                               \
+        _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+                                                                  \
+  __extension__ /* GNU statement expression; its value is the final _zzq_result */ \
+  ({volatile unsigned int  _zzq_args[6]; /* [0] = request code, [1..5] = the five args */ \
+    volatile unsigned int  _zzq_result;                           \
+    _zzq_args[0] = (unsigned int)(_zzq_request);                  \
+    _zzq_args[1] = (unsigned int)(_zzq_arg1);                     \
+    _zzq_args[2] = (unsigned int)(_zzq_arg2);                     \
+    _zzq_args[3] = (unsigned int)(_zzq_arg3);                     \
+    _zzq_args[4] = (unsigned int)(_zzq_arg4);                     \
+    _zzq_args[5] = (unsigned int)(_zzq_arg5);                     \
+    __asm__ volatile("mov r3, %1\n\t" /*default*/                 \
+                     "mov r4, %2\n\t" /*ptr*/                     \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* R3 = client_request ( R4 ) */             \
+                     "orr r10, r10, r10\n\t"                      \
+                     "mov %0, r3"     /*result*/                  \
+                     : "=r" (_zzq_result)                         \
+                     : "r" (_zzq_default), "r" (&_zzq_args[0])    \
+                     : "cc","memory", "r3", "r4"); /* r3 and r4 are written explicitly by the mov instructions above */ \
+    _zzq_result;                                                  \
+  })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) /* _zzq_rlval is an OrigFn lvalue; its nraddr field is overwritten */ \
+  { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+    unsigned int __addr;                                          \
+    __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* R3 = guest_NRADDR */                      \
+                     "orr r11, r11, r11\n\t"                      \
+                     "mov %0, r3"  /* copy r3 into the output operand */ \
+                     : "=r" (__addr)                              \
+                     :                                            \
+                     : "cc", "memory", "r3"                       \
+                    );                                            \
+    _zzq_orig->nraddr = __addr;                                   \
+  }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 /* asm string fragment only, not a complete asm statement */ \
+                     __SPECIAL_INSTRUCTION_PREAMBLE               \
+                     /* branch-and-link-to-noredir *%R4 */        \
+                     "orr r12, r12, r12\n\t"
+
+#endif /* PLAT_arm_linux */
+
+/* ------------------------ s390x-linux ------------------------ */
+
+#if defined(PLAT_s390x_linux)
+
+typedef
+  struct {
+     unsigned long long int nraddr; /* guest NRADDR value, as stored by VALGRIND_GET_NR_CONTEXT */
+  }
+  OrigFn;
+
+/* __SPECIAL_INSTRUCTION_PREAMBLE is used to identify Valgrind-specific
+ * code.  The detection is implemented in the platform-specific toIR.c
+ * (e.g. VEX/priv/guest_s390_decoder.c).
+ */
+#define __SPECIAL_INSTRUCTION_PREAMBLE /* four register-to-itself lr loads: r15, r1, r2, r3 */ \
+                     "lr 15,15\n\t"                              \
+                     "lr 1,1\n\t"                                \
+                     "lr 2,2\n\t"                                \
+                     "lr 3,3\n\t"
+
+#define __CLIENT_REQUEST_CODE "lr 2,2\n\t" /* follows the preamble in VALGRIND_DO_CLIENT_REQUEST_EXPR */
+#define __GET_NR_CONTEXT_CODE "lr 3,3\n\t" /* follows the preamble in VALGRIND_GET_NR_CONTEXT */
+#define __CALL_NO_REDIR_CODE  "lr 4,4\n\t" /* follows the preamble in VALGRIND_CALL_NOREDIR_R1 */
+
+#define VALGRIND_DO_CLIENT_REQUEST_EXPR(                         \
+       _zzq_default, _zzq_request,                               \
+       _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5)    \
+  __extension__ /* GNU statement expression; its value is the final _zzq_result */ \
+ ({volatile unsigned long long int _zzq_args[6]; /* [0] = request code, [1..5] = the five args */ \
+   volatile unsigned long long int _zzq_result;                  \
+   _zzq_args[0] = (unsigned long long int)(_zzq_request);        \
+   _zzq_args[1] = (unsigned long long int)(_zzq_arg1);           \
+   _zzq_args[2] = (unsigned long long int)(_zzq_arg2);           \
+   _zzq_args[3] = (unsigned long long int)(_zzq_arg3);           \
+   _zzq_args[4] = (unsigned long long int)(_zzq_arg4);           \
+   _zzq_args[5] = (unsigned long long int)(_zzq_arg5);           \
+   __asm__ volatile(/* r2 = args */                              \
+                    "lgr 2,%1\n\t"                               \
+                    /* r3 = default */                           \
+                    "lgr 3,%2\n\t"                               \
+                    __SPECIAL_INSTRUCTION_PREAMBLE               \
+                    __CLIENT_REQUEST_CODE                        \
+                    /* results = r3 */                           \
+                    "lgr %0, 3\n\t"                              \
+                    : "=d" (_zzq_result)                         \
+                    : "a" (&_zzq_args[0]), "0" (_zzq_default) /* %1 = &_zzq_args[0]; %2 = default, tied to the output operand */ \
+                    : "cc", "2", "3", "memory"                   \
+                   );                                            \
+   _zzq_result;                                                  \
+ })
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) /* _zzq_rlval is an OrigFn lvalue; its nraddr field is overwritten */ \
+ { volatile OrigFn* _zzq_orig = &(_zzq_rlval);                   \
+   volatile unsigned long long int __addr;                       \
+   __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE               \
+                    __GET_NR_CONTEXT_CODE                        \
+                    "lgr %0, 3\n\t" /* copy r3 into the output operand */ \
+                    : "=a" (__addr)                              \
+                    :                                            \
+                    : "cc", "3", "memory"                        \
+                   );                                            \
+   _zzq_orig->nraddr = __addr;                                   \
+ }
+
+#define VALGRIND_CALL_NOREDIR_R1 /* asm string fragment only, not a complete asm statement */ \
+                    __SPECIAL_INSTRUCTION_PREAMBLE               \
+                    __CALL_NO_REDIR_CODE
+
+#endif /* PLAT_s390x_linux */
+
+/* Insert assembly code for other platforms here... */
+
+#endif /* NVALGRIND */
+
+
+/* ------------------------------------------------------------------ */
+/* PLATFORM SPECIFICS for FUNCTION WRAPPING.  This is all very        */
+/* ugly.  It's the least-worst tradeoff I can think of.               */
+/* ------------------------------------------------------------------ */
+
+/* This section defines magic (a.k.a appalling-hack) macros for doing
+   guaranteed-no-redirection calls, so as to get from function
+   wrappers to the functions they are wrapping.  The whole point is to
+   construct standard call sequences, but to do the call itself with a
+   special no-redirect call pseudo-instruction that the JIT
+   understands and handles specially.  This section is long and
+   repetitious, and I can't see a way to make it shorter.
+
+   The naming scheme is as follows:
+
+      CALL_FN_{W,v}_{v,W,WW,WWW,WWWW,5W,6W,7W,etc}
+
+   'W' stands for "word" and 'v' for "void".  Hence there are
+   different macros for calling arity 0, 1, 2, 3, 4, etc, functions,
+   and for each, the possibility of returning a word-typed result, or
+   no result.
+*/
+
+/* Use these to write the name of your wrapper.  NOTE: duplicates
+   VG_WRAP_FUNCTION_Z{U,Z} in pub_tool_redir.h.  NOTE also: inserts
+   the default behaviour equivalence class tag "0000" into the name.
+   See pub_tool_redir.h for details -- normally you don't need to
+   think about this, though. */
+
+/* Use an extra level of macroisation so that the soname/fnname args are
+   fully macro-expanded before VG_CONCAT4 pastes the four pieces together. */
+#define VG_CONCAT4(_aa,_bb,_cc,_dd) _aa##_bb##_cc##_dd
+
+#define I_WRAP_SONAME_FNNAME_ZU(soname,fnname)                    \
+   VG_CONCAT4(_vgw00000ZU_,soname,_,fnname) /* -> _vgw00000ZU_<soname>_<fnname> */
+
+#define I_WRAP_SONAME_FNNAME_ZZ(soname,fnname)                    \
+   VG_CONCAT4(_vgw00000ZZ_,soname,_,fnname) /* -> _vgw00000ZZ_<soname>_<fnname> */
+
+/* Use this macro from within a wrapper function to collect the
+   context (address and possibly other info) of the original function.
+   Once you have that you can then use it in one of the CALL_FN_
+   macros.  _lval must be an lvalue of type OrigFn; this is a plain alias. */
+#define VALGRIND_GET_ORIG_FN(_lval)  VALGRIND_GET_NR_CONTEXT(_lval)
+
+/* Derivatives of the main macros below, for calling functions returning void.
+   Each forwards to the CALL_FN_W_* macro of the same arity and drops the result in a local _junk. */
+
+#define CALL_FN_v_v(fnptr)                                        \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_v(_junk,fnptr); } while (0)
+
+#define CALL_FN_v_W(fnptr, arg1)                                  \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_W(_junk,fnptr,arg1); } while (0)
+
+#define CALL_FN_v_WW(fnptr, arg1,arg2)                            \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_WW(_junk,fnptr,arg1,arg2); } while (0)
+
+#define CALL_FN_v_WWW(fnptr, arg1,arg2,arg3)                      \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_WWW(_junk,fnptr,arg1,arg2,arg3); } while (0)
+
+#define CALL_FN_v_WWWW(fnptr, arg1,arg2,arg3,arg4)                \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_WWWW(_junk,fnptr,arg1,arg2,arg3,arg4); } while (0)
+
+#define CALL_FN_v_5W(fnptr, arg1,arg2,arg3,arg4,arg5)             \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_5W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5); } while (0)
+
+#define CALL_FN_v_6W(fnptr, arg1,arg2,arg3,arg4,arg5,arg6)        \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_6W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5,arg6); } while (0)
+
+#define CALL_FN_v_7W(fnptr, arg1,arg2,arg3,arg4,arg5,arg6,arg7)   \
+   do { volatile unsigned long _junk;                             \
+        CALL_FN_W_7W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5,arg6,arg7); } while (0)
+
+/* ------------------------- x86-{linux,darwin} ---------------- */
+
+#if defined(PLAT_x86_linux)  ||  defined(PLAT_x86_darwin)
+
+/* These regs are trashed by the hidden call.  eax is deliberately not listed:
+   gcc can already see that it is written, and listing it as well causes gcc to bomb. */
+#define __CALLER_SAVED_REGS /*"eax"*/ "ecx", "edx"
+
+/* These CALL_FN_ macros assume that on x86-linux, sizeof(unsigned
+   long) == 4. */
+
+#define CALL_FN_W_v(lval, orig) /* no-arg call through the OrigFn 'orig'; the word result is cast and stored in lval */ \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1]; /* [0] = target address */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1) /* one-arg call through the OrigFn 'orig'; the word result is cast and stored in lval */ \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2]; /* [0] = target address, [1] = arg1 */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         "subl $12, %%esp\n\t"  /* 12 bytes of padding + one 4-byte push = the 16 released below */ \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $16, %%esp\n"                                      \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2) /* two-arg call through the OrigFn 'orig'; the word result is cast and stored in lval */ \
+   do {                                                           \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3]; /* [0] = target address, [1..2] = args */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         "subl $8, %%esp\n\t"  /* 8 bytes of padding + two 4-byte pushes = the 16 released below */ \
+         "pushl 8(%%eax)\n\t"  /* args are pushed last-first: arg2, then arg1 */ \
+         "pushl 4(%%eax)\n\t"                                     \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $16, %%esp\n"                                      \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do { /* x86: call orig(arg1..arg3); result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4]; /* [0] = target addr */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         "subl $4, %%esp\n\t"       /* pad 4 + 3*4 pushed = 16 */ \
+         "pushl 12(%%eax)\n\t"      /* arg3 */                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"       /* arg1, pushed last */       \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $16, %%esp\n"        /* drop pad + pushed args */  \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in eax */     \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do { /* x86: call orig(arg1..arg4); result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5]; /* [0] = target addr */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         "pushl 16(%%eax)\n\t"      /* arg4; 4*4 = 16, no pad */  \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"       /* arg1, pushed last */       \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $16, %%esp\n"        /* drop the pushed args */    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in eax */     \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do { /* x86: call orig(arg1..arg5); result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6]; /* [0] = target addr */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile(                                           \
+         "subl $12, %%esp\n\t"      /* pad 12 + 5*4 pushed = 32 */ \
+         "pushl 20(%%eax)\n\t"      /* arg5 */                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"       /* arg1, pushed last */       \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $32, %%esp\n"        /* drop pad + pushed args */  \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in eax */     \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* x86: call orig(arg1..arg6); result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7]; /* [0] = target addr */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         "subl $8, %%esp\n\t"       /* pad 8 + 6*4 pushed = 32 */ \
+         "pushl 24(%%eax)\n\t"      /* arg6 */                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"       /* arg1, pushed last */       \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $32, %%esp\n"        /* drop pad + pushed args */  \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in eax */     \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do { /* x86: call orig(arg1..arg7); result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8]; /* [0] = target addr */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile(                                           \
+         "subl $4, %%esp\n\t"       /* pad 4 + 7*4 pushed = 32 */ \
+         "pushl 28(%%eax)\n\t"      /* arg7 */                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"       /* arg1, pushed last */       \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $32, %%esp\n"        /* drop pad + pushed args */  \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in eax */     \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do { /* x86: call orig(arg1..arg8); result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9]; /* [0] = target addr */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         "pushl 32(%%eax)\n\t"      /* arg8; 8*4 = 32, no pad */  \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"       /* arg1, pushed last */       \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $32, %%esp\n"        /* drop the pushed args */    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in eax */     \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do { /* x86: call orig(arg1..arg9); result -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10]; /* [0] = target addr */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile(                                           \
+         "subl $12, %%esp\n\t"      /* pad 12 + 9*4 pushed = 48 */ \
+         "pushl 36(%%eax)\n\t"      /* arg9 */                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"       /* arg1, pushed last */       \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $48, %%esp\n"        /* drop pad + pushed args */  \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in eax */     \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do { /* x86: call orig(arg1..arg10); result -> lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11]; /* [0] = target addr */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      __asm__ volatile(                                           \
+         "subl $8, %%esp\n\t"       /* pad 8 + 10*4 pushed = 48 */ \
+         "pushl 40(%%eax)\n\t"      /* arg10 */                   \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"       /* arg1, pushed last */       \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $48, %%esp\n"        /* drop pad + pushed args */  \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in eax */     \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,       \
+                                  arg6,arg7,arg8,arg9,arg10,      \
+                                  arg11)                          \
+   do { /* x86: call orig(arg1..arg11); result -> lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12]; /* [0] = target addr */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      __asm__ volatile(                                           \
+         "subl $4, %%esp\n\t"       /* pad 4 + 11*4 pushed = 48 */ \
+         "pushl 44(%%eax)\n\t"      /* arg11 */                   \
+         "pushl 40(%%eax)\n\t"                                    \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"       /* arg1, pushed last */       \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $48, %%esp\n"        /* drop pad + pushed args */  \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in eax */     \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,       \
+                                  arg6,arg7,arg8,arg9,arg10,      \
+                                  arg11,arg12)                    \
+   do { /* x86: call orig(arg1..arg12); result -> lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13]; /* [0] = target addr */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      _argvec[12] = (unsigned long)(arg12);                       \
+      __asm__ volatile(                                           \
+         "pushl 48(%%eax)\n\t"      /* arg12; 12*4 = 48, no pad */ \
+         "pushl 44(%%eax)\n\t"                                    \
+         "pushl 40(%%eax)\n\t"                                    \
+         "pushl 36(%%eax)\n\t"                                    \
+         "pushl 32(%%eax)\n\t"                                    \
+         "pushl 28(%%eax)\n\t"                                    \
+         "pushl 24(%%eax)\n\t"                                    \
+         "pushl 20(%%eax)\n\t"                                    \
+         "pushl 16(%%eax)\n\t"                                    \
+         "pushl 12(%%eax)\n\t"                                    \
+         "pushl 8(%%eax)\n\t"                                     \
+         "pushl 4(%%eax)\n\t"       /* arg1, pushed last */       \
+         "movl (%%eax), %%eax\n\t"  /* target->%eax */            \
+         VALGRIND_CALL_NOREDIR_EAX                                \
+         "addl $48, %%esp\n"        /* drop the pushed args */    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in eax */     \
+   } while (0)
+
+#endif /* PLAT_x86_linux || PLAT_x86_darwin */
+
+/* ------------------------ amd64-{linux,darwin} --------------- */
+
+#if defined(PLAT_amd64_linux)  ||  defined(PLAT_amd64_darwin)
+
+/* ARGREGS: rdi rsi rdx rcx r8 r9 (the rest on stack in R-to-L order) */
+
+/* Regs trashed by the hidden call; rax is the "=a" output operand below. */
+#define __CALLER_SAVED_REGS /*"rax",*/ "rcx", "rdx", "rsi",       \
+                            "rdi", "r8", "r9", "r10", "r11"
+
+/* This is all pretty complex.  It's so as to make stack unwinding
+   work reliably.  See bug 243270.  The basic problem is the sub and
+   add of 128 of %rsp in all of the following macros.  If gcc believes
+   the CFA is in %rsp, then unwinding may fail, because what's at the
+   CFA is not what gcc "expected" when it constructs the CFIs for the
+   places where the macros are instantiated.
+
+   But we can't just add a CFI annotation to increase the CFA offset
+   by 128, to match the sub of 128 from %rsp, because we don't know
+   whether gcc has chosen %rsp as the CFA at that point, or whether it
+   has chosen some other register (eg, %rbp).  In the latter case,
+   adding a CFI annotation to change the CFA offset is simply wrong.
+
+   So the solution is to get hold of the CFA using
+   __builtin_dwarf_cfa(), put it in a known register, and add a
+   CFI annotation to say what the register is.  We choose %rbp for
+   this (perhaps perversely), because:
+
+   (1) %rbp is already subject to unwinding.  If a new register was
+       chosen then the unwinder would have to unwind it in all stack
+       traces, which is expensive, and
+
+   (2) %rbp is already subject to precise exception updates in the
+       JIT.  If a new register was chosen, we'd have to have precise
+       exceptions for it too, which reduces performance of the
+       generated code.
+
+   However .. one extra complication.  We can't just whack the result
+   of __builtin_dwarf_cfa() into %rbp and then add %rbp to the
+   list of trashed registers at the end of the inline assembly
+   fragments; gcc won't allow %rbp to appear in that list.  Hence
+   instead we need to stash %rbp in %r15 for the duration of the asm,
+   and say that %r15 is trashed instead.  gcc seems happy to go with
+   that.
+
+   Oh .. and this all needs to be conditionalised so that it is
+   unchanged from before this commit, when compiled with older gccs
+   that don't support __builtin_dwarf_cfa.  Furthermore, since
+   this header file is freestanding, it has to be independent of
+   config.h, and so the following conditionalisation cannot depend on
+   configure time checks.
+
+   Although it's not clear from
+   'defined(__GNUC__) && defined(__GCC_HAVE_DWARF2_CFI_ASM)',
+   this expression excludes Darwin.
+   .cfi directives in Darwin assembly appear to be completely
+   different and I haven't investigated how they work.
+
+   For even more entertainment value, note we have to use the
+   completely undocumented __builtin_dwarf_cfa(), which appears to
+   really compute the CFA, whereas __builtin_frame_address(0) claims
+   to but actually doesn't.  See
+   https://bugs.kde.org/show_bug.cgi?id=243270#c47
+*/
+#if defined(__GNUC__) && defined(__GCC_HAVE_DWARF2_CFI_ASM)
+#  define __FRAME_POINTER  /* extra "r" asm input: the CFA */      \
+      ,"r"(__builtin_dwarf_cfa())
+#  define VALGRIND_CFI_PROLOGUE                                   \
+      "movq %%rbp, %%r15\n\t"       /* stash %rbp in %r15 */      \
+      "movq %2, %%rbp\n\t"          /* %2 = __FRAME_POINTER input */ \
+      ".cfi_remember_state\n\t"                                   \
+      ".cfi_def_cfa rbp, 0\n\t"
+#  define VALGRIND_CFI_EPILOGUE                                   \
+      "movq %%r15, %%rbp\n\t"       /* put %rbp back */           \
+      ".cfi_restore_state\n\t"
+#else
+#  define __FRAME_POINTER        /* empty: no extra asm operand */
+#  define VALGRIND_CFI_PROLOGUE  /* empty: no CFI fix-up emitted */
+#  define VALGRIND_CFI_EPILOGUE  /* empty: no CFI fix-up emitted */
+#endif
+
+
+/* These CALL_FN_ macros assume that on amd64-linux, sizeof(unsigned
+   long) == 8. */
+
+/* NB 9 Sept 07.  There is a nasty kludge here in all these CALL_FN_
+   macros.  In order not to trash the stack redzone, we need to drop
+   %rsp by 128 before the hidden call, and restore afterwards.  The
+   nastiness is that it is only by luck that the stack still appears
+   to be unwindable during the hidden call - since then the behaviour
+   of any routine using this macro does not match what the CFI data
+   says.  Sigh.
+
+   Why is this important?  Imagine that a wrapper has a stack
+   allocated local, and passes to the hidden call, a pointer to it.
+   Because gcc does not know about the hidden call, it may allocate
+   that local in the redzone.  Unfortunately the hidden call may then
+   trash it before it comes to use it.  So we must step clear of the
+   redzone, for the duration of the hidden call, to make it safe.
+
+   Probably the same problem afflicts the other redzone-style ABIs too
+   (ppc64-linux); but for those, the stack is
+   self describing (none of this CFI nonsense) so at least messing
+   with the stack pointer doesn't give a danger of non-unwindable
+   stack. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do { /* amd64: call orig(); result -> lval */                  \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1]; /* [0] = target addr */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"      /* step over the redzone */   \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"      /* restore %rsp */            \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in rax */     \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do { /* amd64: call orig(arg1); result -> lval */              \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2]; /* [0] = target addr */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"      /* step over the redzone */   \
+         "movq 8(%%rax), %%rdi\n\t" /* arg1 */                    \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"      /* restore %rsp */            \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in rax */     \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do { /* amd64: call orig(arg1,arg2); result -> lval */         \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3]; /* [0] = target addr */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"      /* step over the redzone */   \
+         "movq 16(%%rax), %%rsi\n\t" /* arg2 */                   \
+         "movq 8(%%rax), %%rdi\n\t" /* arg1 */                    \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"      /* restore %rsp */            \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in rax */     \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do { /* amd64: call orig(arg1..arg3); result -> lval */        \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4]; /* [0] = target addr */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t"      /* step over the redzone */   \
+         "movq 24(%%rax), %%rdx\n\t" /* arg3 */                   \
+         "movq 16(%%rax), %%rsi\n\t" /* arg2 */                   \
+         "movq 8(%%rax), %%rdi\n\t" /* arg1 */                    \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"      /* restore %rsp */            \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res; /* value left in rax */     \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do { /* lval = result of calling orig.nraddr(arg1..arg4) */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5]; /* [0]=fn, rest=args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t" /* skip red zone, presumably */    \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4->%rcx */            \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3->%rdx */            \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2->%rsi */            \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1->%rdi */             \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo the subq above */         \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do { /* lval = result of calling orig.nraddr(arg1..arg5) */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6]; /* [0]=fn, rest=args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t" /* skip red zone, presumably */    \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5->%r8 */              \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4->%rcx */            \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3->%rdx */            \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2->%rsi */            \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1->%rdi */             \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo the subq above */         \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* lval = result of calling orig.nraddr(arg1..arg6) */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7]; /* [0]=fn, rest=args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t" /* skip red zone, presumably */    \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6->%r9 */              \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5->%r8 */              \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4->%rcx */            \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3->%rdx */            \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2->%rsi */            \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1->%rdi */             \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $128,%%rsp\n\t"  /* undo the subq above */         \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do { /* lval = result of calling orig.nraddr(arg1..arg7) */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8]; /* [0]=fn, rest=args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $136,%%rsp\n\t" /* 128+8: keeps rsp 16-aligned? */ \
+         "pushq 56(%%rax)\n\t"  /* arg7->stack */                 \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6->%r9 */              \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5->%r8 */              \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4->%rcx */            \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3->%rdx */            \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2->%rsi */            \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1->%rdi */             \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $8, %%rsp\n"  /* discard pushed arg7 */            \
+         "addq $136,%%rsp\n\t"  /* undo the subq above */         \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do { /* lval = result of calling orig.nraddr(arg1..arg8) */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9]; /* [0]=fn, rest=args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t" /* skip red zone, presumably */    \
+         "pushq 64(%%rax)\n\t"  /* arg8->stack */                 \
+         "pushq 56(%%rax)\n\t"  /* arg7->stack */                 \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6->%r9 */              \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5->%r8 */              \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4->%rcx */            \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3->%rdx */            \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2->%rsi */            \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1->%rdi */             \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $16, %%rsp\n"  /* discard pushed args */           \
+         "addq $128,%%rsp\n\t"  /* undo the subq above */         \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do { /* lval = result of calling orig.nraddr(arg1..arg9) */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10]; /* [0]=fn, rest=args */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $136,%%rsp\n\t" /* 128+8: keeps rsp 16-aligned? */ \
+         "pushq 72(%%rax)\n\t"  /* arg9->stack */                 \
+         "pushq 64(%%rax)\n\t"  /* arg8->stack */                 \
+         "pushq 56(%%rax)\n\t"  /* arg7->stack */                 \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6->%r9 */              \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5->%r8 */              \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4->%rcx */            \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3->%rdx */            \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2->%rsi */            \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1->%rdi */             \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $24, %%rsp\n"  /* discard pushed args */           \
+         "addq $136,%%rsp\n\t"  /* undo the subq above */         \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do { /* lval = result of calling orig.nraddr(arg1..arg10) */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11]; /* [0]=fn, rest=args */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t" /* skip red zone, presumably */    \
+         "pushq 80(%%rax)\n\t"  /* arg10->stack */                \
+         "pushq 72(%%rax)\n\t"  /* arg9->stack */                 \
+         "pushq 64(%%rax)\n\t"  /* arg8->stack */                 \
+         "pushq 56(%%rax)\n\t"  /* arg7->stack */                 \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6->%r9 */              \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5->%r8 */              \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4->%rcx */            \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3->%rdx */            \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2->%rsi */            \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1->%rdi */             \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $32, %%rsp\n"  /* discard pushed args */           \
+         "addq $128,%%rsp\n\t"  /* undo the subq above */         \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do { /* lval = result of calling orig.nraddr(arg1..arg11) */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12]; /* [0]=fn, rest=args */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $136,%%rsp\n\t" /* 128+8: keeps rsp 16-aligned? */ \
+         "pushq 88(%%rax)\n\t"  /* arg11->stack */                \
+         "pushq 80(%%rax)\n\t"  /* arg10->stack */                \
+         "pushq 72(%%rax)\n\t"  /* arg9->stack */                 \
+         "pushq 64(%%rax)\n\t"  /* arg8->stack */                 \
+         "pushq 56(%%rax)\n\t"  /* arg7->stack */                 \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6->%r9 */              \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5->%r8 */              \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4->%rcx */            \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3->%rdx */            \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2->%rsi */            \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1->%rdi */             \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $40, %%rsp\n"  /* discard pushed args */           \
+         "addq $136,%%rsp\n\t"  /* undo the subq above */         \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do { /* lval = result of calling orig.nraddr(arg1..arg12) */   \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13]; /* [0]=fn, rest=args */ \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      _argvec[12] = (unsigned long)(arg12);                       \
+      __asm__ volatile(                                           \
+         VALGRIND_CFI_PROLOGUE                                    \
+         "subq $128,%%rsp\n\t" /* skip red zone, presumably */    \
+         "pushq 96(%%rax)\n\t"  /* arg12->stack */                \
+         "pushq 88(%%rax)\n\t"  /* arg11->stack */                \
+         "pushq 80(%%rax)\n\t"  /* arg10->stack */                \
+         "pushq 72(%%rax)\n\t"  /* arg9->stack */                 \
+         "pushq 64(%%rax)\n\t"  /* arg8->stack */                 \
+         "pushq 56(%%rax)\n\t"  /* arg7->stack */                 \
+         "movq 48(%%rax), %%r9\n\t"  /* arg6->%r9 */              \
+         "movq 40(%%rax), %%r8\n\t"  /* arg5->%r8 */              \
+         "movq 32(%%rax), %%rcx\n\t"  /* arg4->%rcx */            \
+         "movq 24(%%rax), %%rdx\n\t"  /* arg3->%rdx */            \
+         "movq 16(%%rax), %%rsi\n\t"  /* arg2->%rsi */            \
+         "movq 8(%%rax), %%rdi\n\t"  /* arg1->%rdi */             \
+         "movq (%%rax), %%rax\n\t"  /* target->%rax */            \
+         VALGRIND_CALL_NOREDIR_RAX                                \
+         "addq $48, %%rsp\n"  /* discard pushed args */           \
+         "addq $128,%%rsp\n\t"  /* undo the subq above */         \
+         VALGRIND_CFI_EPILOGUE                                    \
+         : /*out*/   "=a" (_res)                                  \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS, "r15"   \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_amd64_linux || PLAT_amd64_darwin */
+
+/* ------------------------ ppc32-linux ------------------------ */
+
+#if defined(PLAT_ppc32_linux)
+
+/* This is useful for finding out about the on-stack stuff:
+
+   extern int f9  ( int,int,int,int,int,int,int,int,int );
+   extern int f10 ( int,int,int,int,int,int,int,int,int,int );
+   extern int f11 ( int,int,int,int,int,int,int,int,int,int,int );
+   extern int f12 ( int,int,int,int,int,int,int,int,int,int,int,int );
+
+   int g9 ( void ) {
+      return f9(11,22,33,44,55,66,77,88,99);
+   }
+   int g10 ( void ) {
+      return f10(11,22,33,44,55,66,77,88,99,110);
+   }
+   int g11 ( void ) {
+      return f11(11,22,33,44,55,66,77,88,99,110,121);
+   }
+   int g12 ( void ) {
+      return f12(11,22,33,44,55,66,77,88,99,110,121,132);
+   }
+*/
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS /* asm clobber-list fragment */       \
+   "lr", "ctr", "xer",                                            \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13"
+
+/* These CALL_FN_ macros assume that on ppc32-linux, 
+   sizeof(unsigned long) == 4. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do { /* lval = result of calling orig.nraddr() */              \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1]; /* [0]=fn */             \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"  /* &_argvec[0]->r11 */                   \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"  /* r3->_res */                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do { /* lval = result of calling orig.nraddr(arg1) */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2]; /* [0]=fn, rest=args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"  /* &_argvec[0]->r11 */                   \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"  /* r3->_res */                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do { /* lval = result of calling orig.nraddr(arg1,arg2) */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3]; /* [0]=fn, rest=args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"  /* &_argvec[0]->r11 */                   \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"  /* r3->_res */                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do { /* lval = result of calling orig.nraddr(arg1..arg3) */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4]; /* [0]=fn, rest=args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"  /* &_argvec[0]->r11 */                   \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"  /* r3->_res */                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do { /* lval = result of calling orig.nraddr(arg1..arg4) */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5]; /* [0]=fn, rest=args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"  /* &_argvec[0]->r11 */                   \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"  /* r3->_res */                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do { /* lval = result of calling orig.nraddr(arg1..arg5) */    \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6]; /* [0]=fn, rest=args */  \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"  /* &_argvec[0]->r11 */                   \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"   /* arg2->r4 */                       \
+         "lwz 5,12(11)\n\t"  /* arg3->r5 */                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"  /* arg5->r7 */                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"  /* r3->_res */                                \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* Call orig.nraddr with 6 word args; r3 -> lval. */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7]; /* [0]=target addr */    \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1); /* arg may be expr */   \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3 (result)->_res */              \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do { /* Call orig.nraddr with 7 word args; r3 -> lval. */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8]; /* [0]=target addr */    \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1); /* arg may be expr */   \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3 (result)->_res */              \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do { /* Call orig.nraddr with 8 word args; r3 -> lval. */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9]; /* [0]=target addr */    \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1); /* arg may be expr */   \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr %0,3"           /* r3 (result)->_res */              \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do { /* Call orig.nraddr with 9 word args; r3 -> lval. */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10]; /* [0]=target addr */   \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1); /* arg may be expr */   \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "addi 1,1,-16\n\t"  /* r1 -= 16: room for stack args */  \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"                                         \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,16\n\t"   /* r1 += 16: undo the adjustment */  \
+         "mr %0,3"           /* r3 (result)->_res */              \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do { /* Call orig.nraddr with 10 word args; r3 -> lval. */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11]; /* [0]=target addr */   \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1); /* arg may be expr */   \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "addi 1,1,-16\n\t"  /* r1 -= 16: room for stack args */  \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,12(1)\n\t"                                        \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"                                         \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,16\n\t"   /* r1 += 16: undo the adjustment */  \
+         "mr %0,3"           /* r3 (result)->_res */              \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do { /* Call orig.nraddr with 11 word args; r3 -> lval. */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12]; /* [0]=target addr */   \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1); /* arg may be expr */   \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "addi 1,1,-32\n\t"  /* r1 -= 32: room for stack args */  \
+         /* arg11 */                                              \
+         "lwz 3,44(11)\n\t"                                       \
+         "stw 3,16(1)\n\t"                                        \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,12(1)\n\t"                                        \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"                                         \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,32\n\t"   /* r1 += 32: undo the adjustment */  \
+         "mr %0,3"           /* r3 (result)->_res */              \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do { /* Call orig.nraddr with 12 word args; r3 -> lval. */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13]; /* [0]=target addr */   \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1); /* arg may be expr */   \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      _argvec[12] = (unsigned long)(arg12);                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"      /* r11 = &_argvec[0] */              \
+         "addi 1,1,-32\n\t"  /* r1 -= 32: room for stack args */  \
+         /* arg12 */                                              \
+         "lwz 3,48(11)\n\t"                                       \
+         "stw 3,20(1)\n\t"                                        \
+         /* arg11 */                                              \
+         "lwz 3,44(11)\n\t"                                       \
+         "stw 3,16(1)\n\t"                                        \
+         /* arg10 */                                              \
+         "lwz 3,40(11)\n\t"                                       \
+         "stw 3,12(1)\n\t"                                        \
+         /* arg9 */                                               \
+         "lwz 3,36(11)\n\t"                                       \
+         "stw 3,8(1)\n\t"                                         \
+         /* args1-8 */                                            \
+         "lwz 3,4(11)\n\t"   /* arg1->r3 */                       \
+         "lwz 4,8(11)\n\t"                                        \
+         "lwz 5,12(11)\n\t"                                       \
+         "lwz 6,16(11)\n\t"  /* arg4->r6 */                       \
+         "lwz 7,20(11)\n\t"                                       \
+         "lwz 8,24(11)\n\t"                                       \
+         "lwz 9,28(11)\n\t"                                       \
+         "lwz 10,32(11)\n\t" /* arg8->r10 */                      \
+         "lwz 11,0(11)\n\t"  /* target->r11 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "addi 1,1,32\n\t"   /* r1 += 32: undo the adjustment */  \
+         "mr %0,3"           /* r3 (result)->_res */              \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_ppc32_linux */
+
+/* ------------------------ ppc64-linux ------------------------ */
+
+#if defined(PLAT_ppc64_linux)
+
+/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */
+
+/* These regs are trashed by the hidden call (asm clobber list). */
+#define __CALLER_SAVED_REGS                                       \
+   "lr", "ctr", "xer",                                            \
+   "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",        \
+   "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",   \
+   "r11", "r12", "r13" /* r2 = tocptr, swapped around the call */
+
+/* These CALL_FN_ macros assume that on ppc64-linux, sizeof(unsigned
+   long) == 8. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do { /* Call orig.nraddr with no args; r3 -> lval. */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+0];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1] = (unsigned long)_orig.r2;                       \
+      _argvec[2] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result)->_res */             \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do { /* Call orig.nraddr with 1 word arg; r3 -> lval. */       \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+1];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1); /* arg may be expr */ \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result)->_res */             \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do { /* Call orig.nraddr with 2 word args; r3 -> lval. */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+2];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1); /* arg may be expr */ \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result)->_res */             \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do { /* Call orig.nraddr with 3 word args; r3 -> lval. */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+3];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1); /* arg may be expr */ \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      _argvec[2+3] = (unsigned long)(arg3);                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result)->_res */             \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do { /* Call orig.nraddr with 4 word args; r3 -> lval. */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+4];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1); /* arg may be expr */ \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      _argvec[2+3] = (unsigned long)(arg3);                       \
+      _argvec[2+4] = (unsigned long)(arg4);                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result)->_res */             \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do { /* Call orig.nraddr with 5 word args; r3 -> lval. */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+5];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)(arg1); /* arg may be expr */ \
+      _argvec[2+2] = (unsigned long)(arg2);                       \
+      _argvec[2+3] = (unsigned long)(arg3);                       \
+      _argvec[2+4] = (unsigned long)(arg4);                       \
+      _argvec[2+5] = (unsigned long)(arg5);                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result)->_res */             \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* call _orig.nraddr with 6 word args; r3 -> lval */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+6];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do { /* call _orig.nraddr with 7 word args; r3 -> lval */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+7];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do { /* call _orig.nraddr with 8 word args; r3 -> lval */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+8];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)" /* restore tocptr */                      \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do { /* call _orig.nraddr with 9 word args; r3 -> lval */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+9];                        \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-128\n\t"  /* expand stack frame */            \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"   /* r3 = _argvec[2+9] */             \
+         "std 3,112(1)\n\t"   /* -> stack slot 112(r1) */         \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,128"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do { /* call _orig.nraddr with 10 word args; r3 -> lval */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+10];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-128\n\t"  /* expand stack frame */            \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"   /* r3 = _argvec[2+10] */            \
+         "std 3,120(1)\n\t"   /* -> stack slot 120(r1) */         \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"   /* r3 = _argvec[2+9] */             \
+         "std 3,112(1)\n\t"   /* -> stack slot 112(r1) */         \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,128"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10,arg11)     \
+   do { /* call _orig.nraddr with 11 word args; r3 -> lval */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+11];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-144\n\t"  /* expand stack frame */            \
+         /* arg11 */                                              \
+         "ld  3,88(11)\n\t"   /* r3 = _argvec[2+11] */            \
+         "std 3,128(1)\n\t"   /* -> stack slot 128(r1) */         \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"   /* r3 = _argvec[2+10] */            \
+         "std 3,120(1)\n\t"   /* -> stack slot 120(r1) */         \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"   /* r3 = _argvec[2+9] */             \
+         "std 3,112(1)\n\t"   /* -> stack slot 112(r1) */         \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,144"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                arg7,arg8,arg9,arg10,arg11,arg12) \
+   do { /* call _orig.nraddr with 12 word args; r3 -> lval */     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3+12];                       \
+      volatile unsigned long _res;                                \
+      /* _argvec[0] holds current r2 across the call */           \
+      _argvec[1]   = (unsigned long)_orig.r2;                     \
+      _argvec[2]   = (unsigned long)_orig.nraddr;                 \
+      _argvec[2+1] = (unsigned long)arg1;                         \
+      _argvec[2+2] = (unsigned long)arg2;                         \
+      _argvec[2+3] = (unsigned long)arg3;                         \
+      _argvec[2+4] = (unsigned long)arg4;                         \
+      _argvec[2+5] = (unsigned long)arg5;                         \
+      _argvec[2+6] = (unsigned long)arg6;                         \
+      _argvec[2+7] = (unsigned long)arg7;                         \
+      _argvec[2+8] = (unsigned long)arg8;                         \
+      _argvec[2+9] = (unsigned long)arg9;                         \
+      _argvec[2+10] = (unsigned long)arg10;                       \
+      _argvec[2+11] = (unsigned long)arg11;                       \
+      _argvec[2+12] = (unsigned long)arg12;                       \
+      __asm__ volatile(                                           \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] */             \
+         "std 2,-16(11)\n\t"  /* save tocptr */                   \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */           \
+         "addi 1,1,-144\n\t"  /* expand stack frame */            \
+         /* arg12 */                                              \
+         "ld  3,96(11)\n\t"   /* r3 = _argvec[2+12] */            \
+         "std 3,136(1)\n\t"   /* -> stack slot 136(r1) */         \
+         /* arg11 */                                              \
+         "ld  3,88(11)\n\t"   /* r3 = _argvec[2+11] */            \
+         "std 3,128(1)\n\t"   /* -> stack slot 128(r1) */         \
+         /* arg10 */                                              \
+         "ld  3,80(11)\n\t"   /* r3 = _argvec[2+10] */            \
+         "std 3,120(1)\n\t"   /* -> stack slot 120(r1) */         \
+         /* arg9 */                                               \
+         "ld  3,72(11)\n\t"   /* r3 = _argvec[2+9] */             \
+         "std 3,112(1)\n\t"   /* -> stack slot 112(r1) */         \
+         /* args1-8 */                                            \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                      \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                      \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                      \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                      \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                      \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                      \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                      \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                     \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                   \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                  \
+         "mr 11,%1\n\t"       /* r11 = &_argvec[2] again */       \
+         "mr %0,3\n\t"        /* r3 (result) -> _res */           \
+         "ld 2,-16(11)\n\t" /* restore tocptr */                  \
+         "addi 1,1,144"     /* restore frame */                   \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "r" (&_argvec[2])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_ppc64_linux */
+
+/* ------------------------- arm-linux ------------------------- */
+
+#if defined(PLAT_arm_linux)
+
+/* Regs trashed by the hidden call: r0-r3 (args/result), r4 (target), r14 (lr). */
+#define __CALLER_SAVED_REGS "r0", "r1", "r2", "r3","r4","r14"
+
+/* These CALL_FN_ macros assume that on arm-linux, sizeof(unsigned
+   long) == 4. */
+
+#define CALL_FN_W_v(lval, orig)                                   \
+   do { /* call _orig.nraddr with no args; r0 -> lval */          \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[1];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      __asm__ volatile(                                           \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "mov %0, r0\n"       /* r0 (result) -> _res */           \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_W(lval, orig, arg1)                             \
+   do { /* call _orig.nraddr with 1 word arg; r0 -> lval */       \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[2];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #4] \n\t" /* arg1->r0 */                   \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "mov %0, r0\n"       /* r0 (result) -> _res */           \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory",  __CALLER_SAVED_REGS         \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1,arg2)                       \
+   do { /* call _orig.nraddr with 2 word args; r0 -> lval */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[3];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #4] \n\t" /* arg1->r0 */                   \
+         "ldr r1, [%1, #8] \n\t" /* arg2->r1 */                   \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "mov %0, r0\n"       /* r0 (result) -> _res */           \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3)                 \
+   do { /* call _orig.nraddr with 3 word args; r0 -> lval */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[4];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #4] \n\t" /* arg1->r0 */                   \
+         "ldr r1, [%1, #8] \n\t" /* arg2->r1 */                   \
+         "ldr r2, [%1, #12] \n\t" /* arg3->r2 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "mov %0, r0\n"       /* r0 (result) -> _res */           \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4)           \
+   do { /* call _orig.nraddr with 4 word args; r0 -> lval */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[5];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #4] \n\t" /* arg1->r0 */                   \
+         "ldr r1, [%1, #8] \n\t" /* arg2->r1 */                   \
+         "ldr r2, [%1, #12] \n\t" /* arg3->r2 */                  \
+         "ldr r3, [%1, #16] \n\t" /* arg4->r3 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "mov %0, r0"         /* r0 (result) -> _res */           \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5)        \
+   do { /* call _orig.nraddr with 5 word args; r0 -> lval */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[6];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #20] \n\t" /* arg5->r0 (temp) */           \
+         "push {r0} \n\t"     /* arg5 -> stack; sp -= 4 */        \
+         "ldr r0, [%1, #4] \n\t" /* arg1->r0 */                   \
+         "ldr r1, [%1, #8] \n\t" /* arg2->r1 */                   \
+         "ldr r2, [%1, #12] \n\t" /* arg3->r2 */                  \
+         "ldr r3, [%1, #16] \n\t" /* arg4->r3 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #4 \n\t" /* drop pushed arg5 */             \
+         "mov %0, r0"         /* r0 (result) -> _res */           \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      ); /* NOTE(review): 1-word push; sp 8-aligned at call? */   \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6)   \
+   do { /* call _orig.nraddr with 6 word args; r0 -> lval */      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[7];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #20] \n\t" /* arg5->r0 (temp) */           \
+         "ldr r1, [%1, #24] \n\t" /* arg6->r1 (temp) */           \
+         "push {r0, r1} \n\t" /* arg5,arg6 -> stack */            \
+         "ldr r0, [%1, #4] \n\t" /* arg1->r0 */                   \
+         "ldr r1, [%1, #8] \n\t" /* arg2->r1 */                   \
+         "ldr r2, [%1, #12] \n\t" /* arg3->r2 */                  \
+         "ldr r3, [%1, #16] \n\t" /* arg4->r3 */                  \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #8 \n\t" /* drop pushed args */             \
+         "mov %0, r0"         /* r0 (result) -> _res */           \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7)                            \
+   do {  /* 7 args: r0-r3 + 3 stack words */                      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[8];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #20] \n\t"  /* arg5 */                     \
+         "ldr r1, [%1, #24] \n\t"  /* arg6 */                     \
+         "ldr r2, [%1, #28] \n\t"  /* arg7 */                     \
+         "push {r0, r1, r2} \n\t"  /* args 5-7 */                 \
+         "ldr r0, [%1, #4] \n\t"  /* arg1 */                      \
+         "ldr r1, [%1, #8] \n\t"  /* arg2 */                      \
+         "ldr r2, [%1, #12] \n\t"  /* arg3 */                     \
+         "ldr r3, [%1, #16] \n\t"  /* arg4 */                     \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #12 \n\t"  /* drop stack args */            \
+         "mov %0, r0"  /* r0->_res */                             \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8)                       \
+   do {  /* 8 args: r0-r3 + 4 stack words */                      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[9];                          \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #20] \n\t"  /* arg5 */                     \
+         "ldr r1, [%1, #24] \n\t"  /* arg6 */                     \
+         "ldr r2, [%1, #28] \n\t"  /* arg7 */                     \
+         "ldr r3, [%1, #32] \n\t"  /* arg8 */                     \
+         "push {r0, r1, r2, r3} \n\t"  /* args 5-8 */             \
+         "ldr r0, [%1, #4] \n\t"  /* arg1 */                      \
+         "ldr r1, [%1, #8] \n\t"  /* arg2 */                      \
+         "ldr r2, [%1, #12] \n\t"  /* arg3 */                     \
+         "ldr r3, [%1, #16] \n\t"  /* arg4 */                     \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #16 \n\t"  /* drop stack args */            \
+         "mov %0, r0"  /* r0->_res */                             \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,   \
+                                 arg7,arg8,arg9)                  \
+   do {  /* 9 args: r0-r3 + 5 stack words */                      \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[10];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #20] \n\t"  /* arg5 */                     \
+         "ldr r1, [%1, #24] \n\t"  /* arg6 */                     \
+         "ldr r2, [%1, #28] \n\t"  /* arg7 */                     \
+         "ldr r3, [%1, #32] \n\t"  /* arg8 */                     \
+         "ldr r4, [%1, #36] \n\t"  /* arg9 */                     \
+         "push {r0, r1, r2, r3, r4} \n\t"  /* args 5-9 */         \
+         "ldr r0, [%1, #4] \n\t"  /* arg1 */                      \
+         "ldr r1, [%1, #8] \n\t"  /* arg2 */                      \
+         "ldr r2, [%1, #12] \n\t"  /* arg3 */                     \
+         "ldr r3, [%1, #16] \n\t"  /* arg4 */                     \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #20 \n\t"  /* drop stack args */            \
+         "mov %0, r0"  /* r0->_res */                             \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6,  \
+                                  arg7,arg8,arg9,arg10)           \
+   do {  /* 10 args: r0-r3 + 6 stack words */                     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[11];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #40] \n\t"  /* arg10 */                    \
+         "push {r0} \n\t"  /* arg10, pushed first */              \
+         "ldr r0, [%1, #20] \n\t"  /* arg5 */                     \
+         "ldr r1, [%1, #24] \n\t"  /* arg6 */                     \
+         "ldr r2, [%1, #28] \n\t"  /* arg7 */                     \
+         "ldr r3, [%1, #32] \n\t"  /* arg8 */                     \
+         "ldr r4, [%1, #36] \n\t"  /* arg9 */                     \
+         "push {r0, r1, r2, r3, r4} \n\t"  /* args 5-9 */         \
+         "ldr r0, [%1, #4] \n\t"  /* arg1 */                      \
+         "ldr r1, [%1, #8] \n\t"  /* arg2 */                      \
+         "ldr r2, [%1, #12] \n\t"  /* arg3 */                     \
+         "ldr r3, [%1, #16] \n\t"  /* arg4 */                     \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #24 \n\t"  /* drop stack args */            \
+         "mov %0, r0"  /* r0->_res */                             \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,       \
+                                  arg6,arg7,arg8,arg9,arg10,      \
+                                  arg11)                          \
+   do {  /* 11 args: r0-r3 + 7 stack words */                     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[12];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #40] \n\t"  /* arg10 */                    \
+         "ldr r1, [%1, #44] \n\t"  /* arg11 */                    \
+         "push {r0, r1} \n\t"  /* args 10-11, pushed first */     \
+         "ldr r0, [%1, #20] \n\t"  /* arg5 */                     \
+         "ldr r1, [%1, #24] \n\t"  /* arg6 */                     \
+         "ldr r2, [%1, #28] \n\t"  /* arg7 */                     \
+         "ldr r3, [%1, #32] \n\t"  /* arg8 */                     \
+         "ldr r4, [%1, #36] \n\t"  /* arg9 */                     \
+         "push {r0, r1, r2, r3, r4} \n\t"  /* args 5-9 */         \
+         "ldr r0, [%1, #4] \n\t"  /* arg1 */                      \
+         "ldr r1, [%1, #8] \n\t"  /* arg2 */                      \
+         "ldr r2, [%1, #12] \n\t"  /* arg3 */                     \
+         "ldr r3, [%1, #16] \n\t"  /* arg4 */                     \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #28 \n\t"  /* drop stack args */            \
+         "mov %0, r0"  /* r0->_res */                             \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory",__CALLER_SAVED_REGS           \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,       \
+                                  arg6,arg7,arg8,arg9,arg10,      \
+                                  arg11,arg12)                    \
+   do {  /* 12 args: r0-r3 + 8 stack words */                     \
+      volatile OrigFn        _orig = (orig);                      \
+      volatile unsigned long _argvec[13];                         \
+      volatile unsigned long _res;                                \
+      _argvec[0] = (unsigned long)_orig.nraddr;                   \
+      _argvec[1] = (unsigned long)(arg1);                         \
+      _argvec[2] = (unsigned long)(arg2);                         \
+      _argvec[3] = (unsigned long)(arg3);                         \
+      _argvec[4] = (unsigned long)(arg4);                         \
+      _argvec[5] = (unsigned long)(arg5);                         \
+      _argvec[6] = (unsigned long)(arg6);                         \
+      _argvec[7] = (unsigned long)(arg7);                         \
+      _argvec[8] = (unsigned long)(arg8);                         \
+      _argvec[9] = (unsigned long)(arg9);                         \
+      _argvec[10] = (unsigned long)(arg10);                       \
+      _argvec[11] = (unsigned long)(arg11);                       \
+      _argvec[12] = (unsigned long)(arg12);                       \
+      __asm__ volatile(                                           \
+         "ldr r0, [%1, #40] \n\t"  /* arg10 */                    \
+         "ldr r1, [%1, #44] \n\t"  /* arg11 */                    \
+         "ldr r2, [%1, #48] \n\t"  /* arg12 */                    \
+         "push {r0, r1, r2} \n\t"  /* args 10-12, pushed first */ \
+         "ldr r0, [%1, #20] \n\t"  /* arg5 */                     \
+         "ldr r1, [%1, #24] \n\t"  /* arg6 */                     \
+         "ldr r2, [%1, #28] \n\t"  /* arg7 */                     \
+         "ldr r3, [%1, #32] \n\t"  /* arg8 */                     \
+         "ldr r4, [%1, #36] \n\t"  /* arg9 */                     \
+         "push {r0, r1, r2, r3, r4} \n\t"  /* args 5-9 */         \
+         "ldr r0, [%1, #4] \n\t"  /* arg1 */                      \
+         "ldr r1, [%1, #8] \n\t"  /* arg2 */                      \
+         "ldr r2, [%1, #12] \n\t"  /* arg3 */                     \
+         "ldr r3, [%1, #16] \n\t"  /* arg4 */                     \
+         "ldr r4, [%1] \n\t"  /* target->r4 */                    \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4                   \
+         "add sp, sp, #32 \n\t"  /* drop stack args */            \
+         "mov %0, r0"  /* r0->_res */                             \
+         : /*out*/   "=r" (_res)                                  \
+         : /*in*/    "0" (&_argvec[0])                            \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS          \
+      );                                                          \
+      lval = (__typeof__(lval)) _res;                             \
+   } while (0)
+
+#endif /* PLAT_arm_linux */
+
+/* ------------------------- s390x-linux ------------------------- */
+
+#if defined(PLAT_s390x_linux)
+
+/* Similar workaround to the amd64 one (see above), but here r11 is
+   used as the frame pointer and the old r11 is saved in r7. r11 might
+   hold argvec, so argvec is copied into r1 first; r1 is clobbered
+   after the call anyway.  */
+#if defined(__GNUC__) && defined(__GCC_HAVE_DWARF2_CFI_ASM)
+#  define __FRAME_POINTER                                         \
+      ,"d"(__builtin_dwarf_cfa())
+#  define VALGRIND_CFI_PROLOGUE                                   \
+      ".cfi_remember_state\n\t"                                   \
+      "lgr 1,%1\n\t" /* copy the argvec pointer in r1 */          \
+      "lgr 7,11\n\t" /* save old r11 in r7 */                     \
+      "lgr 11,%2\n\t" /* r11 = CFA (%2) */                        \
+      ".cfi_def_cfa r11, 0\n\t"
+#  define VALGRIND_CFI_EPILOGUE                                   \
+      "lgr 11, 7\n\t" /* restore old r11 */                       \
+      ".cfi_restore_state\n\t"
+#else
+#  define __FRAME_POINTER
+#  define VALGRIND_CFI_PROLOGUE                                   \
+      "lgr 1,%1\n\t" /* copy the argvec pointer in r1 */
+#  define VALGRIND_CFI_EPILOGUE
+#endif
+
+
+
+
+/* Registers trashed by the hidden call. r14 is listed because it is
+   overwritten in s390_irgen_noredir (VEX/priv/guest_s390_irgen.c) to
+   give the function a proper return address. All the others are
+   ABI-defined call clobbers. */
+#define __CALLER_SAVED_REGS "0","1","2","3","4","5","14", \
+                           "f0","f1","f2","f3","f4","f5","f6","f7"
+
+
+#define CALL_FN_W_v(lval, orig)                                  \
+   do {                                                          \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long  _argvec[1];                        \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE                                   \
+         "aghi 15,-160\n\t"  /* reserve frame */                 \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "lgr %0, 2\n\t"  /* r2->_res */                         \
+         "aghi 15,160\n\t"  /* release frame */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "d" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7"     \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+/* The call ABI passes arguments in r2-r6 and then on the stack. */
+#define CALL_FN_W_W(lval, orig, arg1)                            \
+   do {                                                          \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[2];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)arg1;                          \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE                                   \
+         "aghi 15,-160\n\t"  /* reserve frame */                 \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "lgr %0, 2\n\t"  /* r2->_res */                         \
+         "aghi 15,160\n\t"  /* release frame */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7"     \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_WW(lval, orig, arg1, arg2)                     \
+   do {                                                          \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[3];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)arg1;                          \
+      _argvec[2] = (unsigned long)arg2;                          \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE                                   \
+         "aghi 15,-160\n\t"  /* reserve frame */                 \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "lgr %0, 2\n\t"  /* r2->_res */                         \
+         "aghi 15,160\n\t"  /* release frame */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7"     \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_WWW(lval, orig, arg1, arg2, arg3)              \
+   do {                                                          \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[4];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)arg1;                          \
+      _argvec[2] = (unsigned long)arg2;                          \
+      _argvec[3] = (unsigned long)arg3;                          \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE                                   \
+         "aghi 15,-160\n\t"  /* reserve frame */                 \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "lgr %0, 2\n\t"  /* r2->_res */                         \
+         "aghi 15,160\n\t"  /* release frame */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7"     \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_WWWW(lval, orig, arg1, arg2, arg3, arg4)       \
+   do {                                                          \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[5];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)arg1;                          \
+      _argvec[2] = (unsigned long)arg2;                          \
+      _argvec[3] = (unsigned long)arg3;                          \
+      _argvec[4] = (unsigned long)arg4;                          \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE                                   \
+         "aghi 15,-160\n\t"  /* reserve frame */                 \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 5,32(1)\n\t"  /* arg4->r5 */                        \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "lgr %0, 2\n\t"  /* r2->_res */                         \
+         "aghi 15,160\n\t"  /* release frame */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"7"     \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1, arg2, arg3, arg4, arg5)   \
+   do {                                                          \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[6];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)arg1;                          \
+      _argvec[2] = (unsigned long)arg2;                          \
+      _argvec[3] = (unsigned long)arg3;                          \
+      _argvec[4] = (unsigned long)arg4;                          \
+      _argvec[5] = (unsigned long)arg5;                          \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE                                   \
+         "aghi 15,-160\n\t"  /* reserve frame */                 \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 5,32(1)\n\t"  /* arg4->r5 */                        \
+         "lg 6,40(1)\n\t"  /* arg5->r6, hence "6" in trash */    \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "lgr %0, 2\n\t"  /* r2->_res */                         \
+         "aghi 15,160\n\t"  /* release frame */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1, arg2, arg3, arg4, arg5,   \
+                     arg6)                                       \
+   do {                                                          \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[7];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)arg1;                          \
+      _argvec[2] = (unsigned long)arg2;                          \
+      _argvec[3] = (unsigned long)arg3;                          \
+      _argvec[4] = (unsigned long)arg4;                          \
+      _argvec[5] = (unsigned long)arg5;                          \
+      _argvec[6] = (unsigned long)arg6;                          \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE                                   \
+         "aghi 15,-168\n\t"  /* frame + 1 stack arg */           \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 5,32(1)\n\t"  /* arg4->r5 */                        \
+         "lg 6,40(1)\n\t"  /* arg5->r6 */                        \
+         "mvc 160(8,15), 48(1)\n\t"  /* arg6->stack */           \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "lgr %0, 2\n\t"  /* r2->_res */                         \
+         "aghi 15,168\n\t"  /* release frame */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1, arg2, arg3, arg4, arg5,   \
+                     arg6, arg7)                                 \
+   do {                                                          \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[8];                         \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr;                  \
+      _argvec[1] = (unsigned long)arg1;                          \
+      _argvec[2] = (unsigned long)arg2;                          \
+      _argvec[3] = (unsigned long)arg3;                          \
+      _argvec[4] = (unsigned long)arg4;                          \
+      _argvec[5] = (unsigned long)arg5;                          \
+      _argvec[6] = (unsigned long)arg6;                          \
+      _argvec[7] = (unsigned long)arg7;                          \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE                                   \
+         "aghi 15,-176\n\t"  /* frame + 2 stack args */          \
+         "lg 2, 8(1)\n\t"  /* arg1->r2 */                        \
+         "lg 3,16(1)\n\t"  /* arg2->r3 */                        \
+         "lg 4,24(1)\n\t"  /* arg3->r4 */                        \
+         "lg 5,32(1)\n\t"  /* arg4->r5 */                        \
+         "lg 6,40(1)\n\t"  /* arg5->r6 */                        \
+         "mvc 160(8,15), 48(1)\n\t"  /* arg6->stack */           \
+         "mvc 168(8,15), 56(1)\n\t"  /* arg7->stack */           \
+         "lg 1, 0(1)\n\t"  /* target->r1 */                      \
+         VALGRIND_CALL_NOREDIR_R1                                \
+         "lgr %0, 2\n\t"  /* r2->_res */                         \
+         "aghi 15,176\n\t"  /* release frame */                  \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1, arg2, arg3, arg4, arg5,   \
+                     arg6, arg7 ,arg8)                           \
+   do { /* s390x: call _orig.nraddr with 8 word-sized args */    \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[9]; /* fn addr + args */    \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */\
+      _argvec[1] = (unsigned long)arg1;                          \
+      _argvec[2] = (unsigned long)arg2;                          \
+      _argvec[3] = (unsigned long)arg3;                          \
+      _argvec[4] = (unsigned long)arg4;                          \
+      _argvec[5] = (unsigned long)arg5;                          \
+      _argvec[6] = (unsigned long)arg6;                          \
+      _argvec[7] = (unsigned long)arg7;                          \
+      _argvec[8] = (unsigned long)arg8;                          \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE /* presumably r1 = %1; confirm */ \
+         "aghi 15,-184\n\t" /* r15 -= 184 (new stack frame) */   \
+         "lg 2, 8(1)\n\t" /* r2 = _argvec[1] (arg1) */           \
+         "lg 3,16(1)\n\t" /* r3 = _argvec[2] (arg2) */           \
+         "lg 4,24(1)\n\t" /* r4 = _argvec[3] (arg3) */           \
+         "lg 5,32(1)\n\t" /* r5 = _argvec[4] (arg4) */           \
+         "lg 6,40(1)\n\t" /* r6 = _argvec[5] (arg5) */           \
+         "mvc 160(8,15), 48(1)\n\t" /* 160(r15) = arg6 */        \
+         "mvc 168(8,15), 56(1)\n\t" /* 168(r15) = arg7 */        \
+         "mvc 176(8,15), 64(1)\n\t" /* 176(r15) = arg8 */        \
+         "lg 1, 0(1)\n\t" /* r1 = _argvec[0] (target) */         \
+         VALGRIND_CALL_NOREDIR_R1 /* presumably calls via r1 */  \
+         "lgr %0, 2\n\t" /* _res = r2 (return value) */          \
+         "aghi 15,184\n\t" /* r15 += 184 (pop frame) */          \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_9W(lval, orig, arg1, arg2, arg3, arg4, arg5,   \
+                     arg6, arg7 ,arg8, arg9)                     \
+   do { /* s390x: call _orig.nraddr with 9 word-sized args */    \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[10]; /* fn addr + args */   \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */\
+      _argvec[1] = (unsigned long)arg1;                          \
+      _argvec[2] = (unsigned long)arg2;                          \
+      _argvec[3] = (unsigned long)arg3;                          \
+      _argvec[4] = (unsigned long)arg4;                          \
+      _argvec[5] = (unsigned long)arg5;                          \
+      _argvec[6] = (unsigned long)arg6;                          \
+      _argvec[7] = (unsigned long)arg7;                          \
+      _argvec[8] = (unsigned long)arg8;                          \
+      _argvec[9] = (unsigned long)arg9;                          \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE /* presumably r1 = %1; confirm */ \
+         "aghi 15,-192\n\t" /* r15 -= 192 (new stack frame) */   \
+         "lg 2, 8(1)\n\t" /* r2 = _argvec[1] (arg1) */           \
+         "lg 3,16(1)\n\t" /* r3 = _argvec[2] (arg2) */           \
+         "lg 4,24(1)\n\t" /* r4 = _argvec[3] (arg3) */           \
+         "lg 5,32(1)\n\t" /* r5 = _argvec[4] (arg4) */           \
+         "lg 6,40(1)\n\t" /* r6 = _argvec[5] (arg5) */           \
+         "mvc 160(8,15), 48(1)\n\t" /* 160(r15) = arg6 */        \
+         "mvc 168(8,15), 56(1)\n\t" /* 168(r15) = arg7 */        \
+         "mvc 176(8,15), 64(1)\n\t" /* 176(r15) = arg8 */        \
+         "mvc 184(8,15), 72(1)\n\t" /* 184(r15) = arg9 */        \
+         "lg 1, 0(1)\n\t" /* r1 = _argvec[0] (target) */         \
+         VALGRIND_CALL_NOREDIR_R1 /* presumably calls via r1 */  \
+         "lgr %0, 2\n\t" /* _res = r2 (return value) */          \
+         "aghi 15,192\n\t" /* r15 += 192 (pop frame) */          \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_10W(lval, orig, arg1, arg2, arg3, arg4, arg5,  \
+                     arg6, arg7 ,arg8, arg9, arg10)              \
+   do { /* s390x: call _orig.nraddr with 10 word-sized args */   \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[11]; /* fn addr + args */   \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */\
+      _argvec[1] = (unsigned long)arg1;                          \
+      _argvec[2] = (unsigned long)arg2;                          \
+      _argvec[3] = (unsigned long)arg3;                          \
+      _argvec[4] = (unsigned long)arg4;                          \
+      _argvec[5] = (unsigned long)arg5;                          \
+      _argvec[6] = (unsigned long)arg6;                          \
+      _argvec[7] = (unsigned long)arg7;                          \
+      _argvec[8] = (unsigned long)arg8;                          \
+      _argvec[9] = (unsigned long)arg9;                          \
+      _argvec[10] = (unsigned long)arg10;                        \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE /* presumably r1 = %1; confirm */ \
+         "aghi 15,-200\n\t" /* r15 -= 200 (new stack frame) */   \
+         "lg 2, 8(1)\n\t" /* r2 = _argvec[1] (arg1) */           \
+         "lg 3,16(1)\n\t" /* r3 = _argvec[2] (arg2) */           \
+         "lg 4,24(1)\n\t" /* r4 = _argvec[3] (arg3) */           \
+         "lg 5,32(1)\n\t" /* r5 = _argvec[4] (arg4) */           \
+         "lg 6,40(1)\n\t" /* r6 = _argvec[5] (arg5) */           \
+         "mvc 160(8,15), 48(1)\n\t" /* 160(r15) = arg6 */        \
+         "mvc 168(8,15), 56(1)\n\t" /* 168(r15) = arg7 */        \
+         "mvc 176(8,15), 64(1)\n\t" /* 176(r15) = arg8 */        \
+         "mvc 184(8,15), 72(1)\n\t" /* 184(r15) = arg9 */        \
+         "mvc 192(8,15), 80(1)\n\t" /* 192(r15) = arg10 */       \
+         "lg 1, 0(1)\n\t" /* r1 = _argvec[0] (target) */         \
+         VALGRIND_CALL_NOREDIR_R1 /* presumably calls via r1 */  \
+         "lgr %0, 2\n\t" /* _res = r2 (return value) */          \
+         "aghi 15,200\n\t" /* r15 += 200 (pop frame) */          \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_11W(lval, orig, arg1, arg2, arg3, arg4, arg5,  \
+                     arg6, arg7 ,arg8, arg9, arg10, arg11)       \
+   do { /* s390x: call _orig.nraddr with 11 word-sized args */   \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[12]; /* fn addr + args */   \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */\
+      _argvec[1] = (unsigned long)arg1;                          \
+      _argvec[2] = (unsigned long)arg2;                          \
+      _argvec[3] = (unsigned long)arg3;                          \
+      _argvec[4] = (unsigned long)arg4;                          \
+      _argvec[5] = (unsigned long)arg5;                          \
+      _argvec[6] = (unsigned long)arg6;                          \
+      _argvec[7] = (unsigned long)arg7;                          \
+      _argvec[8] = (unsigned long)arg8;                          \
+      _argvec[9] = (unsigned long)arg9;                          \
+      _argvec[10] = (unsigned long)arg10;                        \
+      _argvec[11] = (unsigned long)arg11;                        \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE /* presumably r1 = %1; confirm */ \
+         "aghi 15,-208\n\t" /* r15 -= 208 (new stack frame) */   \
+         "lg 2, 8(1)\n\t" /* r2 = _argvec[1] (arg1) */           \
+         "lg 3,16(1)\n\t" /* r3 = _argvec[2] (arg2) */           \
+         "lg 4,24(1)\n\t" /* r4 = _argvec[3] (arg3) */           \
+         "lg 5,32(1)\n\t" /* r5 = _argvec[4] (arg4) */           \
+         "lg 6,40(1)\n\t" /* r6 = _argvec[5] (arg5) */           \
+         "mvc 160(8,15), 48(1)\n\t" /* 160(r15) = arg6 */        \
+         "mvc 168(8,15), 56(1)\n\t" /* 168(r15) = arg7 */        \
+         "mvc 176(8,15), 64(1)\n\t" /* 176(r15) = arg8 */        \
+         "mvc 184(8,15), 72(1)\n\t" /* 184(r15) = arg9 */        \
+         "mvc 192(8,15), 80(1)\n\t" /* 192(r15) = arg10 */       \
+         "mvc 200(8,15), 88(1)\n\t" /* 200(r15) = arg11 */       \
+         "lg 1, 0(1)\n\t" /* r1 = _argvec[0] (target) */         \
+         VALGRIND_CALL_NOREDIR_R1 /* presumably calls via r1 */  \
+         "lgr %0, 2\n\t" /* _res = r2 (return value) */          \
+         "aghi 15,208\n\t" /* r15 += 208 (pop frame) */          \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1, arg2, arg3, arg4, arg5,  \
+                     arg6, arg7 ,arg8, arg9, arg10, arg11, arg12)\
+   do { /* s390x: call _orig.nraddr with 12 word-sized args */   \
+      volatile OrigFn        _orig = (orig);                     \
+      volatile unsigned long _argvec[13]; /* fn addr + args */   \
+      volatile unsigned long _res;                               \
+      _argvec[0] = (unsigned long)_orig.nraddr; /* call target */\
+      _argvec[1] = (unsigned long)arg1;                          \
+      _argvec[2] = (unsigned long)arg2;                          \
+      _argvec[3] = (unsigned long)arg3;                          \
+      _argvec[4] = (unsigned long)arg4;                          \
+      _argvec[5] = (unsigned long)arg5;                          \
+      _argvec[6] = (unsigned long)arg6;                          \
+      _argvec[7] = (unsigned long)arg7;                          \
+      _argvec[8] = (unsigned long)arg8;                          \
+      _argvec[9] = (unsigned long)arg9;                          \
+      _argvec[10] = (unsigned long)arg10;                        \
+      _argvec[11] = (unsigned long)arg11;                        \
+      _argvec[12] = (unsigned long)arg12;                        \
+      __asm__ volatile(                                          \
+         VALGRIND_CFI_PROLOGUE /* presumably r1 = %1; confirm */ \
+         "aghi 15,-216\n\t" /* r15 -= 216 (new stack frame) */   \
+         "lg 2, 8(1)\n\t" /* r2 = _argvec[1] (arg1) */           \
+         "lg 3,16(1)\n\t" /* r3 = _argvec[2] (arg2) */           \
+         "lg 4,24(1)\n\t" /* r4 = _argvec[3] (arg3) */           \
+         "lg 5,32(1)\n\t" /* r5 = _argvec[4] (arg4) */           \
+         "lg 6,40(1)\n\t" /* r6 = _argvec[5] (arg5) */           \
+         "mvc 160(8,15), 48(1)\n\t" /* 160(r15) = arg6 */        \
+         "mvc 168(8,15), 56(1)\n\t" /* 168(r15) = arg7 */        \
+         "mvc 176(8,15), 64(1)\n\t" /* 176(r15) = arg8 */        \
+         "mvc 184(8,15), 72(1)\n\t" /* 184(r15) = arg9 */        \
+         "mvc 192(8,15), 80(1)\n\t" /* 192(r15) = arg10 */       \
+         "mvc 200(8,15), 88(1)\n\t" /* 200(r15) = arg11 */       \
+         "mvc 208(8,15), 96(1)\n\t" /* 208(r15) = arg12 */       \
+         "lg 1, 0(1)\n\t" /* r1 = _argvec[0] (target) */         \
+         VALGRIND_CALL_NOREDIR_R1 /* presumably calls via r1 */  \
+         "lgr %0, 2\n\t" /* _res = r2 (return value) */          \
+         "aghi 15,216\n\t" /* r15 += 216 (pop frame) */          \
+         VALGRIND_CFI_EPILOGUE                                   \
+         : /*out*/   "=d" (_res)                                 \
+         : /*in*/    "a" (&_argvec[0]) __FRAME_POINTER           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS,"6","7" \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+
+#endif /* PLAT_s390x_linux */
+
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS.               */
+/*                                                                    */
+/* ------------------------------------------------------------------ */
+
+/* Some request codes.  There are many more of these, but most are not
+   exposed to end-user view.  These are the public ones, all of the
+   form 0x1000 + small_number.
+
+   Core ones are in the range 0x00000000--0x0000ffff.  The non-public
+   ones start at 0x2000.
+*/
+
+/* Public because tools need them, but don't embed them into other programs.
+   The shifts are done in unsigned int: (a)&0xff >= 0x80 shifted by 24 would overflow a signed int. */
+#define VG_USERREQ_TOOL_BASE(a,b) /* a -> bits 31..24, b -> bits 23..16 */ \
+   ((unsigned int)((a)&0xff) << 24 | (unsigned int)((b)&0xff) << 16)
+#define VG_IS_TOOL_USERREQ(a, b, v) /* nonzero iff v's top 16 bits equal the base for (a,b) */ \
+   (VG_USERREQ_TOOL_BASE(a,b) == ((v) & 0xffff0000))
+
+/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !! 
+   This enum comprises an ABI exported by Valgrind to programs
+   which use client requests.  DO NOT CHANGE THE ORDER OF THESE
+   ENTRIES, NOR DELETE ANY -- add new ones at the end. */
+typedef  /* request codes handed to VALGRIND_DO_CLIENT_REQUEST_EXPR/_STMT */
+   enum { VG_USERREQ__RUNNING_ON_VALGRIND  = 0x1001,
+          VG_USERREQ__DISCARD_TRANSLATIONS = 0x1002,
+
+          /* These allow any function to be called from the simulated
+             CPU but run on the real CPU.  Nb: the first arg passed to
+             the function is always the ThreadId of the running
+             thread!  So CLIENT_CALL0 actually requires a 1 arg
+             function, etc. */
+          VG_USERREQ__CLIENT_CALL0 = 0x1101,
+          VG_USERREQ__CLIENT_CALL1 = 0x1102,
+          VG_USERREQ__CLIENT_CALL2 = 0x1103,
+          VG_USERREQ__CLIENT_CALL3 = 0x1104,
+
+          /* Can be useful in regression testing suites -- eg. can
+             send Valgrind's output to /dev/null and still count
+             errors. */
+          VG_USERREQ__COUNT_ERRORS = 0x1201,
+
+          /* Allows a string (gdb monitor command) to be passed to the tool.
+             Used for interaction with vgdb/gdb. */
+          VG_USERREQ__GDB_MONITOR_COMMAND = 0x1202,
+
+          /* These are useful and can be interpreted by any tool that
+             tracks malloc() et al, by using vg_replace_malloc.c. */
+          VG_USERREQ__MALLOCLIKE_BLOCK = 0x1301,
+          VG_USERREQ__RESIZEINPLACE_BLOCK = 0x130b, /* out of numeric order; ABI, keep as is */
+          VG_USERREQ__FREELIKE_BLOCK   = 0x1302,
+          /* Memory pool support. */
+          VG_USERREQ__CREATE_MEMPOOL   = 0x1303,
+          VG_USERREQ__DESTROY_MEMPOOL  = 0x1304,
+          VG_USERREQ__MEMPOOL_ALLOC    = 0x1305,
+          VG_USERREQ__MEMPOOL_FREE     = 0x1306,
+          VG_USERREQ__MEMPOOL_TRIM     = 0x1307,
+          VG_USERREQ__MOVE_MEMPOOL     = 0x1308,
+          VG_USERREQ__MEMPOOL_CHANGE   = 0x1309,
+          VG_USERREQ__MEMPOOL_EXISTS   = 0x130a,
+
+          /* Allow printfs to valgrind log. */
+          /* The first two pass the va_list argument by value, which
+             assumes it is the same size as or smaller than a UWord,
+             which generally isn't the case.  Hence they are deprecated.
+             The second two pass the vargs by reference and so are
+             immune to this problem. */
+          /* both :: char* fmt, va_list vargs (DEPRECATED) */
+          VG_USERREQ__PRINTF           = 0x1401,
+          VG_USERREQ__PRINTF_BACKTRACE = 0x1402,
+          /* both :: char* fmt, va_list* vargs */
+          VG_USERREQ__PRINTF_VALIST_BY_REF = 0x1403,
+          VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF = 0x1404,
+
+          /* Stack support. */
+          VG_USERREQ__STACK_REGISTER   = 0x1501,
+          VG_USERREQ__STACK_DEREGISTER = 0x1502,
+          VG_USERREQ__STACK_CHANGE     = 0x1503,
+
+          /* Wine support */
+          VG_USERREQ__LOAD_PDB_DEBUGINFO = 0x1601,
+
+          /* Querying of debug info. */
+          VG_USERREQ__MAP_IP_TO_SRCLOC = 0x1701,
+
+          /* Disable/enable error reporting level.  Takes a single
+             Word arg which is the delta to this thread's error
+             disablement indicator.  Hence 1 disables or further
+             disables errors, and -1 moves back towards enablement.
+             Other values are not allowed. */
+          VG_USERREQ__CHANGE_ERR_DISABLEMENT = 0x1801
+   } Vg_ClientRequest;
+
+#ifndef __GNUC__  /* __extension__ is a GNU keyword; make it vanish for other compilers */
+#  define __extension__  /* expands to nothing */
+#endif
+
+
+/* Evaluates (as unsigned) to the number of Valgrinds this code is running
+   under: 0 if running natively, 1 if running under Valgrind, 2 if running
+   under a Valgrind which is itself running under another Valgrind, etc.
+   The leading 0 is the default the request evaluates to when not on Valgrind. */
+#define RUNNING_ON_VALGRIND                                           \
+    (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* if not */,         \
+                                    VG_USERREQ__RUNNING_ON_VALGRIND,  \
+                                    0, 0, 0, 0, 0)
+
+
+/* Ask Valgrind to throw away its translations of the code in the range
+   [_qzz_addr .. _qzz_addr + _qzz_len - 1], so that the invalidated area
+   is sure to be retranslated.  Handy when debugging a JITter or similar
+   code-rewriting program.  This is a statement; it yields no value. */
+#define VALGRIND_DISCARD_TRANSLATIONS(_qzz_addr,_qzz_len) \
+    VALGRIND_DO_CLIENT_REQUEST_STMT( \
+        VG_USERREQ__DISCARD_TRANSLATIONS, _qzz_addr, _qzz_len, 0, 0, 0)
+
+
+/* These requests are for getting Valgrind itself to print something,
+   possibly with a backtrace.  This is a really ugly hack.  The return value
+   is the number of characters printed, excluding the "**<pid>** " part at the
+   start and the backtrace (if present). */
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+/* Forward declaration carrying the attributes: printf-style checking of
+   the arguments, and "unused" so that translation units which never call
+   this static routine do not get a warning about it. */
+static int VALGRIND_PRINTF(const char *format, ...)
+   __attribute__((format(__printf__, 1, 2), __unused__));
+#endif
+/* Print a printf-style message via Valgrind.  The format string and the
+   address of the va_list are handed over in a
+   VG_USERREQ__PRINTF_VALIST_BY_REF client request, and the value that
+   request evaluates to (0 by default) is returned as an int.
+   When NVALGRIND is defined the function does nothing and returns 0. */
+static int
+#if defined(_MSC_VER)
+__inline
+#endif
+VALGRIND_PRINTF(const char *format, ...)
+{
+#if defined(NVALGRIND)
+   return 0;
+#else /* NVALGRIND */
+   /* Integer type used to carry the two pointers into the request. */
+#if defined(_MSC_VER)
+   typedef uintptr_t _qzz_word;
+#else
+   typedef unsigned long _qzz_word;
+#endif
+   _qzz_word _qzz_res;
+   va_list vargs;
+
+   va_start(vargs, format);
+   _qzz_res = VALGRIND_DO_CLIENT_REQUEST_EXPR(
+                 0 /* default result */,
+                 VG_USERREQ__PRINTF_VALIST_BY_REF,
+                 (_qzz_word)format, (_qzz_word)&vargs, 0, 0, 0);
+   va_end(vargs);
+   return (int)_qzz_res;
+#endif /* NVALGRIND */
+}
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+/* Declared first so the printf-format and unused attributes can be attached. */
+static int VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+   __attribute__((format(__printf__, 1, 2), __unused__));
+#endif
+/* Print a printf-style message via Valgrind using the backtrace variant
+   of the request.  The format string and the address of the va_list are
+   handed over in a VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF client
+   request, and the value that request evaluates to (0 by default) is
+   returned as an int.  With NVALGRIND defined it just returns 0. */
+static int
+#if defined(_MSC_VER)
+__inline
+#endif
+VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+{
+#if defined(NVALGRIND)
+   return 0;
+#else /* NVALGRIND */
+   /* Integer type used to carry the two pointers into the request. */
+#if defined(_MSC_VER)
+   typedef uintptr_t _qzz_word;
+#else
+   typedef unsigned long _qzz_word;
+#endif
+   _qzz_word _qzz_res;
+   va_list vargs;
+
+   va_start(vargs, format);
+   _qzz_res = VALGRIND_DO_CLIENT_REQUEST_EXPR(
+                 0 /* default result */,
+                 VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF,
+                 (_qzz_word)format, (_qzz_word)&vargs, 0, 0, 0);
+   va_end(vargs);
+   return (int)_qzz_res;
+#endif /* NVALGRIND */
+}
+
+
+/* These requests allow control to move from the simulated CPU to the
+   real CPU, calling an arbitrary function.
+   
+   Note that the current ThreadId is inserted as the first argument.
+   So this call:
+
+     VALGRIND_NON_SIMD_CALL2(f, arg1, arg2)
+
+   requires f to have this signature:
+
+     Word f(Word tid, Word arg1, Word arg2)
+
+   where "Word" is a word-sized type.
+
+   Note that these client requests are not entirely reliable.  For example,
+   if you call a function with them that subsequently calls printf(),
+   there's a high chance Valgrind will crash.  Generally, your prospects of
+   these working are made higher if the called function does not refer to
+   any global variables, and does not refer to any libc or other functions
+   (printf et al).  Any kind of entanglement with libc or dynamic linking is
+   likely to have a bad outcome, for tricky reasons which we've grappled
+   with a lot in the past.
+*/
+#define VALGRIND_NON_SIMD_CALL0(_qyy_fn) /* _qyy_fn(tid) on the real CPU */ \
+    VALGRIND_DO_CLIENT_REQUEST_EXPR( \
+        0 /* default return */, \
+        VG_USERREQ__CLIENT_CALL0, \
+        _qyy_fn, 0, 0, 0, 0)
+
+#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1) /* _qyy_fn(tid, _qyy_arg1) */ \
+    VALGRIND_DO_CLIENT_REQUEST_EXPR( \
+        0 /* default return */, \
+        VG_USERREQ__CLIENT_CALL1, \
+        _qyy_fn, _qyy_arg1, 0, 0, 0)
+
+#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2) /* _qyy_fn(tid, arg1, arg2) */ \
+    VALGRIND_DO_CLIENT_REQUEST_EXPR( \
+        0 /* default return */, \
+        VG_USERREQ__CLIENT_CALL2, \
+        _qyy_fn, _qyy_arg1, _qyy_arg2, 0, 0)
+
+#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \
+    /* _qyy_fn(tid, _qyy_arg1, _qyy_arg2, _qyy_arg3) on the real CPU */ \
+    VALGRIND_DO_CLIENT_REQUEST_EXPR( \
+        0 /* default return */, \
+        VG_USERREQ__CLIENT_CALL3, \
+        _qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3, 0)
+
+
+/* Evaluates (as unsigned) to the number of errors recorded by a tool, or
+   to the default of 0.  Nb: only errors the tool records with
+   VG_(maybe_record_error)() or VG_(unique_error)() are counted. */
+#define VALGRIND_COUNT_ERRORS \
+    (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR( \
+        0 /* default return */, \
+        VG_USERREQ__COUNT_ERRORS, \
+        0, 0, 0, 0, 0)
+
+/* Several Valgrind tools (Memcheck, Massif, Helgrind, DRD) rely on knowing
+   when heap blocks are allocated in order to give accurate results.  This
+   happens automatically for the standard allocator functions such as
+   malloc(), calloc(), realloc(), memalign(), new, new[], free(), delete,
+   delete[], etc.
+
+   But if your program uses a custom allocator, this doesn't automatically
+   happen, and Valgrind will not do as well.  For example, if you allocate
+   superblocks with mmap() and then allocate chunks of the superblocks, all
+   Valgrind's observations will be at the mmap() level and it won't know that
+   the chunks should be considered separate entities.  In Memcheck's case,
+   that means you probably won't get heap block overrun detection (because
+   there won't be redzones marked as unaddressable) and you definitely won't
+   get any leak detection.
+
+   The following client requests allow a custom allocator to be annotated so
+   that it can be handled accurately by Valgrind.
+
+   VALGRIND_MALLOCLIKE_BLOCK marks a region of memory as having been allocated
+   by a malloc()-like function.  For Memcheck (an illustrative case), this
+   does two things:
+
+   - It records that the block has been allocated.  This means any addresses
+     within the block mentioned in error messages will be
+     identified as belonging to the block.  It also means that if the block
+     isn't freed it will be detected by the leak checker.
+
+   - It marks the block as being addressable and undefined (if 'is_zeroed' is
+     not set), or addressable and defined (if 'is_zeroed' is set).  This
+     controls how accesses to the block by the program are handled.
+   
+   'addr' is the start of the usable block (ie. after any
+   redzone), 'sizeB' is its size.  'rzB' is the redzone size if the allocator
+   can apply redzones -- these are blocks of padding at the start and end of
+   each block.  Adding redzones is recommended as it makes it much more likely
+   Valgrind will spot block overruns.  'is_zeroed' indicates if the memory is
+   zeroed (or filled with another predictable value), as is the case for
+   calloc().
+   
+   VALGRIND_MALLOCLIKE_BLOCK should be put immediately after the point where a
+   heap block -- that will be used by the client program -- is allocated.
+   It's best to put it at the outermost level of the allocator if possible;
+   for example, if you have a function my_alloc() which calls
+   internal_alloc(), and the client request is put inside internal_alloc(),
+   stack traces relating to the heap block will contain entries for both
+   my_alloc() and internal_alloc(), which is probably not what you want.
+
+   For Memcheck users: if you use VALGRIND_MALLOCLIKE_BLOCK to carve out
+   custom blocks from within a heap block, B, that has been allocated with
+   malloc/calloc/new/etc, then block B will be *ignored* during leak-checking
+   -- the custom blocks will take precedence.
+
+   VALGRIND_FREELIKE_BLOCK is the partner to VALGRIND_MALLOCLIKE_BLOCK.  For
+   Memcheck, it does two things:
+
+   - It records that the block has been deallocated.  This assumes that the
+     block was annotated as having been allocated via
+     VALGRIND_MALLOCLIKE_BLOCK.  Otherwise, an error will be issued.
+
+   - It marks the block as being unaddressable.
+
+   VALGRIND_FREELIKE_BLOCK should be put immediately after the point where a
+   heap block is deallocated.
+
+   VALGRIND_RESIZEINPLACE_BLOCK informs a tool about reallocation. For
+   Memcheck, it does four things:
+
+   - It records that the size of a block has been changed.  This assumes that
+     the block was annotated as having been allocated via
+     VALGRIND_MALLOCLIKE_BLOCK.  Otherwise, an error will be issued.
+
+   - If the block shrunk, it marks the freed memory as being unaddressable.
+
+   - If the block grew, it marks the new area as undefined and defines a red
+     zone past the end of the new block.
+
+   - The V-bits of the overlap between the old and the new block are preserved.
+
+   VALGRIND_RESIZEINPLACE_BLOCK should be put after allocation of the new block
+   and before deallocation of the old block.
+
+   In many cases, these three client requests will not be enough to get your
+   allocator working well with Memcheck.  More specifically, if your allocator
+   writes to freed blocks in any way then a VALGRIND_MAKE_MEM_UNDEFINED call
+   will be necessary to mark the memory as addressable just before the zeroing
+   occurs, otherwise you'll get a lot of invalid write errors.  For example,
+   you'll need to do this if your allocator recycles freed blocks, but it
+   zeroes them before handing them back out (via VALGRIND_MALLOCLIKE_BLOCK).
+   Alternatively, if your allocator reuses freed blocks for allocator-internal
+   data structures, VALGRIND_MAKE_MEM_UNDEFINED calls will also be necessary.
+
+   Really, what's happening is a blurring of the lines between the client
+   program and the allocator... after VALGRIND_FREELIKE_BLOCK is called, the
+   memory should be considered unaddressable to the client program, but the
+   allocator knows more than the rest of the client program and so may be able
+   to safely access it.  Extra client requests are necessary for Valgrind to
+   understand the distinction between the allocator and the rest of the
+   program.
+
+   Ignored if addr == 0.
+*/
+#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed) \
+    VALGRIND_DO_CLIENT_REQUEST_STMT( /* fifth argument slot is passed as 0 */ \
+        VG_USERREQ__MALLOCLIKE_BLOCK, addr, sizeB, rzB, is_zeroed, 0)
+
+/* Report that the block at addr was resized in place from oldSizeB to
+   newSizeB (rzB is the redzone size); the comment for
+   VALGRIND_MALLOCLIKE_BLOCK has the details.  Ignored if addr == 0. */
+#define VALGRIND_RESIZEINPLACE_BLOCK(addr, oldSizeB, newSizeB, rzB) \
+    VALGRIND_DO_CLIENT_REQUEST_STMT( \
+        VG_USERREQ__RESIZEINPLACE_BLOCK, addr, oldSizeB, newSizeB, rzB, 0)
+
+/* Report that the block at addr (redzone size rzB) has been deallocated;
+   the comment for VALGRIND_MALLOCLIKE_BLOCK has the details.
+   Ignored if addr == 0. */
+#define VALGRIND_FREELIKE_BLOCK(addr, rzB) \
+    VALGRIND_DO_CLIENT_REQUEST_STMT( \
+        VG_USERREQ__FREELIKE_BLOCK, addr, rzB, 0, 0, 0)
+
+/* Create a memory pool identified by pool, passing on rzB and is_zeroed. */
+#define VALGRIND_CREATE_MEMPOOL(pool, rzB, is_zeroed)             \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__CREATE_MEMPOOL,   \
+                                    pool, rzB, is_zeroed, 0, 0)
+
+/* Destroy the memory pool identified by pool. */
+#define VALGRIND_DESTROY_MEMPOOL(pool)                            \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DESTROY_MEMPOOL,  \
+                                    pool, 0, 0, 0, 0)
+
+/* Associate the piece of memory at addr, of the given size, with pool. */
+#define VALGRIND_MEMPOOL_ALLOC(pool, addr, size)                  \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MEMPOOL_ALLOC,    \
+                                    pool, addr, size, 0, 0)
+
+/* Disassociate the piece of memory at addr from the memory pool pool. */
+#define VALGRIND_MEMPOOL_FREE(pool, addr)                         \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MEMPOOL_FREE,     \
+                                    pool, addr, 0, 0, 0)
+
+/* Disassociate any pieces of pool outside the range given by addr and size. */
+#define VALGRIND_MEMPOOL_TRIM(pool, addr, size)                   \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MEMPOOL_TRIM,     \
+                                    pool, addr, size, 0, 0)
+
+/* Tell Valgrind that the memory pool anchored at poolA has moved to poolB. */
+#define VALGRIND_MOVE_MEMPOOL(poolA, poolB)                       \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MOVE_MEMPOOL,     \
+                                    poolA, poolB, 0, 0, 0)
+
+/* Resize and/or move a piece of pool: was at addrA, now at addrB with size. */
+#define VALGRIND_MEMPOOL_CHANGE(pool, addrA, addrB, size)         \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__MEMPOOL_CHANGE,   \
+                                    pool, addrA, addrB, size, 0)
+
+/* Return 1 if the mempool identified by pool exists, else 0. */
+#define VALGRIND_MEMPOOL_EXISTS(pool)                             \
+    (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                  \
+                               VG_USERREQ__MEMPOOL_EXISTS,        \
+                               pool, 0, 0, 0, 0)
+
+/* Mark the memory from start to end as being a stack.  Returns a stack id. */
+#define VALGRIND_STACK_REGISTER(start, end)                       \
+    (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                  \
+                               VG_USERREQ__STACK_REGISTER,        \
+                               start, end, 0, 0, 0)
+
+/* Unmark the piece of memory associated with stack id `id` (as returned by
+   VALGRIND_STACK_REGISTER) as being a stack. */
+#define VALGRIND_STACK_DEREGISTER(id)                             \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STACK_DEREGISTER, \
+                                    id, 0, 0, 0, 0)
+
+/* Change the start and end address of the stack identified by id. */
+#define VALGRIND_STACK_CHANGE(id, start, end)                     \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STACK_CHANGE,     \
+                                    id, start, end, 0, 0)
+
+/* Load PDB debug info for Wine PE image_map; passes all four args through. */
+#define VALGRIND_LOAD_PDB_DEBUGINFO(fd, ptr, total_size, delta)     \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__LOAD_PDB_DEBUGINFO, \
+                                    fd, ptr, total_size, delta, 0)
+
+/* Map the code address addr to a source file name and line number.
+   buf64 must point to a 64-byte buffer in the caller's address space.
+   The result is written there and is guaranteed to be zero terminated.
+   If no info is found, the first byte is set to zero. */
+#define VALGRIND_MAP_IP_TO_SRCLOC(addr, buf64)                    \
+    (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                  \
+                               VG_USERREQ__MAP_IP_TO_SRCLOC,      \
+                               addr, buf64, 0, 0, 0)
+
+/* Disable error reporting for this thread (sends 1 with
+   VG_USERREQ__CHANGE_ERR_DISABLEMENT).  Behaves in a stack-like way, so
+   you can safely call this multiple times provided that
+   VALGRIND_ENABLE_ERROR_REPORTING is called the same number of times to
+   re-enable reporting.  The first call disables reporting; later calls
+   only increase the number of VALGRIND_ENABLE_ERROR_REPORTING calls
+   needed to re-enable it.  Child threads do not inherit this setting
+   from their parents -- they are always created with reporting enabled. */
+#define VALGRIND_DISABLE_ERROR_REPORTING                                \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__CHANGE_ERR_DISABLEMENT, \
+                                    1, 0, 0, 0, 0)
+
+/* Re-enable error reporting (sends -1 with the same request), as per the
+   comments on VALGRIND_DISABLE_ERROR_REPORTING. */
+#define VALGRIND_ENABLE_ERROR_REPORTING                                 \
+    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__CHANGE_ERR_DISABLEMENT, \
+                                    -1, 0, 0, 0, 0)
+
+#undef PLAT_x86_darwin   /* drop PLAT_* before the guard closes */
+#undef PLAT_amd64_darwin
+#undef PLAT_x86_win32
+#undef PLAT_x86_linux
+#undef PLAT_amd64_linux
+#undef PLAT_ppc32_linux
+#undef PLAT_ppc64_linux
+#undef PLAT_arm_linux
+#undef PLAT_s390x_linux
+
+#endif   /* __VALGRIND_H */