From f04e039dca9d5c88dfdcddb8e1ddcd0d55f08393 Mon Sep 17 00:00:00 2001
From: Peter Rutenbar <pruten@gmail.com>
Date: Mon, 10 Nov 2014 20:40:54 -0500
Subject: [PATCH] Promoting newfpu to be the current fpu (aka fpu.c)

Just merged the newfpu branch to master. Almost everything
is implemented, but there are clearly still lots of problems.

(According to xcalc, sin(3.1415927) == 0.054803665, which is
pretty far beyond the acceptable error margin.)
---
 core/Makefile |   85 +-
 core/fpu.c    | 4082 ++++++++++++++++++++++++++++++++++---------------
 core/newfpu.c | 3105 -------------------------------------
 core/oldfpu.c | 1539 +++++++++++++++++++
 4 files changed, 4372 insertions(+), 4439 deletions(-)
 delete mode 100644 core/newfpu.c
 create mode 100644 core/oldfpu.c

diff --git a/core/Makefile b/core/Makefile
index 975d695..cd2e969 100644
--- a/core/Makefile
+++ b/core/Makefile
@@ -1,85 +1,21 @@
 
 CC = clang
-CFLAGS = -O3 -ggdb -flto -Wno-deprecated-declarations
-# CFLAGS = -O0 -ggdb -Wno-deprecated-declarations
 
+# These targets are commands, not files ("debugger" is also a directory name).
+.PHONY: all shoebill make_gui debugger make_core clean
+
+all: shoebill
 
-DEPS = mc68851.h shoebill.h Makefile macro.pl
-NEED_DECODER = cpu dis
-NEED_PREPROCESSING = adb mc68851 mem via floppy core_api newfpu
-NEED_NOTHING = atrap_tab coff exception macii_symbols redblack scsi video filesystem alloc_pool toby_frame_buffer sound ethernet SoftFloat/softfloat
+shoebill: make_gui debugger
 
-# Object files that can be compiled directly from the source
-OBJ_NEED_NOTHING = $(patsubst %,$(TEMP)/%.o,$(NEED_NOTHING))
+make_gui: make_core
+	xcodebuild -project gui/Shoebill.xcodeproj SYMROOT=build
 
-# Object files than need preprocessing with macro.pl
-OBJ_NEED_PREPROCESSING = $(patsubst %,$(TEMP)/%.o,$(NEED_PREPROCESSING))
+debugger: make_core
+	$(MAKE) -C debugger
 
-# Object files that depend on the instruction decoder
-OBJ_NEED_DECODER = $(patsubst %,$(TEMP)/%.o,$(NEED_DECODER))
-
-# Files that NEED_DECODER also NEED_PREPROCESSING
-POST_PREPROCESSING = $(patsubst %,$(TEMP)/%.post.c,$(NEED_PREPROCESSING)) $(patsubst %,$(TEMP)/%.post.c,$(NEED_DECODER))
-
-
-
-# All the object files compiled for x86_64
-OBJ_x86_64 = $(OBJ_NEED_NOTHING) $(OBJ_NEED_PREPROCESSING) $(OBJ_NEED_DECODER)
-
-# The object files compiled for i386 (the same as x86_64 files, but with .i386 appended)
-OBJ_i386 = $(patsubst %,%.i386,$(OBJ_x86_64))
-
-
-MACRO = perl macro.pl
-
-TEMP = ../intermediates
-
-
-all: $(TEMP)/libshoebill_core.a 
-
-$(TEMP)/libshoebill_core.a: $(TEMP) $(DEPS) $(OBJ_x86_64)
-	libtool -static -v -o $(TEMP)/libshoebill_core.a.x86_64 $(OBJ_x86_64)
-	libtool -static -v -o $(TEMP)/libshoebill_core.a.i386 $(OBJ_i386)
-	lipo -create -output $(TEMP)/libshoebill_core.a $(TEMP)/libshoebill_core.a.x86_64 $(TEMP)/libshoebill_core.a.i386
-
-
-# Split object files into i386/x86_64 versions, since it seems that libtool is unable to 
-# link a static universal library for -O4 object files.
-# x86_64 object files have the form "intermediates/<file_name>.o
-# i386 object files have the form "intermediates/<file_name>.o.i386
-
-# Build object files
-$(OBJ_NEED_NOTHING): $(TEMP)/%.o: %.c $(DEPS)
-	$(CC) -c -arch x86_64 $(CFLAGS) $< -o $@
-	$(CC) -c -arch i386 $(CFLAGS) $< -o $@.i386
-
-$(OBJ_NEED_PREPROCESSING): $(TEMP)/%.o: $(TEMP)/%.post.c $(DEPS)
-	$(CC) -c -arch x86_64 $(CFLAGS) $< -o $@
-	$(CC) -c -arch i386 $(CFLAGS) $< -o $@.i386
-
-$(OBJ_NEED_DECODER): $(TEMP)/%.o: $(TEMP)/%.post.c $(DEPS) $(TEMP)/dis_decoder_guts.c $(TEMP)/inst_decoder_guts.c
-	$(CC) -c -arch x86_64 $(CFLAGS) $< -o $@
-	$(CC) -c -arch i386 $(CFLAGS) $< -o $@.i386
-
-# Preprocess C files 
-$(POST_PREPROCESSING): $(TEMP)/%.post.c: %.c $(DEPS)
-	$(MACRO) $< $@
-
-# Generate instruction decoders
-$(TEMP)/inst_decoder_guts.c: $(TEMP)/decoder_gen $(DEPS)
-	$(TEMP)/decoder_gen inst $(TEMP)/
-$(TEMP)/dis_decoder_guts.c: $(TEMP)/decoder_gen $(DEPS)
-	$(TEMP)/decoder_gen dis $(TEMP)/
-
-# Compile the decoder generator
-$(TEMP)/decoder_gen: decoder_gen.c $(DEPS)
-	$(CC) decoder_gen.c -o $(TEMP)/decoder_gen
-
-
-$(TEMP):
-	mkdir -p $(TEMP)
-	mkdir -p $(TEMP)/SoftFloat
+make_core:
+	$(MAKE) -C core -j 4
 
 clean:
-	rm -rf $(TEMP)
-
+	rm -rf intermediates gui/build
diff --git a/core/fpu.c b/core/fpu.c
index 03b4e73..da7e072 100644
--- a/core/fpu.c
+++ b/core/fpu.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Peter Rutenbar <pruten@gmail.com>
+ * Copyright (c) 2013-2014, Peter Rutenbar <pruten@gmail.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -23,525 +23,2600 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-
 #include <stdio.h>
+#include <string.h>
 #include <assert.h>
-#include <fenv.h>
-#include <float.h>
-#include <math.h>
 #include "../core/shoebill.h"
+#include "../core/SoftFloat/softfloat.h"
 
-extern struct dis_t dis;
-extern uint16_t dis_op;
+#pragma mark Structures and macros
+
+// Mode control byte
+#define mc_rnd  (fpu->fpcr.b._mc_rnd)
+#define mc_prec (fpu->fpcr.b._mc_prec)
+
+// Exception enable byte
+#define ee_inex1 (fpu->fpcr.b._ee_inex1)
+#define ee_inex2 (fpu->fpcr.b._ee_inex2)
+#define ee_dz    (fpu->fpcr.b._ee_dz)
+#define ee_unfl  (fpu->fpcr.b._ee_unfl)
+#define ee_ovfl  (fpu->fpcr.b._ee_ovfl)
+#define ee_operr (fpu->fpcr.b._ee_operr)
+#define ee_snan  (fpu->fpcr.b._ee_snan)
+#define ee_bsun  (fpu->fpcr.b._ee_bsun)
+
+// Accrued exception byte
+#define ae_inex (fpu->fpsr.b._ae_inex)
+#define ae_dz   (fpu->fpsr.b._ae_dz)
+#define ae_unfl (fpu->fpsr.b._ae_unfl)
+#define ae_ovfl (fpu->fpsr.b._ae_ovfl)
+#define ae_iop  (fpu->fpsr.b._ae_iop)
+
+// Exception status byte
+#define es_inex1 (fpu->fpsr.b._es_inex1)
+#define es_inex2 (fpu->fpsr.b._es_inex2)
+#define es_dz    (fpu->fpsr.b._es_dz)
+#define es_unfl  (fpu->fpsr.b._es_unfl)
+#define es_ovfl  (fpu->fpsr.b._es_ovfl)
+#define es_operr (fpu->fpsr.b._es_operr)
+#define es_snan  (fpu->fpsr.b._es_snan)
+#define es_bsun  (fpu->fpsr.b._es_bsun)
+
+// Quotient byte
+#define qu_quotient (fpu->fpsr.b._qu_quotient)
+#define qu_s        (fpu->fpsr.b._qu_s) /* quotient sign */
+
+// Condition codes
+#define cc_nan (fpu->fpsr.b._cc_nan)
+#define cc_i (fpu->fpsr.b._cc_i)
+#define cc_z (fpu->fpsr.b._cc_z)
+#define cc_n (fpu->fpsr.b._cc_n)
 
-#define FPU_JUMP_EMU 0
-#define FPU_JUMP_DIS 1
-typedef void (fpu_func_t)(uint16_t, uint16_t);
 
 typedef struct {
-    fpu_func_t *emu, *dis;
-    const char *name;
-} fpu_inst_t;
+    uint32_t fpiar; // FPU instruction address register
+    
+    union { // fpcr, fpu control register
+        struct {
+            // Mode control byte
+            uint16_t _mc_zero : 4; // zero/dummy
+            uint16_t _mc_rnd  : 2; // rounding mode (passed to _set_rounding_mode)
+            uint16_t _mc_prec : 2; // rounding precision
+            // Exception enable byte
+            uint16_t _ee_inex1 : 1; // inexact decimal input
+            uint16_t _ee_inex2 : 1; // inexact operation
+            uint16_t _ee_dz    : 1; // divide by zero
+            uint16_t _ee_unfl  : 1; // underflow
+            uint16_t _ee_ovfl  : 1; // overflow
+            uint16_t _ee_operr : 1; // operand error
+            uint16_t _ee_snan  : 1; // signalling not a number
+            uint16_t _ee_bsun  : 1; // branch/set on unordered
+        } b;
+        
+        uint16_t raw;
+    } fpcr;
+    
+    union { // fpsr, fpu status register
+        struct {
+            // Accrued exception byte
+            uint32_t _dummy1  : 3; // dummy/zero
+            uint32_t _ae_inex : 1; // inexact
+            uint32_t _ae_dz   : 1; // divide by zero
+            uint32_t _ae_unfl : 1; // underflow
+            uint32_t _ae_ovfl : 1; // overflow
+            uint32_t _ae_iop  : 1; // invalid operation
+            // Exception status byte
+            uint32_t _es_inex1 : 1; // inexact decimal input
+            uint32_t _es_inex2 : 1; // inexact operation
+            uint32_t _es_dz    : 1; // divide by zero
+            uint32_t _es_unfl  : 1; // underflow
+            uint32_t _es_ovfl  : 1; // overflow
+            uint32_t _es_operr : 1; // operand error
+            uint32_t _es_snan  : 1; // signalling not a number
+            uint32_t _es_bsun  : 1; // branch/set on unordered
+            // Quotient byte
+            uint32_t _qu_quotient : 7; // quotient bits
+            uint32_t _qu_s        : 1; // quotient sign
+            // Condition code byte
+            uint32_t _cc_nan  : 1; // not a number
+            uint32_t _cc_i    : 1; // infinity
+            uint32_t _cc_z    : 1; // zero
+            uint32_t _cc_n    : 1; // negative
+            uint32_t _dummy2  : 4; // dummy/zero
+        } b;
+        uint32_t raw;
+    } fpsr;
+    
+    floatx80 fp[8]; // 80 bit floating point general registers
+    
+    // State for the static fmath instruction implementations
+    float128 source, dest, result;
+    _Bool write_back;
+    uint8_t fmath_op;
+} fpu_state_t;
 
-~newmacro(create_fpu_jump_table, 0, {
-    my $names = [
-        'unknown', 'fabs', 'facos', 'fadd', 'fasin', 'fatan', 'fatanh', 'fbcc', 'fcmp', 'fcos', 'fcosh',
-        'fdbcc', 'fdiv', 'fetox', 'fetoxm1', 'fgetexp', 'fgetman', 'fint', 'fintrz', 'flog10', 'flog2',
-        'flogn', 'flognp1', 'fmod', 'fmove', 'fmovecr', 'fmovem', 'fmovem_control', 'fmul', 'fneg', 'fnop',
-        'frem', 'frestore', 'fsave', 'fscale', 'fscc', 'fsgldiv', 'fsglmul', 'fsin', 'fsincos', 'fsinh',
-        'fsqrt', 'fsub', 'ftan', 'ftanh', 'ftentox', 'ftrapcc', 'ftst', 'ftwotox'
-    ];
-    my $fpu_enum = "typedef enum {\n";
-    foreach my $n (@$names) {
-        $fpu_enum .= "\tfpu_inst_$n,\n";
-    }
-    $fpu_enum .= "\nfpu_inst_max} fpu_inst_name_t;";
-    
-    my $fpu_table = "fpu_inst_t fpu_inst_table[fpu_inst_max] = {\n";
-    foreach my $n (@$names) {
-        $fpu_table .= "\t{NULL, NULL, \"" . $n . "\"},\n";
-    }
-    $fpu_table = substr($fpu_table, 0, -2);
-    $fpu_table .= "\n};";
-    
-    my $out = "$fpu_enum \n $fpu_table \n";
-    return $out;
-})
+enum rounding_precision_t { // presumably the fpcr.mc_prec encodings - TODO confirm
+    prec_extended = 0,
+    prec_single = 1,
+    prec_double = 2,
+};
 
-~create_fpu_jump_table()
+enum rounding_mode_t { // values of fpcr.mc_rnd, as passed to _set_rounding_mode()
+    mode_nearest = 0, // round to nearest
+    mode_zero = 1,    // round toward zero
+    mode_neg = 2,     // round toward minus infinity
+    mode_pos = 3      // round toward plus infinity
+};
 
-static fpu_inst_name_t fpu_decode_op(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 ttt MMMMMM);
-    
-    if (t) {
-        switch (t) {
-            case 1:
-                if ((M>>3) == 1)
-                    return fpu_inst_fdbcc;
-                else if ((M>>3) == 7)
-                    return fpu_inst_ftrapcc;
-                return fpu_inst_fscc;
-            case 2:
-                if (M==0 && ext == 0)
-                    return fpu_inst_fnop; // same as fbf.w
-                // fall through
-            case 3:
-                return fpu_inst_fbcc;
-            case 4:
-                return fpu_inst_fsave;
-            case 5:
-                return fpu_inst_frestore;
-        }
-        return fpu_inst_unknown;
-    }
-    
-    ~decompose(ext, ccc xxx yyy eeeeeee)
-    
-    switch (c) {
-        case 0: // Reg to reg
-            break;
-        case 1: // unused
-            return fpu_inst_unknown;
-        case 2: // Memory->reg & movec
-            break;
-            
-        case 3: // reg->mem
-            return fpu_inst_fmove;
-            
-        case 4: // mem -> sys ctl registers
-        case 5: // sys ctl registers -> mem
-            return fpu_inst_fmovem_control;
-            
-        case 6: // movem to fp registers
-        case 7: // movem to memory
-            return fpu_inst_fmovem;
-    }
-    
-    // Here c == 0b000 or 010
-    
-    if (M == 0 && ~bmatch(ext, 010 111 xxx xxxxxxx))
-        return fpu_inst_fmovecr;
-    
-    if ((e>>3) == ~b(0110))
-        return fpu_inst_fsincos;
-    
-    switch (e) {
-            
-        case ~b(0000000):
-        case ~b(1000000):
-        case ~b(1000100):
-            return fpu_inst_fmove;
-            
-        case ~b(0000001): return fpu_inst_fint;
-        case ~b(0000010): return fpu_inst_fsinh;
-        case ~b(0000011): return fpu_inst_fintrz;
-        case ~b(0000110): return fpu_inst_flognp1;
-        case ~b(0001000): return fpu_inst_fetoxm1;
-        case ~b(0001001): return fpu_inst_ftanh;
-        case ~b(0001010): return fpu_inst_fatan;
-        case ~b(0001100): return fpu_inst_fasin;
-        case ~b(0001101): return fpu_inst_fatanh;
-        case ~b(0001110): return fpu_inst_fsin;
-        case ~b(0001111): return fpu_inst_ftan;
-        case ~b(0010000): return fpu_inst_fetox;
-        case ~b(0010001): return fpu_inst_ftwotox;
-        case ~b(0010010): return fpu_inst_ftentox;
-        case ~b(0010100): return fpu_inst_flogn;
-        case ~b(0010101): return fpu_inst_flog10;
-        case ~b(0010110): return fpu_inst_flog2;
-        case ~b(0011001): return fpu_inst_fcosh;
-        case ~b(0011100): return fpu_inst_facos;
-        case ~b(0011101): return fpu_inst_fcos;
-        case ~b(0011110): return fpu_inst_fgetexp;
-        case ~b(0011111): return fpu_inst_fgetman;
-        case ~b(0100001): return fpu_inst_fmod;
-        case ~b(0100100): return fpu_inst_fsgldiv;
-        case ~b(0100111): return fpu_inst_fsglmul;
-        case ~b(0100101): return fpu_inst_frem;
-        case ~b(0100110): return fpu_inst_fscale;
-        case ~b(0111000): return fpu_inst_fcmp;
-        case ~b(0111010): return fpu_inst_ftst;
-            
-        case ~b(0011000):
-        case ~b(1011000):
-        case ~b(1011100):
-            return fpu_inst_fabs;
-            
-        case ~b(0100010):
-        case ~b(1100010):
-        case ~b(1100110):
-            return fpu_inst_fadd;
-            
-        case ~b(0100000):
-        case ~b(1100000):
-        case ~b(1100100):
-            return fpu_inst_fdiv;
-            
-            
-        case ~b(0100011):
-        case ~b(1100011):
-        case ~b(1100111):
-            return fpu_inst_fmul;
-            
-        case ~b(0011010):
-        case ~b(1011010):
-        case ~b(1011110):
-            return fpu_inst_fneg;
-            
-        case ~b(0000100):
-        case ~b(1000001):
-        case ~b(1000101):
-            return fpu_inst_fsqrt;
-            
-        case ~b(0101000):
-        case ~b(1101000):
-        case ~b(1101100):
-            return fpu_inst_fsub;
-    }
-    
-    return fpu_inst_unknown;
-    
-}
+/*
+ * 0 L     long word integer
+ * 1 S     single precision real
+ * 2 X     extended precision real
+ * 3 P{#k} packed decimal real with static k factor
+ * 4 W     word integer
+ * 5 D     double precision real
+ * 6 B     byte integer
+ * 7 P{Dn} packed decimal real with dynamic k factor
+ */
+static const uint8_t _format_sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12}; // bytes, indexed by format code
+enum {
+    format_L = 0,
+    format_S = 1,
+    format_X = 2,
+    format_Ps = 3,
+    format_W = 4,
+    format_D = 5,
+    format_B = 6,
+    format_Pd = 7
+} fpu_formats; // NOTE(review): this declares a variable of the anonymous enum type, not a type name
 
+#define fpu_get_state_ptr() fpu_state_t *fpu = (fpu_state_t*)shoe.fpu_state
 #define nextword() ({const uint16_t w=lget(shoe.pc,2); if (shoe.abort) {return;}; shoe.pc+=2; w;})
+#define nextlong() ({const uint32_t L=lget(shoe.pc,4); if (shoe.abort) {return;}; shoe.pc+=4; L;})
 #define verify_supervisor() {if (!sr_s()) {throw_privilege_violation(); return;}}
 
-void dis_fpu_decode ()
+#pragma mark FPU exception stuff
+enum fpu_vector_t {
+    fpu_vector_ftrapcc = 7,
+    fpu_vector_fline = 11,
+    fpu_vector_coprocessor_protocol_violation = 13, // won't be using this one
+    fpu_vector_bsun = 48,
+    fpu_vector_inexact = 49,
+    fpu_vector_divide_by_zero = 50,
+    fpu_vector_underflow = 51,
+    fpu_vector_operr = 52,
+    fpu_vector_overflow = 53,
+    fpu_vector_snan = 54
+};
+
+/*
+ * Exception vector number for each exception bit of fpsr/fpcr,
+ * one entry per bit in the order named by the trailing comments.
+ */
+const uint8_t _exception_bit_to_vector[8] = {
+    fpu_vector_bsun,           // bsun
+    fpu_vector_snan,           // snan
+    fpu_vector_operr,          // operr
+    fpu_vector_overflow,       // ovfl
+    fpu_vector_underflow,      // unfl
+    fpu_vector_divide_by_zero, // dz
+    fpu_vector_inexact,        // inex2
+    fpu_vector_inexact,        // inex1
+};
+
+static void throw_fpu_pre_instruction_exception(enum fpu_vector_t vector)
 {
-    ~decompose(dis_op, 1111 001 xxx 000000);
+    throw_frame_zero(shoe.orig_sr, shoe.orig_pc, vector);
+}
+/*
+ * Note: I may be able to get away without implementing the
+ *       mid-instruction exception.
+ */
+
+/*
+ * _bsun_test() is called by every inst_f*cc instruction
+ * to test whether the bsun exception is enabled, throw an
+ * exception if so, and otherwise just set the appropriate
+ * bit in fpsr, and update the accrued exception byte.
+ */
+static _Bool _bsun_test()
+{
+    fpu_get_state_ptr();
     
-    fpu_inst_name_t name;
-    uint16_t ext = 0;
+    // BSUN counts against the IOP accrued exception bit
+    ae_iop = 1;
     
-    if (x == 4)
-        name = fpu_inst_fsave;
-    else if (x == 5)
-        name = fpu_inst_frestore;
-    else {
-        ext = dis_next_word();
-        name = fpu_decode_op(dis_op, ext);
+    // Set the BSUN exception status bit
+    es_bsun = 1;
+    
+    // If the BSUN exception isn't enabled, then we can just return
+    if (!ee_bsun)
+        return 0; // 0 -> elected not to throw an exception
+    
+    throw_fpu_pre_instruction_exception(fpu_vector_bsun);
+    return 1;
+}
+
+static void _throw_illegal_instruction()
+{
+    assert(!"throw_illegal_instruction!");
+}
+
+#pragma mark Float format translators (to/from big-endian motorola format)
+
+static void _floatx80_to_int8(floatx80 *f, uint8_t *ptr)
+{
+    /* Only the low 8 bits of the 32-bit integer conversion are stored */
+    ptr[0] = ((uint32_t)floatx80_to_int32(*f)) & 0xff;
+}
+
+static void _floatx80_to_int16(floatx80 *f, uint8_t *ptr)
+{
+    const uint32_t word = floatx80_to_int32(*f); /* only the low 16 bits are stored */
+    ptr[0] = (uint8_t)(word >> 8); /* high byte first */
+    ptr[1] = (uint8_t)word;
+}
+
+static void _floatx80_to_int32(floatx80 *f, uint8_t *ptr)
+{
+    /* Store the 32-bit integer conversion most-significant byte first */
+    const uint32_t value = floatx80_to_int32(*f);
+    int byte;
+    for (byte = 0; byte < 4; byte++)
+        ptr[byte] = (uint8_t)(value >> (24 - 8 * byte));
+}
+
+static void _floatx80_to_single(floatx80 *f, uint8_t *ptr)
+{
+    /* Store the raw float32 bit pattern most-significant byte first */
+    const float32 bits = floatx80_to_float32(*f);
+    int byte;
+    for (byte = 0; byte < 4; byte++)
+        ptr[byte] = (uint8_t)(bits >> (24 - 8 * byte));
+}
+
+static void _floatx80_to_double(floatx80 *f, uint8_t *ptr)
+{
+    /*
+     * Convert to float64, then store its raw bit pattern big-endian:
+     * ptr[0] receives bits 63..56 and ptr[7] receives bits 7..0.
+     */
+    const float64 bits = floatx80_to_float64(*f);
+    int idx;
+
+    for (idx = 0; idx < 8; idx++)
+        ptr[idx] = (uint8_t)(bits >> (56 - 8 * idx));
+}
+
+static void _floatx80_to_extended(floatx80 *f, uint8_t *ptr)
+{
+    /*
+     * 12-byte layout, most-significant byte first: the 16-bit high word,
+     * two zero bytes, then the 64-bit low word.
+     */
+    int idx;
+
+    ptr[0] = (uint8_t)(f->high >> 8);
+    ptr[1] = (uint8_t)f->high;
+    ptr[2] = ptr[3] = 0;
+
+    for (idx = 0; idx < 8; idx++)
+        ptr[4 + idx] = (uint8_t)(f->low >> (56 - 8 * idx));
+}
+
+static float128 _int8_to_intermediate(int8_t byte)
+{
+    return int32_to_float128((int32_t)byte);
+}
+
+static float128 _int16_to_intermediate(int16_t sh)
+{
+    return int32_to_float128((int32_t)sh);
+}
+
+static float128 _int32_to_intermediate(int32_t in)
+{
+    return int32_to_float128(in);
+}
+
+static float128 _single_to_intermediate(uint32_t f)
+{
+    assert(sizeof(uint32_t) == sizeof(float32));
+    return float32_to_float128((float32)f);
+}
+
+/* _double_to_intermediate(d): d is 8 bytes in 68k-native (big-endian) order.
+ * They are combined bytewise, so d needs no particular alignment. */
+static float128 _double_to_intermediate(uint8_t *d)
+{
+    uint64_t raw = 0;
+    assert(sizeof(uint64_t) == sizeof(float64));
+    for (int i = 0; i < 8; i++) raw = (raw << 8) | d[i];
+    return float64_to_float128((float64)raw);
+}
+
+/*
+ * _extended_to_intermediate(e): e needs to be 68k-native order (12 bytes)
+ */
+static float128 _extended_to_intermediate(uint8_t *e)
+{
+    /*
+     * Bytes 0-1 become floatx80.high and bytes 4-11 become floatx80.low;
+     * bytes 2-3 are not used. The low word is combined bytewise because
+     * &e[4] is not guaranteed to be aligned for a uint64_t load.
+     */
+    floatx80 x80 = { .high = (e[0] << 8) | e[1], .low = 0 };
+
+    for (int i = 4; i < 12; i++)
+        x80.low = (x80.low << 8) | e[i];
+    return floatx80_to_float128(x80);
+}
+
+static void _extended_to_floatx80(uint8_t *bytes, floatx80 *f)
+{
+    f->high = (bytes[0] << 8) | bytes[1]; // bytes 0-1, big-endian; bytes 2-3 are not read
+    f->low = ntohll(*(uint64_t*)&bytes[4]); // NOTE(review): assumes &bytes[4] is aligned for a uint64_t load
+}
+
+/*
+ * Set softfloat's rounding mode from an fpcr.mc_rnd value. The two use
+ * different encodings: mc_rnd 2 is toward -inf and 3 is toward +inf.
+ */
+static void _set_rounding_mode(enum rounding_mode_t mode)
+{
+    const int8_t rounding_map[4] = {
+        float_round_nearest_even, float_round_to_zero, // mode_nearest, mode_zero
+        float_round_down, float_round_up               // mode_neg, mode_pos
+    };
+    
+    float_rounding_mode = rounding_map[mode];
+}
+
+#pragma mark EA routines
+
+/*
+ * Commit step of an operand read: applies the address-register update
+ * for the post-increment and pre-decrement addressing modes
+ */
+static void _fpu_read_ea_commit(const uint8_t format)
+{
+    ~decompose(shoe.op, 0000 0000 00 mmmrrr);
+    const uint8_t size = _format_sizes[format];
+
+    switch (m) {
+        case 3: shoe.a[r] += size; break; // post-increment
+        case 4: shoe.a[r] -= size; break; // pre-decrement
+        default: return ; // no register side effect for other modes
+    }
+    /*
+     * Still undecided: for a 1-byte operand through a7 in either of
+     * these modes, should a7 move by 1 or by 2? Trap until that is known.
+     */
+    if ((size == 1) && (r == 7))
+        assert(!"size==1, reg==a7");
+}
+
+static void _fpu_write_ea(uint8_t mr, uint8_t format, floatx80 *f, uint8_t K)
+{
+    fpu_get_state_ptr();
+    
+    const uint8_t m = mr >> 3;
+    const uint8_t r = mr & 7;
+    const uint8_t size = _format_sizes[format];
+    uint8_t buf[12], *ptr = &buf[0];
+    uint32_t addr, i;
+    
+    if ((m == 1) ||
+        ((m == 0) && (size > 4))) {
+        /* If mode==a-reg, or mode==data reg and the size is > 4 bytes, no dice */
+        _throw_illegal_instruction();
+        return ;
     }
-    
-    if (fpu_inst_table[name].dis) {
-        (*fpu_inst_table[name].dis)(dis_op, ext);
+    else if ((m == 7) && (r > 1)) {
+        /* If this is otherwise an illegal addr mode... */
+        _throw_illegal_instruction();
         return ;
     }
     
-    sprintf(dis.str, "%s ???", fpu_inst_table[name].name);
-}
-
-void inst_fpu_decode ()
-{
-    ~decompose(shoe.op, 1111 001 xxx 000000);
+    const _Bool is_nan = ((f->high << 1) == 0xfffe) && f->low;
     
-    fpu_inst_name_t name;
-    uint16_t ext = 0;
+    slog("inst_f fpu_write_ea EA=%u/%u data=%Lf format=%u\n", m, r, 666.0L, format);
     
-    if (x == 4)
-        name = fpu_inst_fsave;
-    else if (x == 5)
-        name = fpu_inst_frestore;
-    else {
-        ext = nextword();
-        name = fpu_decode_op(shoe.op, ext);
-        // "For FPCP instructions that generate FPU exceptions,
-        //  FPIAR is loaded with the address of an instruction before it's executed,
-        //  unless all arithmetic exceptions are disabled."
-        // My take: set fpiar for all instructions except fsave, frestore, and fmovem_control
-        if (name != fpu_inst_fmovem_control)
-            shoe.fpiar = shoe.orig_pc;
+    /* Initialize softfloat's exceptions bits/rounding mode */
+    
+    float_exception_flags = 0;
+    _set_rounding_mode(mc_rnd);
+    
+    /* Convert to the appropriate format */
+    
+    switch (format) {
+        case format_B: {
+            _floatx80_to_int8(f, ptr);
+            break;
+        }
+        case format_W: {
+            _floatx80_to_int16(f, ptr);
+            break;
+        }
+        case format_L: {
+            _floatx80_to_int32(f, ptr);
+            break;
+        }
+        case format_S: {
+            _floatx80_to_single(f, ptr);
+            break;
+        }
+        case format_D: {
+            _floatx80_to_double(f, ptr);
+            break;
+        }
+        case format_X: {
+            _floatx80_to_extended(f, ptr);
+            break;
+        }
+        default: {
+            assert(!"unsupported format (packed something!)");
+        }
     }
     
-    if (fpu_inst_table[name].emu) {
-        (*fpu_inst_table[name].emu)(shoe.op, ext);
-        return ;
+    /* Write to memory */
+    
+    switch (m) {
+        case 0: {
+            if (format == format_B)
+                set_d(r, ptr[0], 1);
+            else if (format == format_W)
+                set_d(r, ntohs(*(uint16_t*)ptr), 2);
+            else if ((format == format_L) || (format == format_S))
+                set_d(r, ntohl(*(uint32_t*)ptr), 4);
+            else
+                assert(!"how did I get here?");
+            goto done;
+        }
+        case 1:
+            assert(!"how did I get here again!");
+            
+        case 2:
+            addr = shoe.a[r];
+            break;
+        case 3:
+            addr = shoe.a[r];
+            assert(!( r==7 && size==1));
+            break;
+        case 4: // pre-decrement
+            addr = shoe.a[r] - size;
+            assert(!( r==7 && size==1));
+            break;
+        default:
+            call_ea_addr(mr);
+            addr = (uint32_t)shoe.dat;
+            break;
     }
     
-    slog("inst_fpu_decode: unhandled instruction: %s op=0x%04x ext = 0x%04x pc=0x%08x\n", fpu_inst_table[name].name, shoe.op, ext, shoe.orig_pc);
-    assert(!"unknown fpu inst");
-    //dbg_state.running = 0;
+    /* Copy the formatted data into *addr */
     
-}
-
-
-void dis_fsave(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 100 MMMMMM);
-    ~decompose(op, 1111 001 100 mmmrrr);
+    slog("inst_f  fpu_write_ea: addr=0x08x data=0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+         addr,
+         ptr[0], ptr[1], ptr[2], ptr[3],
+         ptr[4], ptr[5], ptr[6], ptr[7],
+         ptr[8], ptr[9], ptr[10], ptr[11]);
     
-    if (m == 4)
-        sprintf(dis.str, "fsave -(a%u)", r);
-    else
-        sprintf(dis.str, "fsave %s", decode_ea_addr(M));
-}
-
-void inst_fsave(uint16_t op, uint16_t ext)
-{
-    verify_supervisor();
-    
-    ~decompose(op, 1111 001 100 MMMMMM);
-    ~decompose(op, 1111 001 100 mmmrrr);
-    
-    const uint32_t size = 0x1c; // IDLE frame
-    const uint16_t frame_header = 0xfd18;
-    uint32_t addr;
-    
-    if (m == 4)
-        addr = shoe.a[r] - size;
-    else {
-        call_ea_addr(M);
-        addr = shoe.dat;
+    for (i=0; i<size; i++) {
+        lset(addr + i, 1, buf[i]);
+        if (shoe.abort) return ;
     }
     
-    lset(addr, 2, frame_header);
-    if (shoe.abort)
-        return ;
     
-    if (m == 4)
-        shoe.a[r] = addr;
-}
-
-void dis_frestore(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 101 MMMMMM);
-    ~decompose(op, 1111 001 101 mmmrrr);
+done:
+    /* 
+     * Set exception bits and update pre/post/blah registers.
+     * note: condition codes are not modified
+     */
     
-    if (m == 3)
-        sprintf(dis.str, "frestore (a%u)+", r);
-    else
-        sprintf(dis.str, "frestore %s", decode_ea_addr(M));
-}
-
-void inst_frestore(uint16_t op, uint16_t ext)
-{
-    verify_supervisor();
+    es_bsun = 0;
+    es_snan = 0;
+    es_operr = 0;
+    es_ovfl = 0;
+    es_unfl = 0;
+    es_dz = 0;
+    es_inex2 = 0;
+    es_inex1 = 0;
     
-    ~decompose(op, 1111 001 101 MMMMMM);
-    ~decompose(op, 1111 001 101 mmmrrr);
-    
-    uint32_t addr, size;
-    
-    if (m == 3)
-        addr = shoe.a[r];
-    else {
-        call_ea_addr(M);
-        addr = shoe.dat;
-    }
-    
-    const uint16_t word = lget(addr, 2);
-    if (shoe.abort) return ;
-    
-    // XXX: These frame sizes are different on 68881/68882/68040
-    if ((word & 0xff00) == 0x0000)
-        size = 4; // NULL state frame
-    else if ((word & 0xff) == 0x0018)
-        size = 0x1c; // IDLE state frame
-    else if ((word & 0xff) == 0x00b4)
-        size = 0xb8; // BUSY state frame
-    else {
-        slog("Frestore encountered an unknown state frame 0x%04x\n", word);
-        assert("inst_frestore: bad state frame");
-        return ;
-    }
-    
-    if (m==3) {
-        shoe.a[r] += size;
-        slog("frestore: changing shoe.a[%u] += %u\n", r, size);
-    }
-}
-
-typedef struct {
-    uint8_t inexact;
-    uint8_t dat[4][12];
-} fmovecr_t;
-
-fmovecr_t fmovecr_pi = {1, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x35, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x34, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x34, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x35, };
-fmovecr_t fmovecr_log10_2 = {1, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x98, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x98, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x98, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x99, };
-fmovecr_t fmovecr_e = {1, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9a, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9a, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9a, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9b, };
-fmovecr_t fmovecr_log2_e = {1, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbc, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbb, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbb, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbc, };
-fmovecr_t fmovecr_log10_e = {0, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, };
-fmovecr_t fmovecr_zero = {0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_ln_2 = {1, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xac, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xab, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xab, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xac, };
-fmovecr_t fmovecr_ln_10 = {1, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x17, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x16, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x16, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x17, };
-fmovecr_t fmovecr_10_0 = {0, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_10_1 = {0, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_10_2 = {0, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_10_4 = {0, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_10_8 = {0, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_10_16 = {0, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_10_32 = {1, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9e, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9d, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9d, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9e, };
-fmovecr_t fmovecr_10_64 = {1, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd5, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd5, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd5, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd6, };
-fmovecr_t fmovecr_10_128 = {1, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xe0, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xdf, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xdf, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xe0, };
-fmovecr_t fmovecr_10_256 = {1, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8e, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8d, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8d, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8e, };
-fmovecr_t fmovecr_10_512 = {1, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc7, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc6, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc6, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc7, };
-fmovecr_t fmovecr_10_1024 = {1, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x17, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x17, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x17, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x18, };
-fmovecr_t fmovecr_10_2048 = {1, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe5, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe5, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe5, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe6, };
-fmovecr_t fmovecr_10_4096 = {1, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9b, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9a, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9a, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9b, };
-
-const int _fpu_round_map[4] = {FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD};
-#define fpu_set_round() assert(0 == fesetround(_fpu_round_map[shoe.fpcr.b.mc_rnd]))
-#define fpu_reset_round() assert(0 == fesetround(FE_TONEAREST))
-
-static void fpu_set_cc(long double f)
-{
-    // Set condition codes
-    shoe.fpsr.raw &= 0x00ffffff;
-    shoe.fpsr.b.cc_nan = (0 != isnan(f));
-    if (!shoe.fpsr.b.cc_nan) {
-        shoe.fpsr.b.cc_n = (0 != signbit(f));
-        if (isinf(f))
-            shoe.fpsr.b.cc_i = 1;
-        else
-            shoe.fpsr.b.cc_z = (f == 0.0);
-    }
-}
-
-static long double fpu_set_reg(long double f, uint8_t r)
-{
-    // Round the number according to the mode control byte
-    {
-        fpu_set_round();
+    switch (format) {
+        format_B:
+        format_W:
+        format_L:
+            /* Set snan, operr, and/or inex2 */
+            es_snan = is_nan;
+            es_operr = ((float_exception_flags & float_flag_invalid) != 0);
+            es_inex2 = ((float_exception_flags & float_flag_inexact) != 0);
+            break;
         
-        if (shoe.fpcr.b.mc_prec == 1) {
-            const float tmp = (float)f;
-            f = tmp;
-        } else if (shoe.fpcr.b.mc_prec == 2) {
-            const double tmp = (double)f;
-            f = tmp;
+        format_S:
+        format_D:
+        format_X:
+            /* Set snan, ovfl, unfl, and/or inex2 */
+            es_snan = is_nan;
+            es_ovfl = ((float_exception_flags & float_flag_overflow) != 0);
+            es_unfl = ((float_exception_flags & float_flag_underflow) != 0);
+            es_inex2 = ((float_exception_flags & float_flag_inexact) != 0);
+            break;
+            
+        format_Pd:
+        format_Ps:
+            /* Set snan, operr, and/or inex2 */
+            assert(!"you better implement packed formats");
+            break;
+    }
+    
+    /* Update the accrued exception bits */
+    ae_iop |= es_bsun | es_snan | es_operr;
+    ae_ovfl |= es_ovfl;
+    ae_unfl |= (es_unfl & es_inex2); // yes, &
+    ae_dz |= es_dz;
+    ae_inex |= es_inex1 | es_inex2 | es_ovfl;
+    
+    /* Are any exceptions both set and enabled? */
+    if (fpu->fpsr.raw & fpu->fpcr.raw & 0x0000ff00) {
+        /*
+         * Then we need to throw an exception.
+         * The exception is sent to the vector for
+         * the highest priority exception, and the priority
+         * order is (high->low) bsan, snan, operr, ovfl, unfl, dz, inex2/1
+         * (which is the order of the bits in fpsr/fpcr).
+         * Iterate over the bits in order, and throw the
+         * exception to whichever bit is set first.
+         */
+        uint8_t i, throwable = (fpu->fpsr.raw & fpu->fpcr.raw) >> 8;
+        
+        assert(throwable);
+        for (i=0; 1; i++) {
+            if (throwable & 0x80)
+                break;
+            throwable <<= 1;
         }
         
-        fpu_reset_round();
+        /*
+         * Convert the exception bit position
+         * to the correct vector number, and throw
+         * a (pre-instruction) exception.
+         */
+        throw_fpu_pre_instruction_exception(_exception_bit_to_vector[i]);
+        
+        return ;
     }
     
-    // Store it
-    shoe.fp[r] = f;
-    return f;
+    /* Finalize registers, and we're done */
+    
+    if (m == 3)
+        shoe.a[r] += size;
+    else if (m == 4)
+        shoe.a[r] -= size;
 }
 
-// fpu_set_reg_cc() and fpu_set_ea() set the condition codes. (what else should they set?)
-static void fpu_set_reg_cc(long double f, uint8_t r)
+/*
+ * Read this instruction's source operand - addressed by the EA in the
+ * low 6 bits of shoe.op (mode m, register r) - as `format`, and store it
+ * in *result as a float128.
+ *
+ * Side effects visible here: shoe.pc advances by the operand size for an
+ * immediate operand (m=7, r=4), and ea_addr() resolves the other modes.
+ * shoe.a[r] is NOT modified here for (An)+ / -(An).
+ * NOTE(review): the original note says fpu_read_ea_commit applies that
+ * post-increment / pre-decrement - confirm against that function.
+ *
+ * Returns 1 on success, 0 if we're aborting (an illegal instruction was
+ * thrown, or shoe.abort was set after ea_addr() / lget()).
+ */
+static _Bool _fpu_read_ea(const uint8_t format, float128 *result)
 {
-    fpu_set_cc(fpu_set_reg(f, r));
-}
+    fpu_get_state_ptr();
+    
+    ~decompose(shoe.op, 0000 0000 00 mmmrrr);
+    
+    const uint8_t size = _format_sizes[format];
+    uint32_t addr = 0;
+    
+    /*
+     * Step 1: find the effective address, store it in addr
+     *         (or the actual data, if unavailable)
+     */
+    
+    slog("FPU: read_ea: mr=%u%u f=%c ", m, r, "lsxpwdb?"[format]);
+    
+    switch (m) {
+        case 0:
+            if (format == format_S)
+                *result = _single_to_intermediate(shoe.d[r]);
+            else if (format == format_B)
+                *result = _int8_to_intermediate(shoe.d[r] & 0xff);
+            else if (format == format_W)
+                *result = _int16_to_intermediate(shoe.d[r] & 0xffff);
+            else if (format == format_L)
+                *result = int32_to_float128(shoe.d[r]);
+            else {
+                /*
+                 * No other format can be used with a data register
+                 * (because they require >4 bytes)
+                 */
+                _throw_illegal_instruction();
+                return 0;
+            }
+            slog("raw=0x%x", chop(shoe.d[r], size));
+            goto got_data;
+            
+        case 1:
+            /* Address registers can't be used */
+            _throw_illegal_instruction();
+            return 0;
+        
+        case 3:
+            /* (An)+ : read at An; An itself is left untouched here */
+            addr = shoe.a[r];
+            assert(!( r==7 && size==1));
+            goto got_address;
+            
+        case 4:
+            /* -(An) : read at An - size; An itself is left untouched here */
+            addr = shoe.a[r] - size;
+            assert(!( r==7 && size==1));
+            goto got_address;
+            
+        case 7:
+            if (r == 4) {
+                /* Immediate: the operand sits in the instruction stream */
+                addr = shoe.pc;
+                shoe.pc += size;
+                goto got_address;
+            }
+            
+            // fall through to default:
+            
+        default: {
+            ~decompose(shoe.op, 0000 0000 00 MMMMMM);
+            shoe.mr = M;
+            ea_addr();
+            if (shoe.abort)
+                return 0;
+            
+            addr = (uint32_t)shoe.dat;
+            goto got_address;
+        }
 
-static void x87_to_motorola(long double x87, uint8_t motorola[12])
-{
-    uint8_t *x87_ptr = (uint8_t*)&x87;
-    motorola[0] = x87_ptr[9];
-    motorola[1] = x87_ptr[8];
-    motorola[2] = 0;
-    motorola[3] = 0;
-    motorola[4] = x87_ptr[7];
-    motorola[5] = x87_ptr[6];
-    motorola[6] = x87_ptr[5];
-    motorola[7] = x87_ptr[4];
-    motorola[8] = x87_ptr[3];
-    motorola[9] = x87_ptr[2];
-    motorola[10] = x87_ptr[1];
-    motorola[11] = x87_ptr[0];
-}
-
-static long double motorola_to_x87(const uint8_t motorola[12])
-{
-    uint8_t x87[12];
-    
-    x87[11] = 0;
-    x87[10] = 0;
-    x87[9] = motorola[0];
-    x87[8] = motorola[1];
-    
-    x87[7] = motorola[4];
-    x87[6] = motorola[5];
-    x87[5] = motorola[6];
-    x87[4] = motorola[7];
-    x87[3] = motorola[8];
-    x87[2] = motorola[9];
-    x87[1] = motorola[10];
-    x87[0] = motorola[11];
-    return *(long double*)&x87[0];
-}
-
-void inst_fmovecr(uint16_t op, uint16_t ext)
-{
-    ~decompose(ext, 010111 rrr xxxxxxx);
-    
-    fmovecr_t *c = &fmovecr_zero;
-    
-    switch (x) {
-        case 0x00: c = &fmovecr_pi; break;
-        case 0x0b: c = &fmovecr_log10_2; break;
-        case 0x0c: c = &fmovecr_e; break;
-        case 0x0d: c = &fmovecr_log2_e; break;
-        case 0x0e: c = &fmovecr_log10_e; break;
-        case 0x0f: c = &fmovecr_zero; break;
-        case 0x30: c = &fmovecr_ln_2; break;
-        case 0x31: c = &fmovecr_ln_10; break;
-        case 0x32: c = &fmovecr_10_0; break;
-        case 0x33: c = &fmovecr_10_1; break;
-        case 0x34: c = &fmovecr_10_2; break;
-        case 0x35: c = &fmovecr_10_4; break;
-        case 0x36: c = &fmovecr_10_8; break;
-        case 0x37: c = &fmovecr_10_16; break;
-        case 0x38: c = &fmovecr_10_32; break;
-        case 0x39: c = &fmovecr_10_64; break;
-        case 0x3a: c = &fmovecr_10_128; break;
-        case 0x3b: c = &fmovecr_10_256; break;
-        case 0x3c: c = &fmovecr_10_512; break;
-        case 0x3d: c = &fmovecr_10_1024; break;
-        case 0x3e: c = &fmovecr_10_2048; break;
-        case 0x3f: c = &fmovecr_10_4096; break;
     }
     
-    // The constants in the 68881's ROM must be in the "intermediate" format, because they're rounded differently based on fpcr.rnd
-    const long double f = motorola_to_x87(c->dat[shoe.fpcr.b.mc_rnd]);
+got_address:
     
-    fpu_set_reg_cc(f, r);
+    /*
+     * Step 2: Load the data from the effective address
+     */
     
-    slog("inst_fmovecr: set fp%u=%.30Lg\n", r, shoe.fp[r]);
+    slog("raw=0x");
+    if (size <= 4) {
+        const uint32_t raw = lget(addr, size);
+        if (shoe.abort)
+            return 0;
+        /* Debug output goes through slog() like the rest of this trace */
+        slog("%x ", raw);
+        switch (format) {
+            case format_B:
+                *result = _int8_to_intermediate(raw & 0xff);
+                break;
+            case format_W:
+                *result = _int16_to_intermediate(raw & 0xffff);
+                break;
+            case format_L:
+                *result = _int32_to_intermediate(raw);
+                break;
+            case format_S:
+                *result = _single_to_intermediate(raw);
+                break;
+            default:
+                assert(0); /* never get here */
+        }
+    }
+    else { // if (size > 4) -> if format is double, extended, or packed
+        uint8_t buf[12];
+        uint32_t i;
+        
+        for (i = 0; i < size; i++) {
+            buf[i] = lget(addr + i, 1);
+            /* Check for an abort before logging, so a failed read isn't traced */
+            if (shoe.abort)
+                return 0;
+            slog("%02x", buf[i]);
+        }
+        
+        switch (format) {
+            case format_D:
+                *result = _double_to_intermediate(buf);
+                break;
+            case format_X:
+                *result = _extended_to_intermediate(buf);
+                break;
+            case format_Ps:
+            // case format_Pd: // not possible as a src specifier
+                // FIXME: implement packed formats
+                assert(!"Somebody tried to use a packed format!\n");
+                // _throw_illegal_instruction();
+                // return 0;
+            default:
+                assert(0); // never get here
+        }
+    }
     
-    // fpu_finalize_exceptions();
+got_data:
+    slog("\n");
+    return 1;
 }
 
-void dis_fmovecr(uint16_t op, uint16_t ext)
+#pragma mark Hacky low-precision transcendental implementations
+/*
+ * Build a float128 from its fields (GNU statement expression; each
+ * argument is evaluated exactly once).
+ *   s  -> sign: any non-zero value means negative (bit 63 of .high)
+ *   e  -> biased exponent, masked to 15 bits (bits 62..48 of .high)
+ *   ma -> 48 high bits of the mantissa (bits 47..0 of .high, not masked)
+ *   mb -> 64 low bits of the mantissa (.low)
+ *
+ * The sign sits directly above the 15-bit exponent, so it is placed at
+ * bit 15 of the 16-bit sign/exponent field before that field is moved up
+ * by 48. (Placing it at bit 16 pushed it out of the 64-bit word, which
+ * silently dropped the sign of every negative constant.)
+ */
+#define _assemble_float128(s, e, ma, mb) ({ \
+    const uint64_t _ma = (ma), _mb = (mb); \
+    const uint64_t _e = (e), _s = (s); \
+    float128 f = { \
+        .high = ((uint64_t)(_s != 0) << 15) | (_e & 0x7fff), \
+        .low = _mb \
+    }; \
+    f.high = ((f.high) << 48) | _ma; \
+    f; \
+})
+
+#define HACKY_MATH_X86
+#ifdef HACKY_MATH_X86
+#define NATIVE double
+
+double _to_native(float128 f128)
 {
-    ~decompose(ext, 010111 rrr xxxxxxx);
-    
-    sprintf(dis.str, "fmovecr.x 0x%02x,fp%u", x, r);
+    /*
+     * Narrow the float128 to a SoftFloat float64 (an integer carrying
+     * the 64-bit pattern) with float128_to_float64(), then reinterpret
+     * those bits as the host's native double and return it.
+     *
+     * Punning through a union copies the value in the host's own byte
+     * order, so a little-endian layout is no longer hard-coded.
+     * NOTE(review): assumes the host double is IEEE-754 binary64 and
+     * shares its byte order with 64-bit integers.
+     */
+    union { float64 bits; double value; } pun;
+    
+    pun.bits = float128_to_float64(f128);
+    return pun.value;
 }
 
-void inst_fmovem_control(uint16_t op, uint16_t ext)
+float128 _from_native(double n)
 {
-    ~decompose(op,  1111 001 000 mmmrrr);
-    ~decompose(op,  1111 001 000 MMMMMM);
+    /*
+     * Reinterpret the host double's bits as a SoftFloat float64 (an
+     * integer carrying the 64-bit pattern), then widen it to float128
+     * with float64_to_float128() and return that.
+     *
+     * The union copies in the host's own byte order, so a little-endian
+     * layout is no longer hard-coded.
+     * NOTE(review): assumes the host double is IEEE-754 binary64 and
+     * shares its byte order with 64-bit integers.
+     */
+    union { double value; float64 bits; } pun;
+    
+    pun.value = n;
+    return float64_to_float128(pun.bits);
+}
+
+#include <math.h>
+/*
+ * Thin aliases for the host libm, used by the _hack_* wrappers below.
+ * All of them operate on the host's double.
+ */
+#define _native_cos(a) cos(a)
+#define _native_acos(a) acos(a)
+#define _native_cosh(a) cosh(a)
+#define _native_sin(a) sin(a)
+#define _native_asin(a) asin(a)
+#define _native_sinh(a) sinh(a)
+#define _native_tan(a) tan(a)
+#define _native_atan(a) atan(a)
+#define _native_tanh(a) tanh(a)
+#define _native_atanh(a) atanh(a)
+#define _native_pow(a, b) pow((a), (b))
+#define _native_exp(a) exp(a)
+/* expm1(), not exp(a) - 1.0: the subtraction cancels to 0 for tiny a */
+#define _native_expm1(a) expm1(a)
+#define _native_log10(a) log10(a)
+/* log2(), not log(a) / log(2.0): avoids the extra rounding of the division */
+#define _native_log2(a) log2(a)
+#define _native_log(a) log(a)
+/* log1p(), not log(a + 1.0): a + 1.0 rounds to exactly 1.0 for tiny a */
+#define _native_log1p(a) log1p(a)
+
+/* Constants in the host's native type */
+const double _native_e = 2.71828182845904509;
+const double _native_10 = 10.0;
+const double _native_2 = 2.0;
+const double _native_1 = 1.0;
+
+#elif (defined(HACKY_MATH_PPC))
+#error "PowerPC hacky math isn't implemented yet"
+#else
+#error "You need to define HACKY_MATH_X86, or implement one for your arch"
+#endif
+
+/*
+ * Low-precision stand-ins for the transcendental operations.
+ *
+ * Every _hack_<name>(x) does the same three things: narrow the float128
+ * operand to the host double with _to_native(), evaluate one _native_*
+ * expression on it, and widen the answer back with _from_native().
+ * The template below stamps out one such function per line; inside
+ * `native_expr` the narrowed operand is called `n`.
+ */
+#define _define_hack(name, native_expr) \
+    static float128 _hack_##name (float128 x) \
+    { \
+        const double n = _to_native(x); \
+        return _from_native(native_expr); \
+    }
+
+_define_hack(cos,    _native_cos(n))
+_define_hack(acos,   _native_acos(n))
+_define_hack(cosh,   _native_cosh(n))
+_define_hack(sin,    _native_sin(n))
+_define_hack(asin,   _native_asin(n))
+_define_hack(sinh,   _native_sinh(n))
+_define_hack(tan,    _native_tan(n))
+_define_hack(atan,   _native_atan(n))
+_define_hack(tanh,   _native_tanh(n))
+_define_hack(atanh,  _native_atanh(n))
+_define_hack(etox,   _native_exp(n))
+_define_hack(etoxm1, _native_expm1(n))
+_define_hack(log10,  _native_log10(n))
+_define_hack(log2,   _native_log2(n))
+_define_hack(logn,   _native_log(n))
+_define_hack(lognp1, _native_log1p(n))
+_define_hack(tentox, _native_pow(_native_10, n))
+_define_hack(twotox, _native_pow(_native_2, n))
+
+#undef _define_hack
+
+
+#pragma mark FMATH! and all its helpers
+
+/* Function prototypes from SoftFloat/softfloat-specialize.h */
+char float128_is_nan(float128 a);
+char float128_is_signaling_nan (float128 a);
+
+
+static void inst_fmath_fmovecr (void)
+{
+    fpu_get_state_ptr();
+
+    /*
+     * FMOVECR: load the constant-ROM entry selected by fpu->fmath_op
+     * into fpu->result.
+     *
+     * FYI: these constants are stored in the "intermediate" 85-bit
+     *      format in the 6888x rom. This has the side effect that
+     *      they are rounded according to fpcr.mc_rnd.
+     *      We emulate the intermediate 85-bit format with float128.
+     *
+     * Each row is {ROM offset, biased exponent, high 48 mantissa bits,
+     * low 64 mantissa bits}; every entry is positive.
+     */
+    static const struct {
+        uint8_t  offset;
+        uint16_t exp;
+        uint64_t frac_hi;
+        uint64_t frac_lo;
+    } rom[] = {
+        {0x00, 0x4000, 0x921fb54442d1, 0x8469898cc51701b8}, // pi
+        {0x0b, 0x3ffd, 0x34413509f79f, 0xef311f12b35816f9}, // log_10(2)
+        {0x0c, 0x4000, 0x5bf0a8b14576, 0x95355fb8ac404e7a}, // e
+        {0x0d, 0x3fff, 0x71547652b82f, 0xe1777d0ffda0d23a}, // log_2(e)
+        /*
+         * log_10(e). NOTE: 68881 doesn't set inex2 for this one.
+         * That's bogus: 68881 uses 3 trailing mantissa bits to do rounding,
+         * and those bits are non-zero for this number, so it must actually
+         * be stored incorrectly in the ROM. Emulated by truncating the
+         * float128 mantissa; the untruncated low word would be
+         * 0xe32a6ab7555f5a67.
+         */
+        {0x0e, 0x3ffd, 0xbcb7b1526e50, 0xe32a000000000000},
+        {0x0f, 0x0000, 0x0, 0x0},                           // 0.0
+        {0x30, 0x3ffe, 0x62e42fefa39e, 0xf35793c7673007e5}, // ln(2)
+        {0x31, 0x4000, 0x26bb1bbb5551, 0x582dd4adac5705a6}, // ln(10)
+        // 1 (68kprm has typesetting issues everywhere. This one says 100, but means 10^0.)
+        {0x32, 0x3fff, 0x0, 0x0},
+        {0x33, 0x4002, 0x400000000000, 0x0},                // 10
+        {0x34, 0x4005, 0x900000000000, 0x0},                // 10^2
+        {0x35, 0x400c, 0x388000000000, 0x0},                // 10^4
+        {0x36, 0x4019, 0x7d7840000000, 0x0},                // 10^8
+        {0x37, 0x4034, 0x1c37937e0800, 0x0},                // 10^16
+        {0x38, 0x4069, 0x3b8b5b5056e1, 0x6b3be04000000000}, // 10^32
+        {0x39, 0x40d3, 0x84f03e93ff9f, 0x4daa797ed6e38ed6}, // 10^64
+        {0x3a, 0x41a8, 0x27748f9301d3, 0x19bf8cde66d86d62}, // 10^128
+        {0x3b, 0x4351, 0x54fdd7f73bf3, 0xbd1bbb77203731fd}, // 10^256
+        {0x3c, 0x46a3, 0xc633415d4c1d, 0x238d98cab8a978a0}, // 10^512
+        {0x3d, 0x4d48, 0x92eceb0d02ea, 0x182eca1a7a51e316}, // 10^1024
+        {0x3e, 0x5a92, 0x3d1676bb8a7a, 0xbbc94e9a519c6535}, // 10^2048
+        {0x3f, 0x7525, 0x88c0a4051441, 0x2f3592982a7f0094}, // 10^4096
+    };
+    uint32_t i;
+
+    for (i = 0; i < (sizeof(rom) / sizeof(rom[0])); i++) {
+        if (rom[i].offset == fpu->fmath_op) {
+            fpu->result = _assemble_float128(0, rom[i].exp, rom[i].frac_hi, rom[i].frac_lo);
+            return ;
+        }
+    }
+
+    /*
+     * I wanted to include the actual values for the other ROM offsets,
+     * but they might be proprietary. Most of them are 0 anyways, and some
+     * cause FPU exceptions, even with all exceptions disabled... (?)
+     * 68040 FPSP just returns 0, so we'll do that too.
+     */
+    fpu->result = _assemble_float128(0, 0, 0, 0);
+}
+
+/*
+ * This is a quick macro.pl macro to build a jump table
+ * for fmath instructions. It is probably slightly slower 
+ * to use a jump table rather than a big switch statement,
+ * but I think it looks cleaner.
+ *
+ * The macro returns C source for three 128-entry arrays, each indexed
+ * by the 7-bit opcode that was registered through $add:
+ *   _fmath_map[]   - inst_fmath_<name>, or NULL if the opcode is unregistered
+ *   _fmath_flags[] - bit 0 (0x01) is set for 'dyadic' ops,
+ *                    bit 1 (0x02) is set for ops registered with 68040
+ *   _fmath_names[] - the lower-cased mnemonic, or "f???" if unregistered
+ *
+ * $add croaks on an unrecognized argument, a missing monadic/dyadic
+ * mode, a duplicate opcode, or an opcode above 127.
+ */
+~newmacro(create_fmath_jump_table, 0, {
+    my $name_map = {};
+    my $op_map = {};
+    my $add = sub {
+        my $op = shift;
+        my $name = lc(shift);
+        my $mode = 'foo';
+        my $arch = 68881;
+        
+        foreach my $arg (@_) {
+            if (($arg eq 'monadic') or ($arg eq 'dyadic')) {
+                $mode = $arg;
+            }
+            elsif ($arg == 68040) {
+                $arch = $arg;
+            }
+            else {
+                croak("bad arg $arg");
+            }
+        }
+        
+        croak("didn't specify mode") if ($mode eq "foo");
+        croak("dup $op $name") if exists $op_map->{$op};
+        croak("bogus") if ($op > 127);
+        
+        $op_map->{$op} = {op => $op, name => $name, mode => $mode, arch => $arch};
+        $name_map->{$name} = $op_map->{$op};
+    };
+    
+    $add->(~b(1000000), 'fmove',    'monadic', 68040);
+    $add->(~b(1000100), 'fmove',    'monadic', 68040);
+    $add->(~b(0000000), 'fmove',    'monadic');
+    
+    $add->(~b(0000001), 'fint',     'monadic');
+    $add->(~b(0000010), 'fsinh',    'monadic');
+    $add->(~b(0000011), 'fintrz',   'monadic');
+    
+    $add->(~b(1000001), 'fsqrt',    'monadic', 68040);
+    $add->(~b(1000101), 'fsqrt',    'monadic', 68040);
+    $add->(~b(0000100), 'fsqrt',    'monadic');
+    
+    $add->(~b(0000110), 'flognp1',  'monadic');
+    $add->(~b(0001000), 'fetoxm1',  'monadic');
+    $add->(~b(0001001), 'ftanh',    'monadic');
+    $add->(~b(0001010), 'fatan',    'monadic');
+    $add->(~b(0001100), 'fasin',    'monadic');
+    $add->(~b(0001101), 'fatanh',   'monadic');
+    $add->(~b(0001110), 'fsin',     'monadic');
+    $add->(~b(0001111), 'ftan',     'monadic');
+    $add->(~b(0010000), 'fetox',    'monadic');
+    $add->(~b(0010001), 'ftwotox',  'monadic');
+    $add->(~b(0010010), 'ftentox',  'monadic');
+    $add->(~b(0010100), 'flogn',    'monadic');
+    $add->(~b(0010101), 'flog10',   'monadic');
+    $add->(~b(0010110), 'flog2',    'monadic');
+    
+    $add->(~b(1011000), 'fabs',     'monadic', 68040);
+    $add->(~b(1011100), 'fabs',     'monadic', 68040);
+    $add->(~b(0011000), 'fabs',     'monadic');
+    
+    $add->(~b(0011001), 'fcosh',    'monadic');
+    
+    $add->(~b(1011010), 'fneg',     'monadic', 68040);
+    $add->(~b(1011110), 'fneg',     'monadic', 68040);
+    $add->(~b(0011010), 'fneg',     'monadic');
+    
+    $add->(~b(0011100), 'facos',    'monadic');
+    $add->(~b(0011101), 'fcos',     'monadic');
+    $add->(~b(0011110), 'fgetexp',  'monadic');
+    $add->(~b(0011111), 'fgetman',  'monadic');
+    
+    $add->(~b(1100000), 'fdiv',     'dyadic', 68040);
+    $add->(~b(1100100), 'fdiv',     'dyadic', 68040);
+    $add->(~b(0100000), 'fdiv',     'dyadic');
+    
+    $add->(~b(0100001), 'fmod',     'dyadic');
+    
+    $add->(~b(1100010), 'fadd',     'dyadic', 68040);
+    $add->(~b(1100110), 'fadd',     'dyadic', 68040);
+    $add->(~b(0100010), 'fadd',     'dyadic');
+    
+    $add->(~b(1100011), 'fmul',     'dyadic', 68040);
+    $add->(~b(1100111), 'fmul',     'dyadic', 68040);
+    $add->(~b(0100011), 'fmul',     'dyadic');
+    
+    $add->(~b(0100100), 'fsgldiv',  'dyadic');
+    $add->(~b(0100101), 'frem',     'dyadic');
+    $add->(~b(0100110), 'fscale',   'dyadic');
+    $add->(~b(0100111), 'fsglmul',  'dyadic');
+    
+    $add->(~b(1101000), 'fsub',     'dyadic', 68040);
+    $add->(~b(1101100), 'fsub',     'dyadic', 68040);
+    $add->(~b(0101000), 'fsub',     'dyadic');
+    
+    $add->(~b(0110000), 'fsincos',  'monadic');
+    $add->(~b(0110001), 'fsincos',  'monadic');
+    $add->(~b(0110010), 'fsincos',  'monadic');
+    $add->(~b(0110011), 'fsincos',  'monadic');
+    $add->(~b(0110100), 'fsincos',  'monadic');
+    $add->(~b(0110101), 'fsincos',  'monadic');
+    $add->(~b(0110110), 'fsincos',  'monadic');
+    $add->(~b(0110111), 'fsincos',  'monadic');
+    
+    
+    $add->(~b(0111000), 'fcmp',     'dyadic');
+    $add->(~b(0111010), 'ftst',     'monadic');
+    
+    my $map_str = "fmath_impl_t *_fmath_map[128] = {\n";
+    my @inst_flags = (0) x 128;
+    
+    for (my $i=0; $i < 128; $i++) {
+        my $func_ptr = "NULL";
+        if (exists $op_map->{$i}) {
+            $func_ptr = 'inst_fmath_' . $op_map->{$i}->{name};
+            if ($op_map->{$i}->{mode} eq 'dyadic') {
+                $inst_flags[$i] |= 1;
+            }
+            if ($op_map->{$i}->{arch} == 68040) {
+                $inst_flags[$i] |= 2;
+            }
+        }
+
+        $map_str .= "\t" . $func_ptr . ",\n";
+    }
+    $map_str .= "};\n\nuint8_t _fmath_flags[128] = {\n";
+    
+    for (my $i=0; $i < 128; $i++) {
+        $map_str .= "\t" . sprintf('0x%02x', $inst_flags[$i]) . ",\n";
+    }
+    $map_str .= "};\n";
+    
+    $map_str .= "const char *_fmath_names[128] = {\n";
+    for (my $i=0; $i < 128; $i++) {
+        my $name = "f???";
+        if (exists $op_map->{$i}) {
+            $name = $op_map->{$i}->{name};
+        }
+        $map_str .= "\t\"" . $name . "\",\n";
+    }
+    $map_str .= "};\n";
+    
+    return $map_str;
+})
+
+/* True for +0 and -0: every bit other than the sign (bit 63 of .high) is clear. */
+static _Bool _float128_is_zero (float128 f)
+{
+    const uint64_t magnitude_hi = f.high & 0x7fffffffffffffffULL;
+    
+    return (magnitude_hi | f.low) == 0;
+}
+
+/* True when the sign bit (bit 63 of .high) is set; no other bit is examined. */
+static _Bool _float128_is_neg (float128 f)
+{
+    return (f.high & 0x8000000000000000ULL) != 0;
+}
+
+/*
+ * True for +inf and -inf: the 15-bit exponent is all ones (0x7fff) and
+ * all 112 fraction bits are zero. The sign is ignored.
+ */
+static _Bool _float128_is_infinity (float128 f)
+{
+    const uint64_t magnitude_hi = f.high & 0x7fffffffffffffffULL;
+    
+    if (magnitude_hi != 0x7fff000000000000ULL)
+        return 0;
+    return f.low == 0;
+}
+
+/*
+ * True for any NaN: the 15-bit exponent is all ones (0x7fff) and at
+ * least one of the 112 fraction bits is set. The sign is ignored.
+ */
+static _Bool _float128_is_nan (float128 f)
+{
+    const uint64_t magnitude_hi = f.high & 0x7fffffffffffffffULL;
+    
+    if ((magnitude_hi >> 48) != 0x7fff)
+        return 0;
+    return ((magnitude_hi & 0x0000ffffffffffffULL) | f.low) != 0;
+}
+
+const float128 _nan128 = { /* NaN returned on operand errors: sign set, exponent 0x7fff, only the top fraction bit set */
+    .high = 0xFFFF800000000000ULL,
+    .low = 0
+};
+
+const float128 _one128 = { /* +1.0: sign clear, biased exponent 0x3fff, fraction zero */
+    .high = 0x3fff000000000000ULL,
+    .low = 0
+};
+
+const float128 _zero128 = { /* +0.0: every bit clear */
+    .high = 0,
+    .low = 0
+};
+
+/* FABS: fpu->result = |fpu->source|. No exception-status flag is touched here. */
+static void inst_fmath_fabs ()
+{
+    fpu_get_state_ptr();
+    
+    /* Copy the operand, then mask off bit 63 of .high (the sign) */
+    float128 magnitude = fpu->source;
+    magnitude.high &= 0x7fffffffffffffffULL;
+    
+    fpu->result = magnitude;
+}
+
+/*
+ * FACOS: fpu->result = acos(fpu->source), computed at host-double
+ * precision by _hack_acos().
+ *   source == +-0  -> +pi/2
+ *   |source| > 1   -> _nan128, and es_operr is set
+ */
+static void inst_fmath_facos ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    
+    /* Find the absolute value of source (clear bit 63 of .high) */
+    float128 tmp = fpu->source;
+    tmp.high <<= 1;
+    tmp.high >>= 1;
+    
+    /* If source is zero, result is +pi/2 */
+    if (source_zero) {
+        fpu->result = _assemble_float128(0, 0x3fff, 0x921fb54442d1, 0x8469898cc51701b8);
+        return;
+    }
+    /*
+     * If source isn't in range [-1, 1], return nan, set operr.
+     * NOTE(review): infinities land here too; a NaN source presumably
+     * does as well if float128_le() is false for NaNs - confirm.
+     */
+    else if (!float128_le(tmp, _one128)) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    
+    fpu->result = _hack_acos(fpu->source);
+    /* Set inex2?? */
+}
+
+/*
+ * FADD: fpu->result = fpu->dest + fpu->source, then copy SoftFloat's
+ * status flags into the exception-status flags.
+ */
+static void inst_fmath_fadd ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = float128_add(fpu->dest, fpu->source);
+    
+    /* Sample each SoftFloat flag raised so far */
+    const _Bool invalid   = (float_exception_flags & float_flag_invalid) != 0;
+    const _Bool inexact   = (float_exception_flags & float_flag_inexact) != 0;
+    const _Bool overflow  = (float_exception_flags & float_flag_overflow) != 0;
+    const _Bool underflow = (float_exception_flags & float_flag_underflow) != 0;
+    
+    /*
+     * operr: the operands were infinities with opposite signs (the
+     * result is then NaN). I *think* softfloat reports this correctly
+     * through its invalid flag - the code's hard to read.
+     */
+    if (invalid)
+        es_operr = 1;
+    
+    /* inex2: the result is inexact */
+    if (inexact)
+        es_inex2 = 1;
+    
+    /* ovfl: the op overflowed */
+    if (overflow)
+        es_ovfl = 1;
+    
+    /* unfl: the op underflowed */
+    if (underflow)
+        es_unfl = 1;
+}
+
+/*
+ * FASIN: fpu->result = asin(fpu->source), computed at host-double
+ * precision by _hack_asin().
+ *   source == +-0  -> source, unchanged
+ *   |source| > 1   -> _nan128, and es_operr is set
+ */
+static void inst_fmath_fasin ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    
+    /* Find the absolute value of source (clear bit 63 of .high) */
+    float128 tmp = fpu->source;
+    tmp.high <<= 1;
+    tmp.high >>= 1;
+    
+    /* If source is zero, result is source */
+    if (source_zero) {
+        fpu->result = fpu->source;
+        return;
+    }
+    /*
+     * If source isn't in range [-1, 1], return nan, set operr.
+     * NOTE(review): infinities land here too; a NaN source presumably
+     * does as well if float128_le() is false for NaNs - confirm.
+     */
+    else if (!float128_le(tmp, _one128)) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    
+    fpu->result = _hack_asin(fpu->source);
+    /* Set inex2?? */
+    /* Set unfl?? */
+}
+
+/*
+ * FATAN: fpu->result = atan(fpu->source), computed at host-double
+ * precision by _hack_atan().
+ *   source == +-0    -> source, unchanged
+ *   source == +-inf  -> pi/2, assembled with the sign of source
+ */
+static void inst_fmath_fatan ()
+{
+    fpu_get_state_ptr();
+    
+    /* Zero passes straight through */
+    if (_float128_is_zero(fpu->source)) {
+        fpu->result = fpu->source;
+        return;
+    }
+    
+    /* Infinity maps to pi/2 with the operand's sign */
+    if (_float128_is_infinity(fpu->source)) {
+        const _Bool negative = _float128_is_neg(fpu->source);
+        
+        fpu->result = _assemble_float128(negative, 0x3fff, 0x921fb54442d1, 0x8469898cc51701b8);
+        return ;
+    }
+    
+    /* Everything else goes to the host libm */
+    fpu->result = _hack_atan(fpu->source);
+    /* Set inex2?? */
+    /* Set unfl?? */
+}
+
+/*
+ * fatanh: hyperbolic arc tangent of fpu->source, stored in fpu->result.
+ *
+ *   - zero source: result is the source itself
+ *   - |source| == 1.0: dz is set, result is infinity with the source's sign
+ *   - |source| > 1.0: operr is set, result is _nan128
+ *   - otherwise: result comes from _hack_atanh()
+ */
+static void inst_fmath_fatanh ()
+{
+    fpu_get_state_ptr();
+    
+    /* If source is 0, return source */
+    if (_float128_is_zero(fpu->source)) {
+        fpu->result = fpu->source;
+        return ;
+    }
+    
+    const _Bool src_is_neg = _float128_is_neg(fpu->source);
+    
+    /* |source|: clear the sign bit (bit 63 of the high word) */
+    float128 magnitude = fpu->source;
+    magnitude.high &= ~(1ULL << 63);
+    
+    /* Exactly +-1.0 is a pole: +-inf, and flag divide-by-zero */
+    if (float128_eq(magnitude, _one128)) {
+        es_dz = 1;
+        fpu->result = _assemble_float128(src_is_neg, 0x7fff, 0, 0);
+        return ;
+    }
+    
+    /* Beyond +-1.0 there is no result: NaN, and flag operr */
+    if (!float128_le(magnitude, _one128)) {
+        es_operr = 1;
+        fpu->result = _nan128;
+        return ;
+    }
+    
+    fpu->result = _hack_atanh(fpu->source);
+}
+
+/*
+ * fcmp: compare fpu->dest against fpu->source.
+ *
+ * Nothing is written back to the destination register; fpu->result
+ * holds (dest - source), which stands in for the comparison outcome.
+ *
+ * Two operand pairs can't be handled by a plain subtraction:
+ *   - infinities of the same sign: inf - inf is NaN, which would make
+ *     equal operands look unordered
+ *   - two zeros: (-0) - (-0) is +0, which loses the negative sign
+ * The FCMP result table in the 68881/68882 documentation gives Z for
+ * both cases, with N following the sign of the destination, so the
+ * result is a zero carrying dest's sign.
+ */
+static void inst_fmath_fcmp ()
+{
+    fpu_get_state_ptr();
+    
+    /* Don't write the result back to the register */
+    fpu->write_back = 0;
+    
+    const _Bool dest_neg = _float128_is_neg(fpu->dest);
+    const _Bool source_neg = _float128_is_neg(fpu->source);
+    const _Bool both_zero = _float128_is_zero(fpu->dest) &&
+                            _float128_is_zero(fpu->source);
+    const _Bool equal_infs = _float128_is_infinity(fpu->dest) &&
+                             _float128_is_infinity(fpu->source) &&
+                             (dest_neg == source_neg);
+    
+    /* Operands that compare equal but don't subtract to a usable zero */
+    if (both_zero || equal_infs) {
+        fpu->result = _assemble_float128(dest_neg, 0, 0, 0);
+        return ;
+    }
+    
+    fpu->result = float128_sub(fpu->dest, fpu->source);
+    
+    /*
+     * The 68881 docs say fcmp doesn't throw any exceptions
+     * based on the result, but I'm not sure I believe it.
+     
+     if (float_exception_flags & float_flag_invalid)
+        es_operr = 1;
+     
+     if (float_exception_flags & float_flag_inexact)
+        es_inex2 = 1;
+     */
+}
+
+/*
+ * fcos: cosine of fpu->source, stored in fpu->result.
+ *
+ *   - zero source: result is +1.0
+ *   - infinite source: operr is set, result is _nan128
+ *   - otherwise: result comes from _hack_cos()
+ */
+static void inst_fmath_fcos ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool src_is_zero = _float128_is_zero(fpu->source);
+    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
+    
+    if (src_is_zero) {
+        /* cos(+-0) = +1.0 */
+        fpu->result = _one128;
+    }
+    else if (src_is_inf) {
+        /* No cosine of an infinity: NaN + operr */
+        fpu->result = _nan128;
+        es_operr = 1;
+    }
+    else {
+        fpu->result = _hack_cos(fpu->source);
+        /* Set inex2?? */
+    }
+}
+
+/*
+ * fcosh: hyperbolic cosine of fpu->source, stored in fpu->result.
+ *
+ *   - zero source: result is +1.0
+ *   - infinite source (either sign): result is +inf
+ *   - otherwise: result comes from _hack_cosh()
+ */
+static void inst_fmath_fcosh ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool src_is_zero = _float128_is_zero(fpu->source);
+    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
+    
+    if (src_is_zero)
+        fpu->result = _one128;                             /* cosh(+-0) = +1.0 */
+    else if (src_is_inf)
+        fpu->result = _assemble_float128(0, 0x7fff, 0, 0); /* cosh(+-inf) = +inf */
+    else
+        fpu->result = _hack_cosh(fpu->source);
+}
+
+/*
+ * fdiv: fpu->result = fpu->dest / fpu->source.
+ *
+ * The softfloat exception flags left behind by the division are
+ * mirrored into the fpsr exception status bits.
+ */
+static void inst_fmath_fdiv ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = float128_div(fpu->dest, fpu->source);
+    
+    const int raised = float_exception_flags;
+    
+    /* Invalid operation (e.g. both operands are zero; result is NaN) -> operr */
+    if (raised & float_flag_invalid)
+        es_operr = 1;
+    
+    /* The divisor was zero -> dz */
+    if (raised & float_flag_divbyzero)
+        es_dz = 1;
+    
+    /* The quotient had to be rounded -> inex2 */
+    if (raised & float_flag_inexact)
+        es_inex2 = 1;
+    
+    /* The quotient overflowed -> ovfl */
+    if (raised & float_flag_overflow)
+        es_ovfl = 1;
+    
+    /* The quotient underflowed -> unfl */
+    if (raised & float_flag_underflow)
+        es_unfl = 1;
+}
+
+/*
+ * fetox: e raised to fpu->source, stored in fpu->result.
+ *
+ *   - zero source: result is +1.0
+ *   - -inf source: result is +0.0
+ *   - +inf source: result is the source (+inf)
+ *   - otherwise: result comes from _hack_etox()
+ */
+static void inst_fmath_fetox ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool src_is_zero = _float128_is_zero(fpu->source);
+    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
+    const _Bool src_is_neg = _float128_is_neg(fpu->source);
+    
+    if (src_is_zero)
+        fpu->result = _one128;
+    else if (!src_is_inf)
+        fpu->result = _hack_etox(fpu->source);
+    else if (src_is_neg)
+        fpu->result = _zero128;     /* e^(-inf) */
+    else
+        fpu->result = fpu->source;  /* e^(+inf) */
+}
+
+/*
+ * fetoxm1: (e raised to fpu->source) - 1, stored in fpu->result.
+ *
+ *   - zero source: result is the source itself
+ *   - -inf source: result is -1.0
+ *   - +inf source: result is the source (+inf)
+ *   - otherwise: result comes from _hack_etoxm1()
+ */
+static void inst_fmath_fetoxm1 ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool src_is_zero = _float128_is_zero(fpu->source);
+    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
+    const _Bool src_is_neg = _float128_is_neg(fpu->source);
+    
+    const float128 minus_one = _assemble_float128(1, 0x3fff, 0, 0);
+    
+    if (src_is_zero)
+        fpu->result = fpu->source;
+    else if (!src_is_inf)
+        fpu->result = _hack_etoxm1(fpu->source);
+    else if (src_is_neg)
+        fpu->result = minus_one;    /* e^(-inf) - 1 = -1.0 */
+    else
+        fpu->result = fpu->source;  /* e^(+inf) - 1 = +inf */
+}
+
+/*
+ * fgetexp: extract the exponent of fpu->source and store it, as a
+ * floating point value, in fpu->result.
+ *
+ *   - infinite source: operr is set, result is _nan128
+ *   - zero source: result is the source (sign preserved)
+ *   - otherwise: result is the biased exponent minus 16383
+ */
+static void inst_fmath_fgetexp ()
+{
+    fpu_get_state_ptr();
+    
+    /* Shift the sign bit out: what's left is the exponent and high fraction */
+    const uint64_t high_no_sign = fpu->source.high << 1;
+    
+    /*
+     * If source is INF, set operr and return NaN.
+     * Use the same float128 NaN as the other operr paths; the pair
+     * 0xffff000000000000 / 0xc000000000000000 is not a quiet NaN when
+     * read as a float128 (the top fraction bit, bit 47 of the high
+     * word, is clear).
+     */
+    if ((high_no_sign == 0xfffe000000000000ULL) && (fpu->source.low == 0)) {
+        es_operr = 1;
+        fpu->result = _nan128;
+        return ;
+    }
+    
+    /*
+     * If source is 0, return source.
+     * According to 68881 docs, the result needs to have 
+     * the same sign as the source (why?)
+     */
+    if ((high_no_sign == 0) && (fpu->source.low == 0)) {
+        fpu->result = fpu->source;
+        return ;
+    }
+    
+    /*
+     * Otherwise, extract the biased exponent (the 15 bits below the
+     * sign), convert it to a two's complement integer, and store
+     * that value as a float.
+     */
+    const uint32_t biased = high_no_sign >> 49;
+    fpu->result = int32_to_float128(((int32_t)biased) - 16383);
+}
+
+/*
+ * fgetman: not implemented yet - reaching it trips the assert below.
+ * NOTE(review): if assert() is compiled out (NDEBUG), this returns
+ * without touching fpu->result.
+ */
+static void inst_fmath_fgetman ()
+{
+    fpu_get_state_ptr();
+    
+    assert(!"fmath: fgetman not implemented");
+}
+
+/*
+ * fint: round fpu->source to an integral value, stored in fpu->result.
+ *
+ * FINT rounds according to the rounding mode selected in fpcr
+ * (mc_rnd). The dispatcher switches softfloat to round-to-nearest for
+ * intermediate calculations before the op runs, so the fpcr mode has
+ * to be selected explicitly here, and the previous mode restored
+ * afterwards (the same save/restore that fintrz does).
+ */
+static void inst_fmath_fint ()
+{
+    fpu_get_state_ptr();
+    
+    const signed char old_round_mode = float_rounding_mode;
+    _set_rounding_mode(mc_rnd);
+    fpu->result = float128_round_to_int(fpu->source);
+    float_rounding_mode = old_round_mode;
+    
+    /* Throw inex2 if the result is inexact */
+    if (float_exception_flags & float_flag_inexact)
+        es_inex2 = 1;
+}
+
+/*
+ * fintrz: round fpu->source to an integral value, always rounding
+ * toward zero, and store it in fpu->result. Softfloat's rounding
+ * mode is only changed for the duration of the conversion.
+ */
+static void inst_fmath_fintrz ()
+{
+    fpu_get_state_ptr();
+    
+    const signed char saved_mode = float_rounding_mode;
+    
+    float_rounding_mode = float_round_to_zero;
+    const float128 truncated = float128_round_to_int(fpu->source);
+    float_rounding_mode = saved_mode;
+    
+    fpu->result = truncated;
+    
+    /* A discarded fraction shows up as softfloat's inexact flag -> inex2 */
+    const int raised = float_exception_flags;
+    if (raised & float_flag_inexact)
+        es_inex2 = 1;
+}
+
+/*
+ * flog10: base-10 logarithm of fpu->source, stored in fpu->result.
+ *
+ *   - zero source: dz is set, result is -inf
+ *   - negative source: operr is set, result is _nan128
+ *   - +inf source: result is the source (+inf)
+ *   - otherwise: result comes from _hack_log10()
+ */
+static void inst_fmath_flog10 ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool src_is_zero = _float128_is_zero(fpu->source);
+    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
+    const _Bool src_is_neg = _float128_is_neg(fpu->source);
+    
+    if (src_is_zero) {
+        /* log(0): -inf, flagged as a divide-by-zero */
+        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
+        es_dz = 1;
+    }
+    else if (src_is_neg) {
+        /* log of a negative number: NaN, flagged as an operand error */
+        fpu->result = _nan128;
+        es_operr = 1;
+    }
+    else if (src_is_inf) {
+        /* log(+inf) = +inf */
+        fpu->result = fpu->source;
+    }
+    else {
+        fpu->result = _hack_log10(fpu->source);
+    }
+}
+
+/*
+ * flog2: base-2 logarithm of fpu->source, stored in fpu->result.
+ *
+ *   - zero source: dz is set, result is -inf
+ *   - negative source: operr is set, result is _nan128
+ *   - +inf source: result is the source (+inf)
+ *   - otherwise: result comes from _hack_log2()
+ */
+static void inst_fmath_flog2 ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool src_is_zero = _float128_is_zero(fpu->source);
+    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
+    const _Bool src_is_neg = _float128_is_neg(fpu->source);
+    
+    if (src_is_zero) {
+        /* log(0): -inf, flagged as a divide-by-zero */
+        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
+        es_dz = 1;
+    }
+    else if (src_is_neg) {
+        /* log of a negative number: NaN, flagged as an operand error */
+        fpu->result = _nan128;
+        es_operr = 1;
+    }
+    else if (src_is_inf) {
+        /* log(+inf) = +inf */
+        fpu->result = fpu->source;
+    }
+    else {
+        fpu->result = _hack_log2(fpu->source);
+    }
+}
+
+/*
+ * flognp1: natural logarithm of (fpu->source + 1), stored in
+ * fpu->result.
+ *
+ *   - zero source: result is the source itself
+ *   - source == -1.0: dz is set, result is -inf
+ *   - source < -1.0: operr is set, result is _nan128
+ *   - +inf source: result is the source (+inf)
+ *   - otherwise: result comes from _hack_lognp1()
+ */
+static void inst_fmath_flognp1 ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    
+    const float128 negone = _assemble_float128(1, 0x3fff, 0, 0);
+    
+    /* If source is zero, result is source */
+    if (source_zero) {
+        fpu->result = fpu->source;
+        return;
+    }
+    /* If source is -1.0, set dz, result is -inf */
+    else if (float128_eq(negone, fpu->source)) {
+        es_dz = 1;
+        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
+        return;
+    }
+    /* If source < -1.0 (which includes -inf), set operr, result is nan */
+    else if (float128_lt(fpu->source, negone)) {
+        es_operr = 1;
+        fpu->result = _nan128;
+        return;
+    }
+    /* Any infinity that gets this far is +inf: result is +inf */
+    else if (source_inf) {
+        fpu->result = fpu->source;
+        return;
+    }
+    
+    fpu->result = _hack_lognp1(fpu->source);
+}
+
+/*
+ * flogn: natural logarithm of fpu->source, stored in fpu->result.
+ *
+ *   - zero source: dz is set, result is -inf
+ *   - negative source: operr is set, result is _nan128
+ *   - +inf source: result is the source (+inf)
+ *   - otherwise: result comes from _hack_logn()
+ */
+static void inst_fmath_flogn ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool src_is_zero = _float128_is_zero(fpu->source);
+    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
+    const _Bool src_is_neg = _float128_is_neg(fpu->source);
+    
+    if (src_is_zero) {
+        /* log(0): -inf, flagged as a divide-by-zero */
+        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
+        es_dz = 1;
+    }
+    else if (src_is_neg) {
+        /* log of a negative number: NaN, flagged as an operand error */
+        fpu->result = _nan128;
+        es_operr = 1;
+    }
+    else if (src_is_inf) {
+        /* log(+inf) = +inf */
+        fpu->result = fpu->source;
+    }
+    else {
+        fpu->result = _hack_logn(fpu->source);
+    }
+}
+
+/* fmove: copy fpu->source into fpu->result unchanged. */
+static void inst_fmath_fmove ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = fpu->source;
+}
+
+/*
+ * fmul: fpu->result = fpu->dest * fpu->source.
+ *
+ * The softfloat exception flags left behind by the multiplication
+ * are mirrored into the fpsr exception status bits.
+ */
+static void inst_fmath_fmul ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = float128_mul(fpu->dest, fpu->source);
+    
+    const int raised = float_exception_flags;
+    
+    /*
+     * Invalid operation (one operand is infinity and the other
+     * is zero; the result is NaN) -> operr
+     */
+    if (raised & float_flag_invalid)
+        es_operr = 1;
+    
+    /* The product had to be rounded -> inex2 */
+    if (raised & float_flag_inexact)
+        es_inex2 = 1;
+    
+    /* The product overflowed -> ovfl */
+    if (raised & float_flag_overflow)
+        es_ovfl = 1;
+    
+    /* The product underflowed -> unfl */
+    if (raised & float_flag_underflow)
+        es_unfl = 1;
+}
+
+/*
+ * fneg: fpu->result is fpu->source with its sign inverted. Only the
+ * sign bit (bit 63 of the high word) changes.
+ */
+static void inst_fmath_fneg ()
+{
+    fpu_get_state_ptr();
+    
+    float128 negated = fpu->source;
+    negated.high ^= 0x8000000000000000ULL;
+    fpu->result = negated;
+    
+    /* 
+     * FIXME: you're supposed to throw UNFL if this is a
+     *        denormalized number, I think.
+     */
+}
+
+/*
+ * Shared implementation of frem and fmod.
+ *
+ * Both compute result = dest - (source * N), where N is dest/source
+ * rounded to an integer; the two instructions differ only in the
+ * rounding mode used to obtain N, passed in as quotient_round_mode
+ * (one of softfloat's float_round_* values).
+ *
+ * The sign of N goes to qu_s and the seven least significant bits of
+ * |N| go to qu_quotient. For the special cases handled up front,
+ * both are left at zero.
+ */
+static void _fmath_rem_or_mod (const signed char quotient_round_mode)
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool dest_zero = _float128_is_zero(fpu->dest);
+    const _Bool dest_inf = _float128_is_infinity(fpu->dest);
+    
+    /* I just assume the quotient/sign are 0 for the following cases */
+    qu_quotient = 0;
+    qu_s = 0;
+    
+    /* If source is zero, result is nan */
+    if (source_zero) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    /* If dest (but not source) is zero, result is that zero */
+    else if (dest_zero) {
+        fpu->result = fpu->dest;
+        return ;
+    }
+    /* If dest is infinity, result is nan */
+    else if (dest_inf) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    /* If source, but not dest, is infinity, result is dest */
+    else if (source_inf) {
+        fpu->result = fpu->dest;
+        return ;
+    }
+    
+    /* -- We're past the edge cases, do the actual op -- */
+    
+    /* The quotient is computed under the caller's rounding mode only */
+    const signed char old_round_mode = float_rounding_mode;
+    float_rounding_mode = quotient_round_mode;
+    
+    float128 N = float128_div(fpu->dest, fpu->source);
+    N = float128_round_to_int(N);
+    
+    float_rounding_mode = old_round_mode;
+    
+    fpu->result = float128_sub(fpu->dest, float128_mul(fpu->source, N));
+    
+    /* FIXME: not sure how to set unfl reliably */
+    
+    const _Bool sign = N.high >> 63; /* Remember the sign */
+    N.high &= ~(1ULL << 63);         /* Clear the sign */
+    
+    /*
+     * Only the low seven bits of |N| are wanted. float128_to_int32()
+     * saturates once |N| no longer fits in an int32, which would
+     * corrupt those bits, so reduce |N| modulo 128 first.
+     * float128_rem() yields the IEEE remainder, which for an integral
+     * |N| is an integer in [-64, 64]; fold negative values back into
+     * [0, 127].
+     */
+    int32_t low_bits = float128_to_int32(float128_rem(N, int32_to_float128(128)));
+    if (low_bits < 0)
+        low_bits += 128;
+    
+    qu_quotient = low_bits & 0x7f;
+    qu_s = sign;
+}
+
+/* frem: remainder with the quotient rounded to the nearest integer */
+static void inst_fmath_frem ()
+{
+    _fmath_rem_or_mod(float_round_nearest_even);
+}
+
+/* fmod: remainder with the quotient rounded toward zero */
+static void inst_fmath_fmod ()
+{
+    _fmath_rem_or_mod(float_round_to_zero);
+}
+
+/*
+ * fscale: not implemented yet - reaching it trips the assert below.
+ * NOTE(review): if assert() is compiled out (NDEBUG), this returns
+ * without touching fpu->result.
+ */
+static void inst_fmath_fscale ()
+{
+    fpu_get_state_ptr();
+    
+    assert(!"fmath: fscale not implemented");
+}
+
+/*
+ * fsgldiv: divide fpu->dest by fpu->source after truncating both
+ * mantissas, storing the quotient in fpu->result.
+ *
+ * The softfloat exception flags left behind by the division are
+ * mirrored into the fpsr exception status bits.
+ */
+static void inst_fmath_fsgldiv ()
+{
+    fpu_get_state_ptr();
+    
+    /*
+     * Keeps the top 40 bits of the high word and nothing of the low
+     * word, i.e. dumps the low 88 bits of the mantissa.
+     */
+    const uint64_t keep_high = 0xffffffffff000000;
+    
+    float128 divisor = fpu->source;
+    divisor.high &= keep_high;
+    divisor.low = 0;
+    
+    float128 dividend = fpu->dest;
+    dividend.high &= keep_high;
+    dividend.low = 0;
+    
+    fpu->result = float128_div(dividend, divisor);
+    
+    const int raised = float_exception_flags;
+    
+    /* Invalid operation (e.g. both operands are zero; result is NaN) -> operr */
+    if (raised & float_flag_invalid)
+        es_operr = 1;
+    
+    /* The divisor was zero -> dz */
+    if (raised & float_flag_divbyzero)
+        es_dz = 1;
+    
+    /* The quotient had to be rounded -> inex2 */
+    if (raised & float_flag_inexact)
+        es_inex2 = 1;
+    
+    /* The quotient overflowed -> ovfl */
+    if (raised & float_flag_overflow)
+        es_ovfl = 1;
+    
+    /* The quotient underflowed -> unfl */
+    if (raised & float_flag_underflow)
+        es_unfl = 1;
+}
+
+/*
+ * fsglmul: multiply fpu->dest by fpu->source after truncating both
+ * mantissas, storing the product in fpu->result.
+ *
+ * As far as I can tell, fsglmul/fsgldiv use an ALU for the mantissa
+ * that is only 24-bits wide. Everything else is done with regular
+ * internal precision.
+ *
+ * The softfloat exception flags left behind by the multiplication
+ * are mirrored into the fpsr exception status bits.
+ */
+static void inst_fmath_fsglmul ()
+{
+    fpu_get_state_ptr();
+    
+    /*
+     * Keeps the top 40 bits of the high word and nothing of the low
+     * word, i.e. dumps the low 88 bits of the mantissa.
+     */
+    const uint64_t keep_high = 0xffffffffff000000;
+    
+    float128 rhs = fpu->source;
+    rhs.high &= keep_high;
+    rhs.low = 0;
+    
+    float128 lhs = fpu->dest;
+    lhs.high &= keep_high;
+    lhs.low = 0;
+    
+    fpu->result = float128_mul(lhs, rhs);
+    
+    const int raised = float_exception_flags;
+    
+    /*
+     * Invalid operation (one operand is infinity and the other
+     * is zero; the result is NaN) -> operr
+     */
+    if (raised & float_flag_invalid)
+        es_operr = 1;
+    
+    /* The product had to be rounded -> inex2 */
+    if (raised & float_flag_inexact)
+        es_inex2 = 1;
+    
+    /* The product overflowed -> ovfl */
+    if (raised & float_flag_overflow)
+        es_ovfl = 1;
+    
+    /* The product underflowed -> unfl */
+    if (raised & float_flag_underflow)
+        es_unfl = 1;
+}
+
+/*
+ * fsin: sine of fpu->source, stored in fpu->result.
+ *
+ *   - zero source: result is the source itself
+ *   - infinite source: operr is set, result is _nan128
+ *   - otherwise: result comes from _hack_sin()
+ */
+static void inst_fmath_fsin ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool src_is_zero = _float128_is_zero(fpu->source);
+    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
+    
+    if (src_is_zero) {
+        /* sin(+-0) = +-0 */
+        fpu->result = fpu->source;
+    }
+    else if (src_is_inf) {
+        /* No sine of an infinity: NaN + operr */
+        fpu->result = _nan128;
+        es_operr = 1;
+    }
+    else {
+        fpu->result = _hack_sin(fpu->source);
+        /* Set inex2?? */
+    }
+}
+
+/*
+ * fsincos: not implemented yet - reaching it trips the assert below.
+ * NOTE(review): if assert() is compiled out (NDEBUG), this returns
+ * without touching fpu->result.
+ */
+static void inst_fmath_fsincos ()
+{
+    fpu_get_state_ptr();
+    
+    assert(!"fmath: fsincos not implemented");
+}
+
+/*
+ * fsinh: hyperbolic sine of fpu->source, stored in fpu->result.
+ *
+ *   - zero or infinite source: result is the source itself
+ *   - otherwise: result comes from _hack_sinh()
+ */
+static void inst_fmath_fsinh ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool src_is_zero = _float128_is_zero(fpu->source);
+    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
+    
+    if (!src_is_zero && !src_is_inf) {
+        fpu->result = _hack_sinh(fpu->source);
+        return ;
+    }
+    
+    /* sinh(+-0) = +-0, sinh(+-inf) = +-inf */
+    fpu->result = fpu->source;
+}
+
+/*
+ * fsqrt: square root of fpu->source, stored in fpu->result.
+ *
+ * The softfloat invalid and inexact flags left behind by the
+ * operation are mirrored into operr and inex2.
+ */
+static void inst_fmath_fsqrt ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = float128_sqrt(fpu->source);
+    
+    const int raised = float_exception_flags;
+    
+    /* Invalid operation (the operand is < 0; result is NaN) -> operr */
+    if (raised & float_flag_invalid)
+        es_operr = 1;
+    
+    /* The root had to be rounded -> inex2 */
+    if (raised & float_flag_inexact)
+        es_inex2 = 1;
+}
+
+/*
+ * fsub: fpu->result = fpu->dest - fpu->source.
+ *
+ * The softfloat exception flags left behind by the subtraction are
+ * mirrored into the fpsr exception status bits.
+ */
+static void inst_fmath_fsub ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = float128_sub(fpu->dest, fpu->source);
+    
+    const int raised = float_exception_flags;
+    
+    /*
+     * Invalid operation (the operands are infinities with equal
+     * signs; the result is NaN) -> operr. (I *think* softfloat is
+     * doing this correctly - the code's hard to read.)
+     *
+     * Both 68kprm and 68881 docs say that (+inf) - (-inf) = (-inf)
+     * but I presume that's a typo, and it's supposed to be (+inf)
+     */
+    if (raised & float_flag_invalid)
+        es_operr = 1;
+    
+    /* The difference had to be rounded -> inex2 */
+    if (raised & float_flag_inexact)
+        es_inex2 = 1;
+    
+    /* The difference overflowed -> ovfl */
+    if (raised & float_flag_overflow)
+        es_ovfl = 1;
+    
+    /* The difference underflowed -> unfl */
+    if (raised & float_flag_underflow)
+        es_unfl = 1;
+}
+
+/*
+ * ftan: tangent of fpu->source, stored in fpu->result.
+ *
+ *   - zero source: result is the source itself
+ *   - infinite source: operr is set, result is _nan128
+ *   - otherwise: result comes from _hack_tan()
+ */
+static void inst_fmath_ftan ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool src_is_zero = _float128_is_zero(fpu->source);
+    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
+    
+    if (src_is_zero) {
+        /* tan(+-0) = +-0 */
+        fpu->result = fpu->source;
+    }
+    else if (src_is_inf) {
+        /* No tangent of an infinity: NaN + operr */
+        fpu->result = _nan128;
+        es_operr = 1;
+    }
+    else {
+        fpu->result = _hack_tan(fpu->source);
+        /* Set inex2?? */
+    }
+}
+
+/*
+ * ftanh: hyperbolic tangent of fpu->source, stored in fpu->result.
+ *
+ *   - zero source: result is the source itself
+ *   - infinite source: result is 1.0 carrying the source's sign
+ *   - otherwise: result comes from _hack_tanh()
+ */
+static void inst_fmath_ftanh ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool src_is_zero = _float128_is_zero(fpu->source);
+    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
+    const _Bool src_is_neg = _float128_is_neg(fpu->source);
+    
+    if (src_is_zero)
+        fpu->result = fpu->source;                                  /* tanh(+-0) = +-0 */
+    else if (src_is_inf)
+        fpu->result = _assemble_float128(src_is_neg, 0x3fff, 0, 0); /* tanh(+-inf) = +-1.0 */
+    else
+        fpu->result = _hack_tanh(fpu->source);
+}
+
+/*
+ * ftentox: currently disabled - reaching it trips the assert at the
+ * bottom. The intended implementation is kept below, commented out.
+ * NOTE(review): if assert() is compiled out (NDEBUG), this returns
+ * without touching fpu->result.
+ */
+static void inst_fmath_ftentox ()
+{
+    fpu_get_state_ptr();
+    
+    // _hack_tentox() is broken on clang 3.5 on osx 10.10
+    // (clang tries to optimize pow(10.0, x) to __exp10(x), and __exp10
+    //  isn't implemented in the 10.8 SDK)
+//    const _Bool source_zero = _float128_is_zero(fpu->source);
+//    const _Bool source_inf = _float128_is_infinity(fpu->source);
+//    const _Bool source_sign = _float128_is_neg(fpu->source);
+//    
+//    /* If source is zero, result is +1.0 */
+//    if (source_zero) {
+//        fpu->result = _one128;
+//        return ;
+//    }
+//    /* if source is -inf, result is +0.0 */
+//    else if (source_inf && source_sign) {
+//        fpu->result = _zero128;
+//        return ;
+//    }
+//    /* if source is +inf, result is +inf */
+//    else if (source_inf) {
+//        fpu->result = fpu->source;
+//        return ;
+//    }
+//    
+//    fpu->result = _hack_tentox(fpu->source);
+//    
+    assert(!"fmath: ftentox not implemented");
+}
+
+/*
+ * ftst: test fpu->source. The source is copied to fpu->result and
+ * register write-back is switched off, so no fp register is modified.
+ */
+static void inst_fmath_ftst ()
+{
+    fpu_get_state_ptr();
+    
+    /* Don't write the result back to the register */
+    fpu->write_back = 0;
+    
+    /* ftst just sets the cond codes according to the source */
+    fpu->result = fpu->source;
+}
+
+/*
+ * ftwotox: 2 raised to fpu->source, stored in fpu->result.
+ *
+ *   - zero source: result is +1.0
+ *   - -inf source: result is +0.0
+ *   - +inf source: result is the source (+inf)
+ *   - otherwise: result comes from _hack_twotox()
+ */
+static void inst_fmath_ftwotox ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool src_is_zero = _float128_is_zero(fpu->source);
+    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
+    const _Bool src_is_neg = _float128_is_neg(fpu->source);
+    
+    if (src_is_zero)
+        fpu->result = _one128;
+    else if (!src_is_inf)
+        fpu->result = _hack_twotox(fpu->source);
+    else if (src_is_neg)
+        fpu->result = _zero128;     /* 2^(-inf) */
+    else
+        fpu->result = fpu->source;  /* 2^(+inf) */
+}
+
+/*
+ * fmath_impl_t: function type taking no arguments and returning
+ * nothing (presumably the type of the inst_fmath_*() handlers above
+ * - confirm against the generated table).
+ *
+ * FMATH_TYPE_* are bits tested against _fmath_flags[op]:
+ *   FMATH_TYPE_DYADIC - the op also reads the destination register
+ *                       as an operand
+ *   FMATH_TYPE_68040  - 68040-only op; the interpreter throws an
+ *                       illegal instruction exception for it
+ *
+ * The last line is a preprocessor macro invocation, not C; presumably
+ * it is what emits _fmath_map[], _fmath_names[] and _fmath_flags[]
+ * (TODO confirm).
+ */
+typedef void (fmath_impl_t)(void);
+#define FMATH_TYPE_DYADIC 1
+#define FMATH_TYPE_68040 2
+~create_fmath_jump_table()
+
+
+/*
+ * Narrow fpu->result (a float128) to the precision selected by
+ * mc_prec, rounding with the mode selected by mc_rnd, and hand the
+ * value back as a floatx80.
+ *
+ * Softfloat's exception flags are cleared first, so whatever the
+ * narrowing conversion raises (inexact, underflow, overflow) can be
+ * OR'd into es_inex2 / es_unfl / es_ovfl.
+ */
+static floatx80 _fmath_round_intermediate_result ()
+{
+    fpu_get_state_ptr();
+    floatx80 final;
+    int raised = 0; /* flags raised by the narrowing conversion */
+    
+    float_exception_flags = 0; // (so we can know if the result is inexact)
+    _set_rounding_mode(mc_rnd); // Set the preferred rounding mode
+    
+    if (mc_prec == prec_single) {
+        /* Round to single, then widen (flags sampled before widening) */
+        const float32 narrowed = float128_to_float32(fpu->result);
+        raised = float_exception_flags;
+        final = float32_to_floatx80(narrowed);
+    }
+    else if (mc_prec == prec_double) {
+        /* Round to double, then widen (flags sampled before widening) */
+        const float64 narrowed = float128_to_float64(fpu->result);
+        raised = float_exception_flags;
+        final = float64_to_floatx80(narrowed);
+    }
+    else if (mc_prec == prec_extended) {
+        /* Extended precision: convert straight to floatx80 */
+        final = float128_to_floatx80(fpu->result);
+        raised = float_exception_flags;
+    }
+    else
+        assert(!"bogus precision mode???");
+    
+    /* Accumulate into the status bits; never clear what an op already set */
+    es_inex2 |= ((raised & float_flag_inexact) != 0);
+    es_unfl |= ((raised & float_flag_underflow) != 0);
+    es_ovfl |= ((raised & float_flag_overflow) != 0);
+    
+    return final;
+}
+
+/*
+ * Derive the fpsr condition codes (N, Z, I, NAN) from val.
+ *
+ * val.high holds the sign (bit 15) and the 15-bit exponent; val.low
+ * holds the 64-bit mantissa, whose top bit is ignored when telling
+ * infinity and NaN apart.
+ */
+static void _fmath_set_condition_codes (floatx80 val)
+{
+    fpu_get_state_ptr();
+    
+    const _Bool is_negative = val.high >> 15;
+    const uint32_t exponent = val.high & 0x7fff;
+    const uint64_t mantissa = val.low;
+    
+    const _Bool exp_all_ones = (exponent == 0x7fff);
+    const _Bool frac_is_zero = ((mantissa << 1) == 0);
+    
+    /* Clear the whole CC register byte */
+    fpu->fpsr.raw &= 0x00ffffff;
+    
+    cc_n = is_negative;                         /* negative */
+    cc_z = ((exponent == 0) && (mantissa == 0)); /* zero */
+    cc_i = (exp_all_ones && frac_is_zero);      /* infinity */
+    cc_nan = (exp_all_ones && !frac_is_zero);   /* NaN */
+}
+
+/*
+ * Produce the result for an op with a NaN operand.
+ *
+ * fpu->result becomes the dest NaN if the op is dyadic and dest is a
+ * NaN, otherwise the source NaN (asserted to be one). es_snan records
+ * whether any considered operand was a signaling NaN, and the result
+ * is always returned quieted.
+ */
+static void _fmath_handle_nans ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool dyadic = _fmath_flags[fpu->fmath_op] & FMATH_TYPE_DYADIC;
+    const _Bool any_signaling = float128_is_signaling_nan(fpu->source) ||
+                                (dyadic && float128_is_signaling_nan(fpu->dest));
+    const _Bool source_is_nan = float128_is_nan(fpu->source);
+    const _Bool dest_is_nan = dyadic && float128_is_nan(fpu->dest);
+    
+    /* dest wins whenever it is a NaN (including when both are) */
+    if (!dest_is_nan)
+        assert(source_is_nan);
+    fpu->result = dest_is_nan ? fpu->dest : fpu->source;
+    
+    /* Set the snan exception status bit */
+    es_snan = any_signaling;
+    
+    /*
+     * Silence the result by setting bit 47 of the high word:
+     * signaling NaNs have it clear, non-signaling NaNs have it set.
+     */
+    fpu->result.high |= 0x800000000000;
+}
+
+/*
+ * Disassemble a general FPU arithmetic instruction (or fmovecr).
+ *
+ *   op     - first instruction word; its low six bits are the EA field
+ *   ext    - extension word: source-in-EA bit, source specifier,
+ *            destination register and the 7-bit operation code
+ *   output - receives the text via sprintf(); the size is not
+ *            checked here, so the caller must supply enough room
+ */
+void dis_fmath (uint16_t op, uint16_t ext, char *output)
+{
+    ~decompose(op, 1111 001 000 MMMMMM);
+    ~decompose(ext, 0 a 0 sss ddd eeeeeee);
+    
+    const uint8_t src_in_ea = a;
+    const uint8_t source_specifier = s;
+    const uint8_t dest_register = d;
+    const uint8_t extension = e;
+    
+    /* If this is fmovecr */
+    if (src_in_ea && (source_specifier == 7)) {
+        /* Name the constant ROM offsets we know about */
+        const char *name = NULL;
+        switch (extension) {
+            case 0x00: name = "pi"; break;
+            case 0x0b: name = "log_10(2)"; break;
+            case 0x0c: name = "e"; break;
+            case 0x0d: name = "log_2(e)"; break;
+            case 0x0e: name = "log_10(e)"; break;
+            case 0x0f: name = "0.0"; break;
+            case 0x30: name = "ln(2)"; break;
+            case 0x31: name = "ln(10)"; break;
+            case 0x32: name = "1.0"; break;
+            case 0x33: name = "10.0"; break;
+            case 0x34: name = "10.0^2"; break;
+            case 0x35: name = "10.0^4"; break;
+            case 0x36: name = "10.0^8"; break;
+            case 0x37: name = "10.0^16"; break;
+            case 0x38: name = "10.0^32"; break;
+            case 0x39: name = "10.0^64"; break;
+            case 0x3a: name = "10.0^128"; break;
+            case 0x3b: name = "10.0^256"; break;
+            case 0x3c: name = "10.0^512"; break;
+            case 0x3d: name = "10.0^1024"; break;
+            case 0x3e: name = "10.0^2048"; break;
+            case 0x3f: name = "10.0^4096"; break;
+        }
+        /* Unnamed offsets are printed numerically */
+        if (name == NULL)
+            sprintf(output, "fmovecr.x #%u,fp%u", extension, dest_register);
+        else
+            sprintf(output, "fmovecr.x %s,fp%u", name, dest_register);
+        return;
+    }
+    
+    if (_fmath_map[e] == NULL) {
+        /* This instruction isn't defined */
+        sprintf(output, "fmath???");
+    }
+    else if (_fmath_map[e] == inst_fmath_fsincos) {
+        /* fsincos.<fmt> <ea>,FPc:FPs (FPc is the low 3 bits of the opcode) */
+        if (src_in_ea)
+            sprintf(output, "fsincos.%c %s,fp%u:fp%u", "lsxpwdb?"[source_specifier],
+                    decode_ea_rw(M, _format_sizes[source_specifier]), e & 7, dest_register);
+        else
+            sprintf(output, "fsincos.x fp%u,fp%u:fp%u", source_specifier, e & 7, dest_register);
+    }
+    else if (_fmath_map[e] == inst_fmath_ftst) {
+        /* ftst.<fmt> <source> */
+        if (src_in_ea)
+            sprintf(output, "ftst.%c %s", "lsxpwdb?"[source_specifier],
+                    decode_ea_rw(M, _format_sizes[source_specifier]));
+        else {
+            /*
+             * The register being tested is the source register; the
+             * destination field isn't an operand of ftst.
+             */
+            sprintf(output, "ftst.x fp%u", source_specifier);
+        }
+    }
+    else {
+        /* f<inst>.<fmt> <source>,<dest> */
+        if (src_in_ea)
+            sprintf(output, "%s.%c %s,fp%u", _fmath_names[e], "lsxpwdb?"[source_specifier],
+                    decode_ea_rw(M, _format_sizes[source_specifier]), dest_register);
+        else
+            sprintf(output, "%s.x fp%u,fp%u", _fmath_names[e],
+                    source_specifier, dest_register);
+    }
+}
+
+/*
+ * Convert a float128 to the host's long double by way of floatx80.
+ *
+ * The floatx80 is copied byte-by-byte into the long double: bytes
+ * 0-7 take the mantissa (least significant byte first) and bytes 8-9
+ * take the sign/exponent word. Any further bytes of the long double
+ * are not written.
+ * NOTE(review): this presumes the host long double uses that 80-bit
+ * little-endian layout - confirm for non-x86 hosts.
+ *
+ * float_exception_flags is saved and restored around the conversion.
+ */
+static long double _float128_to_long_double(float128 f128)
+{
+    /* Keep the conversion from disturbing softfloat's exception flags */
+    const int8_t saved_flags = float_exception_flags;
+    const floatx80 f80 = float128_to_floatx80(f128);
+    float_exception_flags = saved_flags;
+    
+    long double result;
+    uint8_t *bytes = (uint8_t*)&result;
+    unsigned i;
+    
+    /* Mantissa: bytes 0..7, least significant first */
+    for (i = 0; i < 8; i++)
+        bytes[i] = (f80.low >> (8 * i)) & 0xff;
+    
+    /* Sign and exponent: bytes 8..9 */
+    bytes[8] = (f80.high >> 0) & 0xff;
+    bytes[9] = (f80.high >> 8) & 0xff;
+    
+    return result;
+}
+
+/*
+ * Copy a floatx80 into the host's long double, byte-by-byte: bytes
+ * 0-7 take the mantissa (least significant byte first) and bytes 8-9
+ * take the sign/exponent word. Any further bytes of the long double
+ * are not written.
+ * NOTE(review): this presumes the host long double uses that 80-bit
+ * little-endian layout - confirm for non-x86 hosts.
+ */
+static long double _floatx80_to_long_double(floatx80 f80)
+{
+    long double result;
+    uint8_t *bytes = (uint8_t*)&result;
+    unsigned i;
+    
+    /* Mantissa: bytes 0..7, least significant first */
+    for (i = 0; i < 8; i++)
+        bytes[i] = (f80.low >> (8 * i)) & 0xff;
+    
+    /* Sign and exponent: bytes 8..9 */
+    bytes[8] = (f80.high >> 0) & 0xff;
+    bytes[9] = (f80.high >> 8) & 0xff;
+    
+    return result;
+}
+
+static void inst_fmath (const uint16_t ext)
+{
+    fpu_get_state_ptr();
+    
+    floatx80 rounded_result;
+    
+    ~decompose(shoe.op, 1111 001 000 MMMMMM);
+    ~decompose(ext, 0 a 0 sss ddd eeeeeee);
+    
+    const uint8_t src_in_ea = a;
+    const uint8_t source_specifier = s;
+    const uint8_t dest_register = d;
+    const uint8_t extension = e;
+    
+    slog("FPU:---\n");
+    
+    /* Throw illegal instruction for 040-only ops */
+    if (_fmath_flags[e] & FMATH_TYPE_68040) {
+        _throw_illegal_instruction();
+        return;
+    }
+    
+    /*
+     * All the documented fmath ops have an implementation in
+     * _fmath_map[]. If it's NULL, it's not documented; throw
+     * an exception.
+     * This probably matches the 68040's behavior (I haven't
+     * checked), but the 68881 doesn't do this.
+     * 68881 throws an illegal instruction exception for all
+     * opcodes where the high (6th) bit of e is set.
+     * All other instructions seem to short circuit to the 
+     * nearest documented instruction.
+     * FIXME: consider implementing this behavior.
+     */
+    if (_fmath_map[e] == NULL) {
+        /* Unless this is fmovecr, where the extension doesn't matter */
+        if (!(src_in_ea && (source_specifier == 7))) {
+            _throw_illegal_instruction();
+            return ;
+        }
+    }
+    
+    /* We only need to load the dest reg for dyadic ops */
+    if (_fmath_flags[e] & FMATH_TYPE_DYADIC)
+        fpu->dest = floatx80_to_float128(fpu->fp[dest_register]);
+    
+    /*
+     * We'll shrink the precision and perform rounding
+     * just prior to writing back the result.
+     * Certain instructions override the precision
+     * in fpcr, so keep track of the prefered prec here.
+     */
+    enum rounding_precision_t rounding_prec = mc_prec;
+    
+    /*
+     * For all the intermediate calculations, we
+     * probably want to use nearest-rounding mode.
+     */
+    _set_rounding_mode(mode_nearest);
+    
+    /* Reset softfloat's exception flags */
+    float_exception_flags = 0;
+    
+    /* Reset fpsr's exception flags */
+    es_inex1 = 0; // this is only set for imprecisely-rounded packed inputs (not implemented)
+    es_inex2 = 0; // set if we ever lose precision (during the op or during rounding)
+    es_dz = 0;    // set if we divided by zero
+    es_unfl = 0;  // set if we underflowed (inex2 should be set too, I think)
+    es_ovfl = 0;  // set if we overflowed (inex2 should be set too, I think)
+    es_operr = 0; // set if there was an instruction specific operand error
+    es_snan = 0;  // Set if one of the inputs was a signaling NaN
+    es_bsun = 0;  // never set here
+    
+    fpu->write_back = 1; // let "do-write-back" be the default behavior
+    fpu->fmath_op = e;
+    
+    /* Handle fmovecr */
+    if (src_in_ea && (source_specifier == 7)) { // fmovecr
+        /* 
+         * 68kprm says M should be ~b(000000), but apparently
+         * any value will work for fmovecr
+         */
+        slog("FPU: fmovecr %u,fp%u\n", e, dest_register);
+        inst_fmath_fmovecr();
+        goto computation_done;
+    }
+    
+    /*
+     * Read in the source from the EA or from a register.
+     * In either case, convert the value to a float128,
+     * (that's our version of the 85-bit "intermediate" format)
+     */
+    if (src_in_ea) {
+        if (!_fpu_read_ea(source_specifier, &fpu->source))
+            return ;
+        slog("FPU: %s.%c ", _fmath_names[e], "lsxpwdb?"[source_specifier]);
+    }
+    else {
+        fpu->source = floatx80_to_float128(fpu->fp[source_specifier]);
+        slog("FPU: %s.x ", _fmath_names[e], "lsxpwdb?"[source_specifier]);
+    }
+
+    
+    {
+        long double tmp = _float128_to_long_double(fpu->source);
+        printf("%Lf,fp%u\n", tmp, dest_register);
+    }
+    
+    /*
+     * If the source is NaN, or this is a dyadic (two-operand)
+     * instruction, and the second operand (fpu->dest) is NaN,
+     * then the result is predetermined: NaN
+     */
+    if (float128_is_nan(fpu->source) ||
+             (((_fmath_flags[e] & FMATH_TYPE_DYADIC) &&
+               float128_is_nan(fpu->dest)))) {
+        _fmath_handle_nans();
+        goto computation_done;
+    }
+    
+    /* 
+     * Otherwise, call the extension-specific helper function.
+     * Guarantees: Neither source nor dest are NaN
+     *             SoftFloat's exception flags have been cleared
+     */
+    _fmath_map[e]();
+    
+    /* 
+     * At this point, the "computation"-phase (I forget what the correct
+     * 6888x term is) is over. Now we check exception bits, throw exceptions,
+     * compute condition codes, and round and store the result.
+     */
+computation_done:
+    
+    /* Convert the 128-bit result to the specified precision */
+    /*
+     * FIXME: If fpu->write_back==0, should we still go through rounding?
+     *        The condition codes will still need to be set. Should they
+     *        be set based on the intermediate result or rounded result?
+     */
+    rounded_result = _fmath_round_intermediate_result();
+    
+    
+    /* Update the accrued exception bits */
+    
+    assert(!es_bsun); // no fmath op can throw es_bsun
+    
+    ae_iop |= es_bsun | es_snan | es_operr;
+    ae_ovfl |= es_ovfl;
+    ae_unfl |= (es_unfl & es_inex2); // yes, &
+    ae_dz |= es_dz;
+    ae_inex |= es_inex1 | es_inex2 | es_ovfl;
+    
+    slog("FPU: bsun=%u snan=%u operr=%u ovfl=%u unfl=%u dz=%u inex1=%u inex2=%u\n",
+           es_bsun, es_snan, es_operr, es_ovfl, es_unfl, es_dz, es_inex1, es_inex2);
+    
+    /* Are any exceptions both set and enabled? */
+    if (fpu->fpsr.raw & fpu->fpcr.raw & 0x0000ff00) {
+        /* 
+         * Then we need to throw an exception.
+         * The exception is sent to the vector for
+         * the highest priority exception, and the priority
+         * order is (high->low) bsan, snan, operr, ovfl, unfl, dz, inex2/1
+         * (which is the order of the bits in fpsr/fpcr).
+         * Iterate over the bits in order, and throw the
+         * exception to whichever bit is set first.
+         */
+        uint8_t i, throwable = (fpu->fpsr.raw & fpu->fpcr.raw) >> 8;
+        
+        slog("FPU: throw exception! 0x%08x\n", throwable);
+        
+        assert(throwable);
+        for (i=0; 1; i++) {
+            if (throwable & 0x80)
+                break;
+            throwable <<= 1;
+        }
+        
+        /*
+         * Convert the exception bit position
+         * to the correct vector number, and throw
+         * a (pre-instruction) exception.
+         */
+        throw_fpu_pre_instruction_exception(_exception_bit_to_vector[i]);
+        
+        return ;
+    }
+    
+    /*
+     * Otherwise, no exceptions to throw!
+     * Calculate the condition codes from the result.
+     */
+    _fmath_set_condition_codes(rounded_result);
+    
+    /*
+     * We're definitely running to completion now,
+     * so commit ea-read changes
+     */
+    _fpu_read_ea_commit(source_specifier);
+    
+    /* Write back the result, and we're done! */
+    if (fpu->write_back) {
+        fpu->fp[dest_register] = rounded_result;
+        
+        long double tmp = _floatx80_to_long_double(rounded_result);
+        slog("FPU: result = %Lf\n", tmp);
+    }
+}
+
+#pragma mark Second-hop non-fmath instructions
+
+/*
+ * reg->mem fmove (fmath handles all other fmoves)
+ */
+static void inst_fmove (const uint16_t ext)
+{
+    fpu_get_state_ptr();
+    
+    ~decompose(shoe.op, 1111 001 000 MMMMMM); // M: the whole 6-bit EA field
+    ~decompose(shoe.op, 1111 001 000 mmmrrr); // m/r: the same field split in two (not used below)
+    ~decompose(ext, 011 fff sss KKKKKKK); // f: format selector, s: source fp register, K: presumably the k-factor (confirm)
+    
+    _fpu_write_ea(M, f, &fpu->fp[s], K); // the helper receives the EA, the format, the source value and K
+}
+
+static void inst_fmovem_control (const uint16_t ext)
+{
+    fpu_get_state_ptr();
+    
+    ~decompose(shoe.op,  1111 001 000 mmmrrr);
+    ~decompose(shoe.op,  1111 001 000 MMMMMM);
     ~decompose(ext, 10d CSI 0000000000);
     
     const uint32_t count = C + S + I;
     const uint32_t size = count * 4;
-    
-    if (count == 0) // I don't know if this is even a valid instruction
-        return ;
-    
-    if ((m == 0 || m == 1) && (count > 1)) { // data and addr reg modes are valid, but only if count==1
-        throw_illegal_instruction();
-        return ;
-    }
-    
     uint32_t addr, buf[3];
     uint32_t i;
     
+    /* I don't know if this is even a valid instruction */
+    if (count == 0)
+        return ;
+    
+    /* data and addr reg modes are valid, but only if count==1 */
+    if ((m == 0 || m == 1) && (count > 1)) {
+        _throw_illegal_instruction();
+        return ;
+    }
+    
     if (d) { // reg to memory
         i=0;
-        if (C) buf[i++] = shoe.fpcr.raw;
-        if (S) buf[i++] = shoe.fpsr.raw;
-        if (I) buf[i++] = shoe.fpiar;
-
+        if (C) buf[i++] = fpu->fpcr.raw;
+        if (S) buf[i++] = fpu->fpsr.raw;
+        if (I) buf[i++] = fpu->fpiar;
+        
         if (m == 0) {
-            shoe.d[r] = buf[0];
+            if (count == 1)
+                shoe.d[r] = buf[0];
+            else
+                _throw_illegal_instruction();
             return ;
         }
         else if (m == 1) {
-            shoe.a[r] = buf[0];
+            if ((count == 1) && I)
+                shoe.a[r] = buf[0];
+            else
+                _throw_illegal_instruction();
             return ;
         }
         else if (m == 3)
@@ -549,6 +2624,11 @@ void inst_fmovem_control(uint16_t op, uint16_t ext)
         else if (m == 4)
             addr = shoe.a[r] - size;
         else {
+            if ((m==7) && (r!=0 && r!=1)) {
+                /* Of the mode-7 EAs, only abs.W (r=0) and abs.L (r=1) may be written */
+                _throw_illegal_instruction();
+                return;
+            }
             call_ea_addr(M);
             addr = shoe.dat;
         }
@@ -560,10 +2640,20 @@ void inst_fmovem_control(uint16_t op, uint16_t ext)
         }
     }
     else { // mem to reg
-        if (m == 0) // data reg
-            buf[0] = shoe.d[r];
-        else if (m == 1) // addr reg
-            buf[0] = shoe.a[r];
+        if (m == 0) { // data reg: load buf[0], then fall through to the register stores below
+            if (count != 1) {
+                _throw_illegal_instruction();
+                return;
+            }
+            buf[0] = shoe.d[r];
+        }
+        else if (m == 1) { // addr reg: only a lone fpiar transfer is accepted
+            if ((count != 1) || !I) {
+                _throw_illegal_instruction();
+                return;
+            }
+            buf[0] = shoe.a[r];
+        }
         else {
             if (m == 3) // post-increment
                 addr = shoe.a[r];
@@ -586,22 +2676,22 @@ void inst_fmovem_control(uint16_t op, uint16_t ext)
         i = 0;
         
         if (C) {
-            uint8_t round = shoe.fpcr.b.mc_rnd;
-            shoe.fpcr.raw = buf[i++];
-            uint8_t newround = shoe.fpcr.b.mc_rnd;
+            uint8_t round = fpu->fpcr.b._mc_rnd;
+            fpu->fpcr.raw = buf[i++];
+            uint8_t newround = fpu->fpcr.b._mc_rnd;
             
             if (round != newround) {
                 slog("inst_fmovem_control: HEY: round %u -> %u\n", round, newround);
             }
         }
-        if (S) shoe.fpsr.raw = buf[i++];
-        if (I) shoe.fpiar = buf[i++];
+        if (S) fpu->fpsr.raw = buf[i++];
+        if (I) fpu->fpiar = buf[i++];
         
         // Commit immediate-EA-mode PC change
         if (M == 0x3c)
             shoe.pc += size;
     }
-        
+    
     // Commit pre/post-inc/decrement
     
     if (m == 3)
@@ -612,133 +2702,16 @@ void inst_fmovem_control(uint16_t op, uint16_t ext)
     
     
     slog("inst_fmove_control: notice: (EA = %u/%u %08x CSI = %u%u%u)\n", m, r, (uint32_t)shoe.dat, C, S, I);
+    
+    
 }
 
-void dis_fmovem_control(uint16_t op, uint16_t ext)
+static void inst_fmovem (const uint16_t ext)
 {
-    ~decompose(op,  1111 001 000 mmmrrr);
-    ~decompose(op,  1111 001 000 MMMMMM);
-    ~decompose(ext, 10d CSI 0000000000);
+    fpu_get_state_ptr();
     
-    if (d)
-        sprintf(dis.str, "fmovem.l [%u%u%u],%s\n", C, S, I, decode_ea_addr(M)); // <- XXX: decode_ea_addr() is the wrong function to use
-    else
-        sprintf(dis.str, "fmovem.l %s,[%u%u%u]\n", decode_ea_addr(M), C, S, I); // <- XXX: decode_ea_addr() is the wrong function to use
-}
-
-
-static uint8_t fpu_test_cc(uint8_t cc)
-{
-    const uint8_t z = shoe.fpsr.b.cc_z;
-    const uint8_t n = shoe.fpsr.b.cc_n;
-    const uint8_t nan = shoe.fpsr.b.cc_nan;
-    
-    switch (cc & 0x0f) {
-        case 0: // false
-            return 0;
-        case 1: // equal
-            return z;
-        case 2: // greater than
-            return !(nan || z || n);
-        case 3: // greater than or equal
-            return z || !(nan || n);
-        case 4: // less than
-            return n && !(nan || z);
-        case 5: // less than or equal
-            return z || (n && !nan);
-        case 6: // greater or less than
-            return !(nan || z);
-        case 7: // ordered
-            return !nan;
-        case 8: // unordered
-            return nan;
-        case 9: // not (greater or less than)
-            return nan || z;
-        case 10: // not (less than or equal)
-            return nan || !(n || z);
-        case 11: // not (less than)
-            return nan || (z || !n);
-        case 12: // not (greater than or equal)
-            return nan || (n && !z);
-        case 13: // not (greater than)
-            return nan || z || n;
-        case 14: // not equal
-            return !z;
-        case 15: // true
-            return 1;
-    }
-    
-    assert(0);
-    return 0;
-}
-
-void inst_fbcc(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 01 s 0bcccc); // b => raise BSUN if NaN
-    
-    uint32_t displacement;
-    if (s) {
-        const uint16_t ext2 = nextword();
-        displacement = (ext << 16) | ext2;
-    }
-    else {
-        const int16_t tmp = ext;
-        const int32_t tmp2 = tmp;
-        displacement = tmp2;
-    }
-    
-    if (b) {
-        slog("inst_fbcc: fixme: Got a CC that wants to set BSUN, not implemented\n");
-        //assert(0); // FIXME: implement BSUN, or uncomment this
-    }
-    
-    if (fpu_test_cc(c)) {
-        const uint32_t addr = shoe.orig_pc + 2 + displacement;
-        shoe.pc = addr;
-    }
-}
-
-const char *fpu_cc_names[32] = {
-    "f", "eq", "ogt", "oge", "olt", "ole", "ogl", "or",
-    "un", "ueq", "ugt", "uge", "ult", "ule", "ne", "t",
-    "sf", "seq", "gt", "ge", "lt", "le", "gl", "gle",
-    "ngle", "ngl", "nle", "nlt", "nge", "ngt", "sne", "st"
-};
-
-void dis_fbcc(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 01 s 0ccccc); // only the low 5 bits of cc are significant
-    
-    uint32_t displacement;
-    if (s) {
-        const uint16_t ext2 = dis_next_word();
-        displacement = (ext << 16) | ext2;
-    }
-    else {
-        const int16_t tmp = ext;
-        const int32_t tmp2 = tmp;
-        displacement = tmp2;
-    }
-    
-    const uint32_t addr = dis.orig_pc + 2 + displacement;
-    
-    sprintf(dis.str, "fb%s.%c *0x%08x", fpu_cc_names[c], "wl"[s], addr);
-}
-
-static void reverse_order(uint8_t *buf, const uint32_t size)
-{
-    uint32_t i;
-    for (i=0; i < (size/2); i++) {
-        const uint8_t tmp = buf[i];
-        buf[i] = buf[size-(1+i)];
-        buf[size-(1+i)] = tmp;
-    }
-}
-
-void inst_fmovem(uint16_t op, uint16_t ext)
-{
-    ~decompose(op,  1111 001 000 mmmrrr);
-    ~decompose(op,  1111 001 000 MMMMMM);
+    ~decompose(shoe.op,  1111 001 000 mmmrrr);
+    ~decompose(shoe.op,  1111 001 000 MMMMMM);
     ~decompose(ext, 11 d ps 000 LLLLLLLL); // Static register mask
     ~decompose(ext, 11 0 00 000 0yyy0000); // Register for dynamic mode
     
@@ -789,9 +2762,9 @@ void inst_fmovem(uint16_t op, uint16_t ext)
                 continue;
             
             uint8_t buf[12];
-            x87_to_motorola(shoe.fp[i], buf);
+            _floatx80_to_extended(&fpu->fp[i], buf);
             
-            slog("inst_fmovem: writing %Lf from fp%u", shoe.fp[i], i);
+            // slog("inst_fmovem: writing %Lf from fp%u", fpu->fp[i], i);
             uint32_t j;
             for (j=0; j<12; j++) {
                 slog(" %02x", buf[j]);
@@ -817,9 +2790,9 @@ void inst_fmovem(uint16_t op, uint16_t ext)
                 if (shoe.abort)
                     return ;
             }
-            shoe.fp[i] = motorola_to_x87(buf);
+            _extended_to_floatx80(buf, &fpu->fp[i]);
             
-            slog("inst_fmovem: read %Lf to fp%u\n", shoe.fp[i], i);
+            // slog("inst_fmovem: read %Lf to fp%u\n", shoe.fp[i], i);
         }
     }
     
@@ -828,712 +2801,305 @@ void inst_fmovem(uint16_t op, uint16_t ext)
         shoe.a[r] += size;
     else if (m == 4)
         shoe.a[r] -= size;
-    
-    //slog("inst_fmovem: notice: not implemented (EA = %u/%u, mask=0x%02x)\n", m, r, mask);
-    
 }
 
-void dis_fmovem(uint16_t op, uint16_t ext)
-{
-    ~decompose(op,  1111 001 000 mmmrrr);
-    ~decompose(op,  1111 001 000 MMMMMM);
-    ~decompose(ext, 11 d ps 000 LLLLLLLL); // Static register mask
-    ~decompose(ext, 11 0 00 000 0yyy0000); // Register for dynamic mode
-    
-    sprintf(dis.str, "fmovem ???");
-}
 
-enum {
-    format_L = 0,
-    format_S = 1,
-    format_X = 2,
-    format_Ps = 3,
-    format_W = 4,
-    format_D = 5,
-    format_B = 6,
-    format_Pd = 7
-} fpu_formats;
-/*
- * 0 L     long word integer
- * 1 S     single precision real
- * 2 X     extended precision real
- * 3 P{#k} packed decimal real with static k factor
- * 4 W     word integer
- * 5 D     double precision real
- * 6 B     byte integer
- * 7 P{Dn} packed decimal real with dynamic k factor
+#pragma mark First-hop decoder table inst implementations
+/* 
+ * The table generated by decoder_gen.c will refer directly
+ * to these instructions. inst_fpu_other() will handle all
+ * other FPU instructions.
  */
 
-static void fpu_read_ea_commit(uint8_t mr, uint8_t format)
+static _Bool fpu_test_cc(uint8_t cc)
 {
-    const uint8_t m = mr >> 3;
-    const uint8_t r = mr & 7;
-    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+    fpu_get_state_ptr();
+    const _Bool z = cc_z;
+    const _Bool n = cc_n;
+    const _Bool nan = cc_nan;
     
-    if (m == 3)
-        shoe.a[r] += sizes[format];
-    else if (m == 4)
-        shoe.a[r] -= sizes[format];
+    switch (cc & 0x0f) { // only the low four bits pick the predicate; higher bits are ignored here
+        case 0: // false
+            return 0;
+        case 1: // equal
+            return z;
+        case 2: // greater than: not NaN, not zero, not negative
+            return !(nan | z | n);
+        case 3: // greater than or equal
+            return z | !(nan | n);
+        case 4: // less than: negative, and neither NaN nor zero
+            return n & !(nan | z);
+        case 5: // less than or equal
+            return z | (n & !nan);
+        case 6: // greater or less than
+            return !(nan | z);
+        case 7: // ordered
+            return !nan;
+        case 8: // unordered
+            return nan;
+        case 9: // not (greater or less than)
+            return nan | z;
+        case 10: // not (less than or equal)
+            return nan | !(n | z);
+        case 11: // not (less than)
+            return nan | (z | !n);
+        case 12: // not (greater than or equal)
+            return nan | (n & !z);
+        case 13: // not (greater than)
+            return nan | z | n;
+        case 14: // not equal
+            return !z;
+        case 15: // true
+            return 1;
+    }
+    
+    assert(0); // unreachable: every value of (cc & 0x0f) returns from the switch above
+    return 0;
 }
 
-// Note: fpu_read_ea modifies shoe.pc, fpu_read_ea_commit modies shoe.a[r] for pre/post-inc/decrement
-static long double fpu_read_ea(uint8_t mr, uint8_t format)
-{
-    const uint8_t m = mr >> 3;
-    const uint8_t r = mr & 7;
-    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+void inst_fscc () {
+    fpu_get_state_ptr();
     
-    long double data, result;
+    // fscc can throw an exception, so record this instruction's address in fpiar
+    fpu->fpiar = shoe.orig_pc;
+    
+    const uint16_t ext = nextword(); // extension word carrying the condition
+    
+    ~decompose(shoe.op, 1111 001 001 MMMMMM);
+    ~decompose(ext, 0000 0000 000 b cccc); // b: BSUN-on-NaN bit, c: 4-bit predicate for fpu_test_cc()
+    
+    /*
+     * inst_f*cc instructions throw a pre-instruction exception
+     * if b && cc_nan
+     */
+    if (b && _bsun_test()) // presumably _bsun_test() raises the exception itself -- confirm
+        return ;
+    
+    shoe.dat = fpu_test_cc(c) ? 0xff : 0; // 0xff when the predicate holds, otherwise 0
+    
+    call_ea_write(M, 1); // presumably stores shoe.dat to the EA as a single byte -- confirm
+}
+
+/*
+ * fbcc: branch on an FPU condition. The displacement is one extension word,
+ * or two when s is set, and is added to orig_pc + 2 when the test passes.
+ */
+void inst_fbcc () {
+    fpu_get_state_ptr();
+    
+    // fbcc can throw an exception
+    fpu->fpiar = shoe.orig_pc;
+    
+    ~decompose(shoe.op, 1111 001 01 s 0bcccc); // b => raise BSUN if NaN
+    const uint8_t sz = 2 << s; // displacement size in bytes: 2 or 4
+    
+    /* inst_f*cc instructions throw a pre-instruction exception if b && cc_nan */
+    if (b && _bsun_test())
+        return ;
+    
+    if (fpu_test_cc(c)) {
+        const uint16_t ext = nextword();
+        uint32_t displacement;
+        if (s) {
+            const uint16_t ext2 = nextword();
+            /* Widen before shifting: (int)ext << 16 overflows when bit 15 is set */
+            displacement = ((uint32_t)ext << 16) | ext2;
+        }
+        else
+            displacement = (int16_t)ext; // sign-extend the 16-bit displacement
+        shoe.pc = shoe.orig_pc + 2 + displacement;
+    }
+    else
+        shoe.pc += sz; // not taken: skip over the unread displacement word(s)
+}
+
+void inst_fsave () {
+    fpu_get_state_ptr();
+    verify_supervisor();
+    
+    // Don't modify fpiar for fsave
+    
+    ~decompose(shoe.op, 1111 001 100 MMMMMM);
+    ~decompose(shoe.op, 1111 001 100 mmmrrr);
+    
+    const uint32_t size = 0x1c; // IDLE frame
+    const uint16_t frame_header = 0xfd18;
     uint32_t addr;
     
-    // If mode==a-reg, or mode==data reg and the size is > 4 bytes, no dice
-    if ((m == 1) ||
-        ((m == 0) && (sizes[format] > 4))) {
-        throw_illegal_instruction();
-        return 0.0;
+    if (m == 4)
+        addr = shoe.a[r] - size;
+    else {
+        call_ea_addr(M);
+        addr = shoe.dat;
     }
     
-    switch (m) {
-        case 0: {
-            if (format == format_S) {
-                float tmp = shoe.d[r];
-                data = tmp;
-            }
-            else if (format == format_B) {
-                int8_t tmp = shoe.d[r];
-                data = tmp;
-            }
-            else if (format == format_W) {
-                int16_t tmp = shoe.d[r];
-                data = tmp;
-            }
-            else if (format == format_L) {
-                int32_t tmp = shoe.d[r];
-                data = tmp;
-            }
-            
-            goto got_data;
-        }
-            
-        case 3:
-            addr = shoe.a[r];
-            assert(!( r==7 && sizes[format]==1));
-            goto got_address;
-            
-        case 4:
-            addr = shoe.a[r] - sizes[format];
-            assert(!( r==7 && sizes[format]==1));
-            goto got_address;
-            
-        case 7:
-            if (r == 4) {
-                addr = shoe.pc;
-                shoe.pc += sizes[format];
-                goto got_address;
-            }
-            
-            // fall through to default:
-            
-        default: {
-            
-            shoe.mr=mr;
-            ea_addr();
-            if (shoe.abort)
-                return 0.0;
-            
-            addr = (uint32_t)shoe.dat;
-            goto got_address;
-        }
-    }
+    lset(addr, 2, frame_header);
+    if (shoe.abort)
+        return ;
     
-got_address:
+    if (m == 4)
+        shoe.a[r] = addr;
     
-    {
-        uint8_t buf[12];
-        uint8_t *ptr = &buf[sizes[format]];
-        uint32_t i;
-        
-        slog("inst_f fpu_read_ea: format=%u, data =", format);
-        for (i=0; i<sizes[format]; i++) {
-            ptr--;
-            *ptr = lget(addr+i, 1);
-            slog(" %02x", *ptr);
-            if (shoe.abort)
-                return 0.0;
-        }
-        
-        switch (format) {
-            case format_B: {
-                int8_t tmp = ptr[0];
-                data = tmp;
-                break;
-            }
-            case format_W: {
-                int16_t tmp = *(int16_t*)ptr;
-                data = tmp;
-                break;
-            }
-            case format_L: {
-                int32_t tmp = *(int32_t*)ptr;
-                data = tmp;
-                break;
-            }
-            case format_S: {
-                float tmp = *(float*)ptr;
-                data = tmp;
-                break;
-            }
-            case format_D: {
-                double tmp = *(double*)ptr;
-                data = tmp;
-                break;
-            }
-            case format_X: {
-                reverse_order(ptr, 12);
-                data = motorola_to_x87(ptr);
-                break;
-            }
-            default: {
-                assert(!"unsupported format (packed something)");
-            }
-        }
-    }
-    
-got_data:
-    
-    fpu_set_round();
-    result = data;
-    fpu_reset_round();
-    slog(" data=%Lf result=%Lf\n", data, result);
-    return result;
 }
 
-
-static void fpu_write_ea(uint8_t mr, uint8_t format, long double orig_data)
-{
-    fpu_set_round();
-    const long double data = orig_data;
-    fpu_reset_round();
+void inst_frestore () {
+    fpu_get_state_ptr();
+    verify_supervisor();
     
-    const uint8_t m = mr >> 3;
-    const uint8_t r = mr & 7;
-    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
-    uint8_t buf[12], *ptr = &buf[0];
-    uint32_t addr, i;
+    // Don't modify fpiar for frestore
     
-    // If mode==a-reg, or mode==data reg and the size is > 4 bytes, no dice
-    if ((m == 1) ||
-        ((m == 0) && (sizes[format] > 4))) {
-        throw_illegal_instruction();
+    ~decompose(shoe.op, 1111 001 101 MMMMMM);
+    ~decompose(shoe.op, 1111 001 101 mmmrrr);
+    
+    uint32_t addr, size;
+    
+    if (m == 3)
+        addr = shoe.a[r];
+    else {
+        call_ea_addr(M);
+        addr = shoe.dat;
+    }
+    
+    const uint16_t word = lget(addr, 2);
+    if (shoe.abort) return ;
+    
+    // XXX: These frame sizes are different on 68881/68882/68040
+    if ((word & 0xff00) == 0x0000)
+        size = 4; // NULL state frame
+    else if ((word & 0xff) == 0x0018)
+        size = 0x1c; // IDLE state frame
+    else if ((word & 0xff) == 0x00b4)
+        size = 0xb8; // BUSY state frame
+    else {
+        slog("Frestore encountered an unknown state frame 0x%04x\n", word);
+        assert(!"inst_frestore: bad state frame");
         return ;
     }
     
-    slog("inst_f fpu_write_ea EA=%u/%u data=%Lf format=%u\n", m, r, data, format);
-    
-    // Convert to the appropriate format
-    
-    switch (format) {
-        case format_B: {
-            int8_t tmp = data;
-            *((int8_t*)ptr) = tmp;
-            goto write_to_mem;
-        }
-        case format_W: {
-            int16_t tmp = data;
-            *((int16_t*)ptr) = tmp;
-            slog("inst_f fpu_write_ea formatted=%u (0x%04x)\n", *((int16_t*)ptr), *((uint16_t*)ptr));
-            break;
-        }
-        case format_L: {
-            int32_t tmp = data;
-            *((int32_t*)ptr) = tmp;
-            break;
-        }
-        case format_S: {
-            float tmp = data;
-            *((float*)ptr) = tmp;
-            break;
-        }
-        case format_D: {
-            double tmp = data;
-            *((double*)ptr) = tmp;
-            break;
-        }
-        case format_X: {
-            x87_to_motorola(data, ptr);
-            goto write_to_mem; // ptr is already big endian
-        }
-        default: {
-            assert(!"unsupported format (packed something)");
-        }
+    if (m==3) {
+        shoe.a[r] += size;
+        slog("frestore: changing shoe.a[%u] += %u\n", r, size);
     }
-
-swap_order:
-    reverse_order(buf, sizes[format]);
-    
-    
-write_to_mem:
-    // Lookup the EA
-
-    switch (m) {
-        case 0: {
-            if (format == format_B) {
-                int8_t tmp = data;
-                set_d(r, tmp, 1);
-            }
-            else if (format == format_W) {
-                int16_t tmp = data;
-                set_d(r, tmp, 2);
-            }
-            else if (format == format_L) {
-                int32_t tmp = data;
-                shoe.d[r] = tmp;
-            }
-            else if (format == format_S) {
-                float tmp = data;
-                *((float*)&shoe.d[r]) = tmp;
-            }
-            
-            goto done;
-        }
-        case 3: // post-increment
-            addr = shoe.a[r];
-            assert(!( r==7 && sizes[format]==1));
-            break;
-        case 4: // pre-decrement
-            addr = shoe.a[r] - sizes[format];
-            assert(!( r==7 && sizes[format]==1));
-            break;
-        default:
-            call_ea_addr(mr);
-            addr = (uint32_t)shoe.dat;
-            break;
-    }
-    
-    // Copy the formatted data into the EA
-    slog("inst_f fpu_write_ea: addr=0x%08x\n", addr);
-    for (i=0; i < sizes[format]; i++) {
-        lset(addr + i, 1, buf[i]);
-        if (shoe.abort)
-            return ;
-    }
-
-done: // set condition codes and update pre/post-inc/decrement registers
-    
-    // Set condition codes
-    shoe.fpsr.raw &= 0x00ffffff;
-    shoe.fpsr.b.cc_nan = (0 != isnan(data));
-    if (!shoe.fpsr.b.cc_nan) {
-        shoe.fpsr.b.cc_n = (0 != signbit(data));
-        if (isinf(data))
-            shoe.fpsr.b.cc_i = 1;
-        else
-            shoe.fpsr.b.cc_z = (data == 0.0);
-    }
-    
-    if (m == 3)
-        shoe.a[r] += sizes[format];
-    else if (m == 4)
-        shoe.a[r] -= sizes[format];
 }
 
-void inst_fmove(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 000 MMMMMM);
-    ~decompose(op, 1111 001 000 mmmrrr);
-    ~decompose(ext, 0 E V aaa zzz KKKKKKK);
+void inst_fdbcc () {
+    fpu_get_state_ptr();
+    ~decompose(shoe.op, 1111 001 001 001 rrr);
     
-    const uint8_t format = a;
+    // fdbcc can throw an exception
+    fpu->fpiar = shoe.orig_pc;
     
-    if (K == ~b(1000100) || K == ~b(1000000)) {
-        assert(!"inst_fmove: This is either a K-value, or somebody called fmove and specified the secret precision bits");
-    }
+    const uint16_t ext = nextword();
+    ~decompose(ext, 0000 0000 000 b cccc);
     
-    // E==0 => Don't use EA (reg->reg)
-    // E==1 => Use EA
-    // V==0 => reg->reg or mem->reg
-    // V==1 => reg->mem
+    /*
+     * inst_f*cc instructions throw a pre-instruction exception
+     * if b && cc_nan
+     */
+    if (b && _bsun_test())
+        return ;
     
-    // Load the source value into 'data'
-    
-    long double data;
-    
-    if (E && !V) { // mem -> reg
-        data = fpu_read_ea(M, format);
-        if (shoe.abort)
-            return ;
-    }
-    else if (!E) { // reg -> mem
-        data = shoe.fp[a];
-    }
-    else { // reg -> reg
-        data = shoe.fp[z];
-    }
-    
-    
-    // XXX: Check for exceptions?
-    
-    // Write the result
-    
-    if (E && V) { // reg -> mem
-        fpu_write_ea(M, format, data);
-        if (shoe.abort)
-            return ;
-    }
-    else if (!V) { // mem -> reg
-        fpu_set_reg_cc(data, z);
-        fpu_read_ea_commit(M, format);
-    }
-    else { // reg -> reg
-        fpu_set_reg_cc(data, z);
-    }
-    
-    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
-    slog("inst_fmove src=%Lf size=%u a=%u z=%u to-mem=%u useEA=%u EA = %u/%u\n", data, sizes[format], a, z, V, E, m, r);
-}
-
-void dis_fnop(uint16_t op, uint16_t ext)
-{
-    sprintf(dis.str, "fnop");
-}
-
-void inst_fnop(uint16_t op, uint16_t ext)
-{
-}
-
-void dis_fmove(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 000 MMMMMM);
-    ~decompose(op, 1111 001 000 mmmrrr);
-    ~decompose(ext, 0 E V aaa bbb KKKKKKK);
-    
-    // E==0 => reg to reg
-    // E==1 => mem to reg / reg to mem
-    // V==0 => reg->reg or mem->reg
-    // V==1 => reg->mem
-    
-    
-    sprintf(dis.str, "fmove ???");
-    
-}
-
-void dis_fmath(uint16_t op, uint16_t ext)
-{
-    sprintf(dis.str, "fmath ??");
-}
-
-static void fpu_set_fpsr_quotient(long double a, long double b, long double result)
-{
-    // Thanks for being super vague on the meaning of this register, 68881 documentation
-    
-    const long double quo = truncl((a - result) / b);
-    const uint8_t sign = signbit(quo);
-    const uint64_t quo_int = fabsl(quo);
-    
-    shoe.fpsr.b.qu_quotient = quo_int & 0x7f;
-    shoe.fpsr.b.qu_s = sign;
-}
-
-void inst_fmath(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 000 MMMMMM);
-    ~decompose(ext, 0 a 0 sss ddd eeeeeee);
-    
-    const uint8_t src_in_ea = a;
-    const uint8_t source_specifier = s;
-    const uint8_t dest_register = d;
-    const uint8_t extension = e;
-    
-    uint8_t do_write_back_result = 1;
-    
-    long double source, dest, result;
-    
-    if (src_in_ea) {
-        source = fpu_read_ea(M, source_specifier);
-        slog("inst_fmath: source = %u/%u = %Lf", M>>3, M&7, source);
-        if ((M>>3) == 3)
-            slog(" a[%u]=0x%08x", M&7, shoe.a[M&7]);
-        
-        if (shoe.abort)
-            return ;
+    if (fpu_test_cc(c)) {
+        shoe.pc += 2;
     }
     else {
-        source = shoe.fp[source_specifier];
-        slog("inst_fmath: source = fp%u = %Lf", source_specifier, source);
-    }
-    
-    dest = shoe.fp[dest_register];
-    slog("  dest = fp%u = %Lf\n", dest_register, dest);
-    
-    switch (e) {
-        case ~b(0000001): {// fpu_inst_fint
-            const uint8_t dir = shoe.fpcr.b.mc_rnd;
-            
-            // {FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD};
-            
-            if (dir == 0)
-                result = roundl(source);
-            else if (dir == 1)
-                result = truncl(source);
-            else if (dir == 2)
-                result = floorl(source);
-            else
-                result = ceill(source);
-            
-            slog("inst_fint: source = %Lf result = %Lf round=%u\n", source, result, dir);
-            
-            break;
-        }
-        case ~b(0000010): assert(!"fpu_inst_fsinh;");
-        case ~b(0000011): // fpu_inst_fintrz
-            slog("inst_fintrz dest = %Lf source = %Lf\n", dest, source);
-            result = truncl(source);
-            break;
-            
-        case ~b(0000110): // flognp1
-            slog("inst_flognp1 dest = %Lf source = %Lf\n", dest, source);
-            assert(source > -1.0);
-            result = log1pl(source);
-            break;
-        case ~b(0001000): assert(!"fpu_inst_fetoxm1;");
-        case ~b(0001001): assert(!"fpu_inst_ftanh;");
-        case ~b(0001010): // fatan
-            slog("inst_fatan dest = %Lf source = %Lf\n", dest, source);
-            result = atanl(source);
-            break;
-            
-        case ~b(0001100): assert(!"fpu_inst_fasin;");
-        case ~b(0001101): assert(!"fpu_inst_fatanh;");
-        case ~b(0001110): // fsin
-            slog("inst_fsin dest = %Lf source = %Lf\n", dest, source);
-            result = sinl(source);
-            break;
-        case ~b(0001111): assert(!"fpu_inst_ftan;");
-        case ~b(0010000): // fetox
-            slog("inst_fetox dest = %Lf source = %Lf\n", dest, source);
-            result = expl(source);
-            break;
-        case ~b(0010001): assert(!"fpu_inst_ftwotox;");
-        case ~b(0010010): assert(!"fpu_inst_ftentox;");
-        case ~b(0010100): assert(!"fpu_inst_flogn;");
-        case ~b(0010101): assert(!"fpu_inst_flog10;");
-        case ~b(0010110): assert(!"fpu_inst_flog2;");
-        case ~b(0011001): assert(!"fpu_inst_fcosh;");
-        case ~b(0011100): assert(!"fpu_inst_facos;");
-        case ~b(0011101): // fcos
-            slog("fpu_inst_fcos dest = %Lf source = %Lf\n", dest, source);
-            result = cosl(source);
-            break;
-            
-        case ~b(0011110): {// fpu_inst_fgetexp
-            if (!((source > 0) || (source < 0)))
-                result = source; // positive or negative zero
-            else if (!isfinite(source)) {
-                assert(!"fgetexp: isinfl(source)");
-                // returns NAN and an exception bit - not implemented for the moment
-            }
-            else {
-                // Extract the debiased exponent from the 80-bit source
-                uint8_t motorola[12];
-                x87_to_motorola(source, motorola);
-                int32_t exp = (motorola[0] & 0x7f) << 8;
-                exp |= motorola[1];
-                exp -= 16383; // debias
-                result = exp;
-            }
-            break;
-        }
-        case ~b(0011111): assert(!"fpu_inst_fgetman;");
-        case ~b(0100001):
-            // don't forget to set fpu_set_fpsr_quotient();
-            assert(!"fpu_inst_fmod;");
-        
-        case ~b(0100100): assert(!"fpu_inst_fsgldiv");
-            
-        case ~b(0100101): { // fpu_inst_frem
-            assert(source != 0.0);
-            result = remainderl(dest, source);
-            fpu_set_fpsr_quotient(dest, source, result);
-            slog("inst_frem: dest = %Lf source = %Lf quot = %u result = %Lf\n", dest, source, shoe.fpsr.b.qu_quotient, result);
-            break;
-        }
-        case ~b(0100110): assert(!"fpu_inst_fscale;");
-            
-        case ~b(0111000): { // fpu_inst_fcmp
-            const long double diff = dest - source;
-            slog("inst_fcmp: dest = %Lf source = %Lf\n", dest, source);
-            fpu_set_cc(diff);
-            do_write_back_result = 0; // don't write result back to register
-            break;
-        }
-        case ~b(0111010): { // fpu_inst_ftst
-            slog("fpu_inst_ftst: dest = %Lf\n");
-            fpu_set_cc(source);
-            do_write_back_result = 0; // don't write result back to register
-            break;
-        }
-            
-        case ~b(1011100):
-        case ~b(1011000):
-            assert(!"inst_fabs: can't handle");
-        case ~b(0011000):// fpu_inst_fabs
-            result = fabsl(source);
-            slog("inst_fabs: source=%Lf result=%Lf\n", source, result);
-            break;
-            
-        case ~b(1100010):
-        case ~b(1100110):
-            assert(!"can't handle");
-        case ~b(0100010): { // fpu_inst_fadd
-            slog("inst_fadd dest = %Lf source = %Lf\n", dest, source);
-            result = dest + source;
-            break;
-        }
-            
-        case ~b(1100000):
-        case ~b(1100100):
-            assert(!"can't handle");
-        case ~b(0100000): { // fpu_inst_fdiv
-            assert(source != 0.0);
-            slog("inst_fdiv dest = %Lf source = %Lf\n", dest, source);
-            
-            result = dest / source;
-            break;
-        }
-            
-            
-        case ~b(1100011):
-        case ~b(1100111):
-            assert(!"can't handle");
-        case ~b(0100011): { // fpu_inst_fmul
-            slog("inst_fmul dest = %Lf source = %Lf\n", dest, source);
-            result = source * dest;
-            break;
-        }
-            
-        case ~b(1011010):
-        case ~b(1011110):
-            assert(!"fneg: can't handle");
-        case ~b(0011010): // fneg
-            slog("inst_fneg dest = %Lf source = %Lf\n", dest, source);
-            result = -source;
-            break;
-            
-        case ~b(1000001):
-        case ~b(1000101):
-            assert(!"can't handle");
-        case ~b(0000100): { // fpu_inst_fsqrt
-            slog("inst_fsqrt dest = %Lf source = %Lf\n", dest, source);
-            result = sqrtl(source);
-            break;
-        }
-            
-        case ~b(1101000):
-        case ~b(1101100):
-            assert(!"can't handle");
-        case ~b(0101000): { // fpu_inst_fsub
-            slog("inst_fsub dest = %Lf source = %Lf\n", dest, source);
-            result = dest - source;
-            break;
-        }
-            
-        case ~b(0110000) ... ~b(0110111):
-            assert(!"fpu_inst_fsincos;");
-        
-        default:
-            assert(!"inst_fmath: unknown instruction");
-    }
-    
-    // Finalize the read, if source was in memory
-    if (src_in_ea) {
-        fpu_read_ea_commit(M, source_specifier);
-    }
-    
-    // Only write back the result if necessary (fcmp doesn't do this)
-    if (do_write_back_result) {
-        slog("inst_fmath: result = %Lf\n", result);
-        fpu_set_reg_cc(result, dest_register);
+        const int16_t disp = nextword();
+        const uint16_t newd = get_d(r, 2) - 1;
+        set_d(r, newd, 2);
+        if (newd != 0xffff)
+            shoe.pc = shoe.orig_pc + 2 + disp;
     }
 }
 
+void inst_ftrapcc () { // FTRAPcc: trap through vector 7 when FPU condition cccc is true
+    fpu_get_state_ptr();
+    ~decompose(shoe.op, 1111 001 001 111 xyz);
+    
+    // ftrapcc can throw an exception
+    fpu->fpiar = shoe.orig_pc;
+    
+    // Size of the optional operand that follows the condition word:
+    // (xyz) == (100) -> sz=0, (010) -> sz=2, (011) -> sz=4
+    const uint32_t sz = y << (z+1);
+    
+    const uint16_t ext = nextword();
+    ~decompose(ext, 0000 0000 000 b cccc);
+    
+    // shoe.pc is now past the condition word; skip the operand bytes too.
+    // (orig_pc + 2 + sz pointed back at the condition word itself.)
+    const uint32_t next_pc = shoe.pc + sz;
+    
+    // inst_f*cc instructions throw a pre-instruction exception if b && cc_nan
+    // NOTE(review): cc_nan is not tested at this call site - confirm _bsun_test() checks it
+    if (b && _bsun_test())
+        return ;
+    
+    if (fpu_test_cc(c))
+        throw_frame_two(shoe.sr, next_pc, 7, shoe.orig_pc);
+    else
+        shoe.pc = next_pc;
+}
 
+void inst_fnop() {
+    // fnop is technically an fbcc encoding, so just run the fbcc handler
+    inst_fbcc();
+}
 
+void inst_fpu_other () { // Dispatch on the top 3 bits (ccc) of the extension word
+    fpu_get_state_ptr();
+    ~decompose(shoe.op, 1111 001 000 MMMMMM);
+    
+    const uint16_t ext = nextword();
+    ~decompose(ext, ccc xxx yyy eeeeeee);
+    
+    switch (c) {
+        case 0: // Reg to reg
+            fpu->fpiar = shoe.orig_pc; // fmath() can throw an exception
+            inst_fmath(ext);
+            return;
+            
+        case 1: // unused
+            _throw_illegal_instruction();
+            return;
+            
+        case 2: // Memory->reg & movec
+            fpu->fpiar = shoe.orig_pc; // fmath() can throw an exception
+            inst_fmath(ext);
+            return;
+            
+        case 3: // reg->mem
+            fpu->fpiar = shoe.orig_pc; // fmove() can throw an exception
+            inst_fmove(ext);
+            return;
+            
+        case 4: // mem -> sys ctl registers
+        case 5: // sys ctl registers -> mem
+            // fmovem_control() cannot throw an FPU exception (don't modify fpiar)
+            inst_fmovem_control(ext);
+            return;
+            
+        case 6: // movem to fp registers
+        case 7: // movem to memory
+            // fmovem() cannot throw an FPU exception (don't modify fpiar)
+            inst_fmovem(ext);
+            return;
+    }
+    
+    assert(0); // never get here: every 3-bit value of c returns from the switch
+    return;
+}
 
-// Setup the jump table for fpu instructions
-// XXX: come up with a better, unified system for decoding instructions
-void fpu_setup_jump_table()
+#pragma mark FPU-state initialization and reset
+
+void fpu_initialize()
 {
-    uint32_t i;
-    
-    
-    fpu_inst_table[fpu_inst_fnop].emu = inst_fnop;
-    fpu_inst_table[fpu_inst_fnop].dis = dis_fnop;
-    
-    fpu_inst_table[fpu_inst_fbcc].emu = inst_fbcc;
-    fpu_inst_table[fpu_inst_fbcc].dis = dis_fbcc;
-    
-    fpu_inst_table[fpu_inst_fmovecr].emu = inst_fmovecr;
-    fpu_inst_table[fpu_inst_fmovecr].dis = dis_fmovecr;
-    
-    fpu_inst_table[fpu_inst_fmove].emu = inst_fmove;
-    fpu_inst_table[fpu_inst_fmove].dis = dis_fmove;
-    
-    fpu_inst_table[fpu_inst_fmovem].emu = inst_fmovem;
-    fpu_inst_table[fpu_inst_fmovem].dis = dis_fmovem;
-    
-    fpu_inst_table[fpu_inst_fmovem_control].emu = inst_fmovem_control;
-    fpu_inst_table[fpu_inst_fmovem_control].dis = dis_fmovem_control;
-    
-    fpu_inst_table[fpu_inst_frestore].emu = inst_frestore;
-    fpu_inst_table[fpu_inst_frestore].dis = dis_frestore;
-
-    fpu_inst_table[fpu_inst_fsave].emu = inst_fsave;
-    fpu_inst_table[fpu_inst_fsave].dis = dis_fsave;
-    
-    const fpu_inst_name_t _fmath[] = {
-        fpu_inst_fsincos,
-        fpu_inst_fint,
-        fpu_inst_fsinh,
-        fpu_inst_fintrz,
-        fpu_inst_flognp1,
-        fpu_inst_fetoxm1,
-        fpu_inst_ftanh,
-        fpu_inst_fatan,
-        fpu_inst_fatanh,
-        fpu_inst_fsin,
-        fpu_inst_ftan,
-        fpu_inst_fetox,
-        fpu_inst_ftwotox,
-        fpu_inst_ftentox,
-        fpu_inst_flogn,
-        fpu_inst_flog10,
-        fpu_inst_flog2,
-        fpu_inst_fcosh,
-        fpu_inst_facos,
-        fpu_inst_fcos,
-        fpu_inst_fgetexp,
-        fpu_inst_fgetman,
-        fpu_inst_fmod,
-        fpu_inst_fsgldiv,
-        fpu_inst_fsglmul,
-        fpu_inst_frem,
-        fpu_inst_fscale,
-        fpu_inst_fcmp,
-        fpu_inst_ftst,
-        fpu_inst_fabs,
-        fpu_inst_fadd,
-        fpu_inst_fdiv,
-        fpu_inst_fmul,
-        fpu_inst_fneg,
-        fpu_inst_fsqrt,
-        fpu_inst_fsub
-    };
-    
-    for (i=0; i < sizeof(_fmath) / sizeof(fpu_inst_name_t); i++) {
-        fpu_inst_table[_fmath[i]].emu = inst_fmath;
-        fpu_inst_table[_fmath[i]].dis = dis_fmath;
-    }
+    fpu_state_t *fpu = (fpu_state_t*)p_alloc(shoe.pool, sizeof(fpu_state_t));
+    memset(fpu, 0, sizeof(fpu_state_t)); // memset(dst, value, count): zero the whole state block
+    shoe.fpu_state = fpu;
 }
 
-
-
+void fpu_reset() // Discard the current FPU state block and build a new one
+{
+    p_free(shoe.fpu_state); // release the old state block
+    fpu_initialize(); // allocates a new block and stores it in shoe.fpu_state
+}
diff --git a/core/newfpu.c b/core/newfpu.c
deleted file mode 100644
index da7e072..0000000
--- a/core/newfpu.c
+++ /dev/null
@@ -1,3105 +0,0 @@
-/*
- * Copyright (c) 2013-2014, Peter Rutenbar <pruten@gmail.com>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "../core/shoebill.h"
-#include "../core/SoftFloat/softfloat.h"
-
-#pragma mark Structures and macros
-
-// Mode control byte
-#define mc_rnd  (fpu->fpcr.b._mc_rnd)
-#define mc_prec (fpu->fpcr.b._mc_prec)
-
-// Exception enable byte
-#define ee_inex1 (fpu->fpcr.b._ee_inex1)
-#define ee_inex2 (fpu->fpcr.b._ee_inex2)
-#define ee_dz    (fpu->fpcr.b._ee_dz)
-#define ee_unfl  (fpu->fpcr.b._ee_unfl)
-#define ee_ovfl  (fpu->fpcr.b._ee_ovfl)
-#define ee_operr (fpu->fpcr.b._ee_operr)
-#define ee_snan  (fpu->fpcr.b._ee_snan)
-#define ee_bsun  (fpu->fpcr.b._ee_bsun)
-
-// Accrued exception byte
-#define ae_inex (fpu->fpsr.b._ae_inex)
-#define ae_dz   (fpu->fpsr.b._ae_dz)
-#define ae_unfl (fpu->fpsr.b._ae_unfl)
-#define ae_ovfl (fpu->fpsr.b._ae_ovfl)
-#define ae_iop  (fpu->fpsr.b._ae_iop)
-
-// Exception status byte
-#define es_inex1 (fpu->fpsr.b._es_inex1)
-#define es_inex2 (fpu->fpsr.b._es_inex2)
-#define es_dz    (fpu->fpsr.b._es_dz)
-#define es_unfl  (fpu->fpsr.b._es_unfl)
-#define es_ovfl  (fpu->fpsr.b._es_ovfl)
-#define es_operr (fpu->fpsr.b._es_operr)
-#define es_snan  (fpu->fpsr.b._es_snan)
-#define es_bsun  (fpu->fpsr.b._es_bsun)
-
-// Quotient byte
-#define qu_quotient (fpu->fpsr.b._qu_quotient)
-#define qu_s        (fpu->fpsr.b._qu_s) /* quotient sign */
-
-// Condition codes
-#define cc_nan (fpu->fpsr.b._cc_nan)
-#define cc_i (fpu->fpsr.b._cc_i)
-#define cc_z (fpu->fpsr.b._cc_z)
-#define cc_n (fpu->fpsr.b._cc_n)
-
-
-typedef struct {
-    uint32_t fpiar; // FPU iaddr
-    
-    union { // fpcr, fpu control register
-        struct {
-            // Mode control byte
-            uint16_t _mc_zero : 4; // zero/dummy
-            uint16_t _mc_rnd  : 2; // rounding mode
-            uint16_t _mc_prec : 2; // rounding precision
-            // Exception enable byte
-            uint16_t _ee_inex1 : 1; // inexact decimal input
-            uint16_t _ee_inex2 : 1; // inxact operation
-            uint16_t _ee_dz    : 1; // divide by zero
-            uint16_t _ee_unfl  : 1; // underflow
-            uint16_t _ee_ovfl  : 1; // overflow
-            uint16_t _ee_operr : 1; // operand error
-            uint16_t _ee_snan  : 1; // signalling not a number
-            uint16_t _ee_bsun  : 1; // branch/set on unordered
-        } b;
-        
-        uint16_t raw;
-    } fpcr;
-    
-    union { // fpsr, fpu status register
-        struct {
-            // Accrued exception byte
-            uint32_t _dummy1  : 3; // dummy/zero
-            uint32_t _ae_inex : 1; // inexact
-            uint32_t _ae_dz   : 1; // divide by zero
-            uint32_t _ae_unfl : 1; // underflow
-            uint32_t _ae_ovfl : 1; // overflow
-            uint32_t _ae_iop  : 1; // invalid operation
-            // Exception status byte
-            uint32_t _es_inex1 : 1; // inexact decimal input
-            uint32_t _es_inex2 : 1; // inxact operation
-            uint32_t _es_dz    : 1; // divide by zero
-            uint32_t _es_unfl  : 1; // underflow
-            uint32_t _es_ovfl  : 1; // overflow
-            uint32_t _es_operr : 1; // operand error
-            uint32_t _es_snan  : 1; // signalling not a number
-            uint32_t _es_bsun  : 1; // branch/set on unordered
-            // Quotient byte
-            uint32_t _qu_quotient : 7;
-            uint32_t _qu_s        : 1;
-            // Condition code byte
-            uint32_t _cc_nan  : 1; // not a number
-            uint32_t _cc_i    : 1; // infinity
-            uint32_t _cc_z    : 1; // zero
-            uint32_t _cc_n    : 1; // negative
-            uint32_t _dummy2  : 4; // dummy/zero
-        } b;
-        uint32_t raw;
-    } fpsr;
-    
-    floatx80 fp[8]; // 80 bit floating point general registers
-    
-    // State for the static fmath instruction implementations
-    float128 source, dest, result;
-    _Bool write_back;
-    uint8_t fmath_op;
-} fpu_state_t;
-
-enum rounding_precision_t {
-    prec_extended = 0,
-    prec_single = 1,
-    prec_double = 2,
-};
-
-enum rounding_mode_t {
-    mode_nearest = 0,
-    mode_zero = 1,
-    mode_neg = 2,
-    mode_pos = 3
-};
-
-/*
- * 0 L     long word integer
- * 1 S     single precision real
- * 2 X     extended precision real
- * 3 P{#k} packed decimal real with static k factor
- * 4 W     word integer
- * 5 D     double precision real
- * 6 B     byte integer
- * 7 P{Dn} packed decimal real with dynamic k factor
- */
-static const uint8_t _format_sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
-enum {
-    format_L = 0,
-    format_S = 1,
-    format_X = 2,
-    format_Ps = 3,
-    format_W = 4,
-    format_D = 5,
-    format_B = 6,
-    format_Pd = 7
-} fpu_formats;
-
-#define fpu_get_state_ptr() fpu_state_t *fpu = (fpu_state_t*)shoe.fpu_state
-#define nextword() ({const uint16_t w=lget(shoe.pc,2); if (shoe.abort) {return;}; shoe.pc+=2; w;})
-#define nextlong() ({const uint32_t L=lget(shoe.pc,4); if (shoe.abort) {return;}; shoe.pc+=4; L;})
-#define verify_supervisor() {if (!sr_s()) {throw_privilege_violation(); return;}}
-
-#pragma mark FPU exception stuff
-enum fpu_vector_t {
-    fpu_vector_ftrapcc = 7,
-    fpu_vector_fline = 11,
-    fpu_vector_coprocessor_protocol_violation = 13, // won't be using this one
-    fpu_vector_bsun = 48,
-    fpu_vector_inexact = 49,
-    fpu_vector_divide_by_zero = 50,
-    fpu_vector_underflow = 51,
-    fpu_vector_operr = 52,
-    fpu_vector_overflow = 53,
-    fpu_vector_snan = 54
-};
-
-/*
- * Map the exception bit positions (in fpsr and fpcr)
- * to their corresponding exception vector numbers.
- */
-const uint8_t _exception_bit_to_vector[8] = {
-    48, // bsun
-    54, // snan
-    52, // operr
-    53, // ovfl
-    51, // unfl
-    50, // dz
-    49, // inex2
-    49, // inex1
-};
-
-static void throw_fpu_pre_instruction_exception(enum fpu_vector_t vector)
-{
-    throw_frame_zero(shoe.orig_sr, shoe.orig_pc, vector);
-}
-/*
- * Note: I may be able to get away without implementing the
- *       mid-instruction exception.
- */
-
-/*
- * _bsun_test() is called by every inst_f*cc instruction
- * to test whether the bsun exception is enabled, throw an
- * exception if so, and otherwise just set the appropriate
- * bit in fpsr, and update the accrued exception byte.
- */
-static _Bool _bsun_test()
-{
-    fpu_get_state_ptr();
-    
-    // BSUN counts against the IOP accrued exception bit
-    ae_iop = 1;
-    
-    // Set the BSUN exception status bit
-    es_bsun = 1;
-    
-    // If the BSUN exception isn't enabled, then we can just return
-    if (!ee_bsun)
-        return 0; // 0 -> elected not to throw an exception
-    
-    throw_fpu_pre_instruction_exception(fpu_vector_bsun);
-    return 1;
-}
-
-static void _throw_illegal_instruction()
-{
-    assert(!"throw_illegal_instruction!");
-}
-
-#pragma mark Float format translators (to/from big-endian motorola format)
-
-static void _floatx80_to_int8(floatx80 *f, uint8_t *ptr)
-{
-    uint32_t tmp = floatx80_to_int32(*f);
-    ptr[0] = tmp & 0xff;
-}
-
-static void _floatx80_to_int16(floatx80 *f, uint8_t *ptr)
-{
-    uint32_t tmp = floatx80_to_int32(*f);
-    ptr[0] = (tmp >> 8) & 0xff;
-    ptr[1] = (tmp >> 0) & 0xff;
-}
-
-static void _floatx80_to_int32(floatx80 *f, uint8_t *ptr)
-{
-    uint32_t tmp = floatx80_to_int32(*f);
-    ptr[0] = (tmp >> 24) & 0xff;
-    ptr[1] = (tmp >> 16) & 0xff;
-    ptr[2] = (tmp >> 8) & 0xff;
-    ptr[3] = (tmp >> 0) & 0xff;
-}
-
-static void _floatx80_to_single(floatx80 *f, uint8_t *ptr)
-{
-    const float32 tmp = floatx80_to_float32(*f);
-    ptr[0] = (tmp >> 24) & 0xff;
-    ptr[1] = (tmp >> 16) & 0xff;
-    ptr[2] = (tmp >> 8) & 0xff;
-    ptr[3] = (tmp >> 0) & 0xff;
-}
-
-static void _floatx80_to_double(floatx80 *f, uint8_t *ptr)
-{
-    const float64 tmp = floatx80_to_float64(*f);
-    ptr[0] = (tmp >> 56) & 0xff;
-    ptr[1] = (tmp >> 48) & 0xff;
-    ptr[2] = (tmp >> 40) & 0xff;
-    ptr[3] = (tmp >> 32) & 0xff;
-    ptr[4] = (tmp >> 24) & 0xff;
-    ptr[5] = (tmp >> 16) & 0xff;
-    ptr[6] = (tmp >> 8) & 0xff;
-    ptr[7] = (tmp >> 0) & 0xff;
-}
-
-static void _floatx80_to_extended(floatx80 *f, uint8_t *ptr)
-{
-    ptr[0] = (f->high >> 8) & 0xff;
-    ptr[1] = (f->high >> 0) & 0xff;
-    ptr[2] = 0;
-    ptr[3] = 0;
-    ptr[4] = (f->low >> 56) & 0xff;
-    ptr[5] = (f->low >> 48) & 0xff;
-    ptr[6] = (f->low >> 40) & 0xff;
-    ptr[7] = (f->low >> 32) & 0xff;
-    ptr[8] = (f->low >> 24) & 0xff;
-    ptr[9] = (f->low >> 16) & 0xff;
-    ptr[10] = (f->low >> 8) & 0xff;
-    ptr[11] = (f->low >> 0) & 0xff;
-}
-
-static float128 _int8_to_intermediate(int8_t byte)
-{
-    return int32_to_float128((int32_t)byte);
-}
-
-static float128 _int16_to_intermediate(int16_t sh)
-{
-    return int32_to_float128((int32_t)sh);
-}
-
-static float128 _int32_to_intermediate(int32_t in)
-{
-    return int32_to_float128(in);
-}
-
-static float128 _single_to_intermediate(uint32_t f)
-{
-    assert(sizeof(uint32_t) == sizeof(float32));
-    return float32_to_float128((float32)f);
-}
-
-/*
- * _double_to_intermediate(d): d needs to be 68k-native order (8 bytes)
- */
-static float128 _double_to_intermediate(uint8_t *d)
-{
-    assert(sizeof(uint64_t) == sizeof(float64));
-    
-    return float64_to_float128((float64) ntohll(*(uint64_t*)d));
-}
-
-/*
- * _extended_to_intermediate(e): e needs to be 68k-native order (12 bytes)
- */
-static float128 _extended_to_intermediate(uint8_t *e)
-{
-    /*
-     * softfloat floatx80 format:
-     * uint64_t low; // the low part of the extended float (significand, low exponent bits)
-     * uint16_t high; // the high part, sign, high exponent bits
-     */
-    floatx80 x80 = {
-        .high = (e[0] << 8) | e[1],
-        .low = ntohll(*(uint64_t*)&e[4])
-    };
-    return floatx80_to_float128(x80);
-}
-
-static void _extended_to_floatx80(uint8_t *bytes, floatx80 *f)
-{
-    f->high = (bytes[0] << 8) | bytes[1];
-    f->low = ntohll(*(uint64_t*)&bytes[4]);
-}
-
-/*
- * Set softfloat's rounding mode
- * (fpcr.mc_rnd and softfloat use different values for these modes)
- */
-static void _set_rounding_mode(enum rounding_mode_t mode)
-{
-    const int8_t rounding_map[4] = {
-        float_round_nearest_even, float_round_to_zero,
-        float_round_up, float_round_down
-    };
-    
-    float_rounding_mode = rounding_map[mode];
-}
-
-#pragma mark EA routines
-
-/*
- * Read-commit merely updates the address register
- * for pre/post-inc/decrement
- */
-static void _fpu_read_ea_commit(const uint8_t format)
-{
-    ~decompose(shoe.op, 0000 0000 00 mmmrrr);
-    
-    if (m == 3) // post-increment
-        shoe.a[r] += _format_sizes[format];
-    else if (m == 4) // pre-decrement
-        shoe.a[r] -= _format_sizes[format];
-    
-    /* 
-     * Note: still unsure about what happens when
-     *       mode=pre/postincdecrement, size==1, and register==a7
-     *       (is the change +-2 bytes? or 1?)
-     */
-    if (((m == 3) || (m == 4)) && (_format_sizes[format] == 1) && (r == 7))
-        assert(!"size==1, reg==a7");
-}
-
-static void _fpu_write_ea(uint8_t mr, uint8_t format, floatx80 *f, uint8_t K)
-{
-    fpu_get_state_ptr();
-    
-    const uint8_t m = mr >> 3;
-    const uint8_t r = mr & 7;
-    const uint8_t size = _format_sizes[format];
-    uint8_t buf[12], *ptr = &buf[0];
-    uint32_t addr, i;
-    
-    if ((m == 1) ||
-        ((m == 0) && (size > 4))) {
-        /* If mode==a-reg, or mode==data reg and the size is > 4 bytes, no dice */
-        _throw_illegal_instruction();
-        return ;
-    }
-    else if ((m == 7) && (r > 1)) {
-        /* If this is otherwise an illegal addr mode... */
-        _throw_illegal_instruction();
-        return ;
-    }
-    
-    const _Bool is_nan = ((f->high << 1) == 0xfffe) && f->low;
-    
-    slog("inst_f fpu_write_ea EA=%u/%u data=%Lf format=%u\n", m, r, 666.0L, format);
-    
-    /* Initialize softfloat's exceptions bits/rounding mode */
-    
-    float_exception_flags = 0;
-    _set_rounding_mode(mc_rnd);
-    
-    /* Convert to the appropriate format */
-    
-    switch (format) {
-        case format_B: {
-            _floatx80_to_int8(f, ptr);
-            break;
-        }
-        case format_W: {
-            _floatx80_to_int16(f, ptr);
-            break;
-        }
-        case format_L: {
-            _floatx80_to_int32(f, ptr);
-            break;
-        }
-        case format_S: {
-            _floatx80_to_single(f, ptr);
-            break;
-        }
-        case format_D: {
-            _floatx80_to_double(f, ptr);
-            break;
-        }
-        case format_X: {
-            _floatx80_to_extended(f, ptr);
-            break;
-        }
-        default: {
-            assert(!"unsupported format (packed something!)");
-        }
-    }
-    
-    /* Write to memory */
-    
-    switch (m) {
-        case 0: {
-            if (format == format_B)
-                set_d(r, ptr[0], 1);
-            else if (format == format_W)
-                set_d(r, ntohs(*(uint16_t*)ptr), 2);
-            else if ((format == format_L) || (format == format_S))
-                set_d(r, ntohl(*(uint32_t*)ptr), 4);
-            else
-                assert(!"how did I get here?");
-            goto done;
-        }
-        case 1:
-            assert(!"how did I get here again!");
-            
-        case 2:
-            addr = shoe.a[r];
-            break;
-        case 3:
-            addr = shoe.a[r];
-            assert(!( r==7 && size==1));
-            break;
-        case 4: // pre-decrement
-            addr = shoe.a[r] - size;
-            assert(!( r==7 && size==1));
-            break;
-        default:
-            call_ea_addr(mr);
-            addr = (uint32_t)shoe.dat;
-            break;
-    }
-    
-    /* Copy the formatted data into *addr */
-    
-    slog("inst_f  fpu_write_ea: addr=0x08x data=0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
-         addr,
-         ptr[0], ptr[1], ptr[2], ptr[3],
-         ptr[4], ptr[5], ptr[6], ptr[7],
-         ptr[8], ptr[9], ptr[10], ptr[11]);
-    
-    for (i=0; i<size; i++) {
-        lset(addr + i, 1, buf[i]);
-        if (shoe.abort) return ;
-    }
-    
-    
-done:
-    /* 
-     * Set exception bits and update pre/post/blah registers.
-     * note: condition codes are not modified
-     */
-    
-    es_bsun = 0;
-    es_snan = 0;
-    es_operr = 0;
-    es_ovfl = 0;
-    es_unfl = 0;
-    es_dz = 0;
-    es_inex2 = 0;
-    es_inex1 = 0;
-    
-    switch (format) {
-        format_B:
-        format_W:
-        format_L:
-            /* Set snan, operr, and/or inex2 */
-            es_snan = is_nan;
-            es_operr = ((float_exception_flags & float_flag_invalid) != 0);
-            es_inex2 = ((float_exception_flags & float_flag_inexact) != 0);
-            break;
-        
-        format_S:
-        format_D:
-        format_X:
-            /* Set snan, ovfl, unfl, and/or inex2 */
-            es_snan = is_nan;
-            es_ovfl = ((float_exception_flags & float_flag_overflow) != 0);
-            es_unfl = ((float_exception_flags & float_flag_underflow) != 0);
-            es_inex2 = ((float_exception_flags & float_flag_inexact) != 0);
-            break;
-            
-        format_Pd:
-        format_Ps:
-            /* Set snan, operr, and/or inex2 */
-            assert(!"you better implement packed formats");
-            break;
-    }
-    
-    /* Update the accrued exception bits */
-    ae_iop |= es_bsun | es_snan | es_operr;
-    ae_ovfl |= es_ovfl;
-    ae_unfl |= (es_unfl & es_inex2); // yes, &
-    ae_dz |= es_dz;
-    ae_inex |= es_inex1 | es_inex2 | es_ovfl;
-    
-    /* Are any exceptions both set and enabled? */
-    if (fpu->fpsr.raw & fpu->fpcr.raw & 0x0000ff00) {
-        /*
-         * Then we need to throw an exception.
-         * The exception is sent to the vector for
-         * the highest priority exception, and the priority
-         * order is (high->low) bsan, snan, operr, ovfl, unfl, dz, inex2/1
-         * (which is the order of the bits in fpsr/fpcr).
-         * Iterate over the bits in order, and throw the
-         * exception to whichever bit is set first.
-         */
-        uint8_t i, throwable = (fpu->fpsr.raw & fpu->fpcr.raw) >> 8;
-        
-        assert(throwable);
-        for (i=0; 1; i++) {
-            if (throwable & 0x80)
-                break;
-            throwable <<= 1;
-        }
-        
-        /*
-         * Convert the exception bit position
-         * to the correct vector number, and throw
-         * a (pre-instruction) exception.
-         */
-        throw_fpu_pre_instruction_exception(_exception_bit_to_vector[i]);
-        
-        return ;
-    }
-    
-    /* Finalize registers, and we're done */
-    
-    if (m == 3)
-        shoe.a[r] += size;
-    else if (m == 4)
-        shoe.a[r] -= size;
-}
-
-/*
- * Decode the source effective address held in the low six bits of
- * shoe.op, load the operand found there, and convert it from `format`
- * into the float128 intermediate format, storing it in *result.
- *
- * Note: fpu_read_ea modifies shoe.pc, and fpu_read_ea_commit
- *        modifies shoe.a[x] for pre/post-inc/decrement
- *        (this function only computes the address for modes 3 and 4;
- *        it never writes shoe.a[x] itself).
- * Returns false if we're aborting
- */
-static _Bool _fpu_read_ea(const uint8_t format, float128 *result)
-{
-    fpu_get_state_ptr();
-    
-    ~decompose(shoe.op, 0000 0000 00 mmmrrr);
-    
-    const uint8_t size = _format_sizes[format];
-    uint32_t addr = 0;
-    
-    /*
-     * Step 1: find the effective address, store it in addr
-     *         (or the actual data, if unavailable)
-     */
-    
-    slog("FPU: read_ea: mr=%u%u f=%c ", m, r, "lsxpwdb?"[format]);
-    
-    switch (m) {
-        case 0:
-            /* Data register direct: the operand is the register itself */
-            if (format == format_S)
-                *result = _single_to_intermediate(shoe.d[r]);
-            else if (format == format_B)
-                *result = _int8_to_intermediate(shoe.d[r] & 0xff);
-            else if (format == format_W)
-                *result = _int16_to_intermediate(shoe.d[r] & 0xffff);
-            else if (format == format_L)
-                *result = int32_to_float128(shoe.d[r]);
-            else {
-                /*
-                 * No other format can be used with a data register
-                 * (because they require >4 bytes)
-                 */
-                _throw_illegal_instruction();
-                return 0;
-            }
-            slog("raw=0x%x", chop(shoe.d[r], size));
-            goto got_data;
-            
-        case 1:
-            /* Address registers can't be used */
-            _throw_illegal_instruction();
-            return 0;
-        
-        case 3:
-            /* Post-increment: read at a[r]; the increment happens at commit */
-            addr = shoe.a[r];
-            assert(!( r==7 && size==1));
-            goto got_address;
-            
-        case 4:
-            /* Pre-decrement: read at a[r]-size; the decrement happens at commit */
-            addr = shoe.a[r] - size;
-            assert(!( r==7 && size==1));
-            goto got_address;
-            
-        case 7:
-            if (r == 4) {
-                /* The operand is read from the instruction stream at shoe.pc */
-                addr = shoe.pc;
-                shoe.pc += size;
-                goto got_address;
-            }
-            
-            // fall through to default:
-            
-        default: {
-            /* Every other mode: let the generic EA decoder compute the address */
-            ~decompose(shoe.op, 0000 0000 00 MMMMMM);
-            shoe.mr = M;
-            ea_addr();
-            if (shoe.abort)
-                return 0;
-            
-            addr = (uint32_t)shoe.dat;
-            goto got_address;
-        }
-
-    }
-    
-got_address:
-    
-    /*
-     * Step 2: Load the data from the effective address
-     */
-    
-    slog("raw=0x");
-    if (size <= 4) {
-        const uint32_t raw = lget(addr, size);
-        if (shoe.abort)
-            return 0;
-        /* Trace through slog like the rest of this line, not raw printf */
-        slog("%x ", raw);
-        switch (format) {
-            case format_B:
-                *result = _int8_to_intermediate(raw & 0xff);
-                break;
-            case format_W:
-                *result = _int16_to_intermediate(raw & 0xffff);
-                break;
-            case format_L:
-                *result = _int32_to_intermediate(raw);
-                break;
-            case format_S:
-                *result = _single_to_intermediate(raw);
-                break;
-            default:
-                assert(0); /* never get here */
-        }
-    }
-    else { // if (size > 4) -> if format is double, extended, or packed
-        uint8_t buf[12];
-        uint32_t i;
-        
-        for (i = 0; i < size; i++) {
-            buf[i] = lget(addr + i, 1);
-            /* Bail out before logging a byte that was never really read */
-            if (shoe.abort)
-                return 0;
-            slog("%02x", buf[i]);
-        }
-        
-        switch (format) {
-            case format_D:
-                *result = _double_to_intermediate(buf);
-                break;
-            case format_X:
-                *result = _extended_to_intermediate(buf);
-                break;
-            case format_Ps:
-            // case format_Pd: // not possible as a src specifier
-                // FIXME: implement packed formats
-                assert(!"Somebody tried to use a packed format!\n");
-                // _throw_illegal_instruction();
-                // return 0;
-                // (intentionally falls through to the default assert)
-            default:
-                assert(0); // never get here
-        }
-    }
-    
-got_data:
-    /* Terminate the trace line started by the first slog() above */
-    slog("\n");
-    return 1;
-}
-
-#pragma mark Hacky low-precision transcendental implementations
-/*
- * Build a float128 from its fields.
- * s -> sign (any non-zero value means negative), e -> biased exponent
- * ma -> 48 high bits of the mantissa
- * mb -> 64 low bits of the mantissa
- *
- * Layout of .high: bit 63 = sign, bits 62..48 = exponent,
- * bits 47..0 = ma.  (The sign used to be placed at bit 16 and then
- * shifted left by 48, which pushed it out of the 64-bit word, so every
- * assembled value came out positive.)
- */
-#define _assemble_float128(s, e, ma, mb) ({ \
-    const uint64_t _ma = (ma), _mb = (mb); \
-    const uint64_t _e = (e), _s = (s); \
-    float128 f = { \
-        .high = (((uint64_t)(_s != 0)) << 63) | ((_e & 0x7fff) << 48) | _ma, \
-        .low = _mb \
-    }; \
-    f; \
-})
-
-#define HACKY_MATH_X86
-#ifdef HACKY_MATH_X86
-#define NATIVE double
-
-/*
- * Convert a float128 to the host's double by rounding it to a float64
- * bit pattern and storing that pattern into a double one byte at a
- * time, least-significant byte at the lowest address.
- */
-double _to_native(float128 f128)
-{
-    const float64 bits = float128_to_float64(f128);
-    double native;
-    uint8_t *bytes = (uint8_t*)&native;
-    uint32_t i;
-    
-    for (i = 0; i < 8; i++)
-        bytes[i] = (bits >> (8 * i)) & 0xff;
-    
-    return native;
-}
-
-/*
- * Inverse of _to_native: gather the bytes of a host double into a
- * float64 bit pattern (byte 7 becomes the most significant byte) and
- * widen it to float128.
- */
-float128 _from_native(double n)
-{
-    const uint8_t *bytes = (const uint8_t*)&n;
-    float64 bits = 0;
-    int i;
-    
-    for (i = 7; i >= 0; i--)
-        bits = (bits << 8) | bytes[i];
-    
-    return float64_to_float128(bits);
-}
-
-#include <math.h>
-/*
- * Host-libm backends for the hacky transcendental wrappers below.
- * Each macro takes and yields a NATIVE (double) value.
- */
-#define _native_cos(a) cos(a)
-#define _native_acos(a) acos(a)
-#define _native_cosh(a) cosh(a)
-#define _native_sin(a) sin(a)
-#define _native_asin(a) asin(a)
-#define _native_sinh(a) sinh(a)
-#define _native_tan(a) tan(a)
-#define _native_atan(a) atan(a)
-#define _native_tanh(a) tanh(a)
-#define _native_atanh(a) atanh(a)
-#define _native_pow(a, b) pow((a), (b))
-#define _native_exp(a) exp(a)
-/*
- * Use the dedicated C99 routines: exp(a) - 1.0 and log(a + 1.0) cancel
- * away most significant digits when a is close to zero, and
- * log(a) / log(2.0) adds an extra rounding step.
- */
-#define _native_expm1(a) expm1(a)
-#define _native_log10(a) log10(a)
-#define _native_log2(a) log2(a)
-#define _native_log(a) log(a)
-#define _native_log1p(a) log1p(a)
-
-const double _native_e = 2.71828182845904509;
-const double _native_10 = 10.0;
-const double _native_2 = 2.0;
-const double _native_1 = 1.0;
-
-#elif (defined(HACKY_MATH_PPC))
-#error "PowerPC hacky math isn't implemented yet"
-#else
-#error "You need to define HACKY_MATH_X86, or implement one for your arch"
-#endif
-
-/*
- * Low-precision transcendental helpers.  Every wrapper follows the same
- * recipe: narrow the float128 argument to the native type, apply the
- * matching _native_* routine, and widen the answer back to float128.
- */
-
-static float128 _hack_cos (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_cos(arg));
-}
-
-static float128 _hack_acos (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_acos(arg));
-}
-
-static float128 _hack_cosh (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_cosh(arg));
-}
-
-static float128 _hack_sin (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_sin(arg));
-}
-
-static float128 _hack_asin (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_asin(arg));
-}
-
-static float128 _hack_sinh (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_sinh(arg));
-}
-
-static float128 _hack_tan (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_tan(arg));
-}
-
-static float128 _hack_atan (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_atan(arg));
-}
-
-static float128 _hack_tanh (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_tanh(arg));
-}
-
-static float128 _hack_atanh (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_atanh(arg));
-}
-
-/* e^x */
-static float128 _hack_etox (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_exp(arg));
-}
-
-/* e^x - 1 */
-static float128 _hack_etoxm1 (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_expm1(arg));
-}
-
-static float128 _hack_log10 (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_log10(arg));
-}
-
-static float128 _hack_log2 (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_log2(arg));
-}
-
-/* Natural log */
-static float128 _hack_logn (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_log(arg));
-}
-
-/* Natural log of (x + 1) */
-static float128 _hack_lognp1 (float128 x) {
-    const NATIVE arg = _to_native(x);
-    return _from_native(_native_log1p(arg));
-}
-
-/* 10^x */
-static float128 _hack_tentox (float128 x) {
-    const NATIVE exponent = _to_native(x);
-    return _from_native(_native_pow(_native_10, exponent));
-}
-
-/* 2^x */
-static float128 _hack_twotox (float128 x) {
-    const NATIVE exponent = _to_native(x);
-    return _from_native(_native_pow(_native_2, exponent));
-}
-
-#pragma mark FMATH! and all its helpers
-
-/* Function prototypes from SoftFloat/softfloat-specialize.h */
-char float128_is_nan(float128 a);
-char float128_is_signaling_nan (float128 a);
-
-
-/*
- * FMOVECR: load the constant selected by fpu->fmath_op (the ROM offset)
- * into fpu->result.
- */
-static void inst_fmath_fmovecr (void)
-{
-    fpu_get_state_ptr();
-
-    /*
-     * FYI: these constants are stored in the "intermediate" 85-bit
-     *      format in the 6888x rom. This has the side effect that
-     *      they are rounded according to fpcr.mc_rnd.
-     *      We emulate the intermediate 85-bit format with float128.
-     */
-    
-    /*
-     * Every constant is positive, so only the biased exponent and the
-     * two mantissa halves need to be looked up.  All three default to
-     * zero, which is also the value returned for offset 0x0f and for
-     * every offset not listed below.
-     */
-    uint64_t biased_exp = 0, frac_hi = 0, frac_lo = 0;
-    
-    switch (fpu->fmath_op) {
-        case 0x00: // pi
-            biased_exp = 0x4000; frac_hi = 0x921fb54442d1; frac_lo = 0x8469898cc51701b8;
-            break;
-        case 0x0b: // log_10(2)
-            biased_exp = 0x3ffd; frac_hi = 0x34413509f79f; frac_lo = 0xef311f12b35816f9;
-            break;
-        case 0x0c: // e
-            biased_exp = 0x4000; frac_hi = 0x5bf0a8b14576; frac_lo = 0x95355fb8ac404e7a;
-            break;
-        case 0x0d: // log_2(e)
-            biased_exp = 0x3fff; frac_hi = 0x71547652b82f; frac_lo = 0xe1777d0ffda0d23a;
-            break;
-        case 0x0e: // log_10(e)
-            // NOTE: 68881 doesn't set inex2 for this one, even though its
-            // three trailing rounding bits would be non-zero for the true
-            // value - so the ROM must hold it inexactly.  Emulated by
-            // truncating the float128 mantissa
-            // (the untruncated low word is 0xe32a6ab7555f5a67).
-            biased_exp = 0x3ffd; frac_hi = 0xbcb7b1526e50; frac_lo = 0xe32a000000000000;
-            break;
-        case 0x0f: // 0.0
-            break;
-        case 0x30: // ln(2)
-            biased_exp = 0x3ffe; frac_hi = 0x62e42fefa39e; frac_lo = 0xf35793c7673007e5;
-            break;
-        case 0x31: // ln(10)
-            biased_exp = 0x4000; frac_hi = 0x26bb1bbb5551; frac_lo = 0x582dd4adac5705a6;
-            break;
-        case 0x32: // 1 (68kprm has typesetting issues everywhere. This one says 100, but means 10^0.)
-            biased_exp = 0x3fff;
-            break;
-        case 0x33: // 10
-            biased_exp = 0x4002; frac_hi = 0x400000000000;
-            break;
-        case 0x34: // 10^2
-            biased_exp = 0x4005; frac_hi = 0x900000000000;
-            break;
-        case 0x35: // 10^4
-            biased_exp = 0x400c; frac_hi = 0x388000000000;
-            break;
-        case 0x36: // 10^8
-            biased_exp = 0x4019; frac_hi = 0x7d7840000000;
-            break;
-        case 0x37: // 10^16
-            biased_exp = 0x4034; frac_hi = 0x1c37937e0800;
-            break;
-        case 0x38: // 10^32
-            biased_exp = 0x4069; frac_hi = 0x3b8b5b5056e1; frac_lo = 0x6b3be04000000000;
-            break;
-        case 0x39: // 10^64
-            biased_exp = 0x40d3; frac_hi = 0x84f03e93ff9f; frac_lo = 0x4daa797ed6e38ed6;
-            break;
-        case 0x3a: // 10^128
-            biased_exp = 0x41a8; frac_hi = 0x27748f9301d3; frac_lo = 0x19bf8cde66d86d62;
-            break;
-        case 0x3b: // 10^256
-            biased_exp = 0x4351; frac_hi = 0x54fdd7f73bf3; frac_lo = 0xbd1bbb77203731fd;
-            break;
-        case 0x3c: // 10^512
-            biased_exp = 0x46a3; frac_hi = 0xc633415d4c1d; frac_lo = 0x238d98cab8a978a0;
-            break;
-        case 0x3d: // 10^1024
-            biased_exp = 0x4d48; frac_hi = 0x92eceb0d02ea; frac_lo = 0x182eca1a7a51e316;
-            break;
-        case 0x3e: // 10^2048
-            biased_exp = 0x5a92; frac_hi = 0x3d1676bb8a7a; frac_lo = 0xbbc94e9a519c6535;
-            break;
-        case 0x3f: // 10^4096
-            biased_exp = 0x7525; frac_hi = 0x88c0a4051441; frac_lo = 0x2f3592982a7f0094;
-            break;
-        default:
-            /*
-             * I wanted to include the actual values for the other ROM offsets,
-             * but they might be proprietary. Most of them are 0 anyways, and some
-             * cause FPU exceptions, even with all exceptions disabled... (?)
-             * 68040 FPSP just returns 0, so we'll do that too.
-             */
-            break;
-    }
-    
-    fpu->result = _assemble_float128(0, biased_exp, frac_hi, frac_lo);
-}
-
-/*
- * This is a quick macro.pl macro that builds the lookup tables for
- * fmath instructions, each indexed by the 7-bit fmath opcode:
- *
- *   _fmath_map[128]   - handler pointer (inst_fmath_<name>), or NULL
- *                       for opcodes that were not registered
- *   _fmath_flags[128] - bit 0 (0x01) set for dyadic instructions,
- *                       bit 1 (0x02) set for 68040-only encodings
- *   _fmath_names[128] - instruction name, or "f???" if unregistered
- *
- * It is probably slightly slower to use a jump table rather than a
- * big switch statement, but I think it looks cleaner.
- */
-~newmacro(create_fmath_jump_table, 0, {
-    my $name_map = {};
-    my $op_map = {};
-    my $add = sub {
-        my $op = shift;
-        my $name = lc(shift);
-        my $mode = 'foo';
-        my $arch = 68881;
-        
-        foreach my $arg (@_) {
-            if (($arg eq 'monadic') or ($arg eq 'dyadic')) {
-                $mode = $arg;
-            }
-            elsif ($arg == 68040) {
-                $arch = $arg;
-            }
-            else {
-                croak("bad arg $arg");
-            }
-        }
-        
-        croak("didn't specify mode") if ($mode eq "foo");
-        croak("dup $op $name") if exists $op_map->{$op};
-        croak("bogus") if ($op > 127);
-        
-        $op_map->{$op} = {op => $op, name => $name, mode => $mode, arch => $arch};
-        $name_map->{$name} = $op_map->{$op};
-    };
-    
-    $add->(~b(1000000), 'fmove',    'monadic', 68040);
-    $add->(~b(1000100), 'fmove',    'monadic', 68040);
-    $add->(~b(0000000), 'fmove',    'monadic');
-    
-    $add->(~b(0000001), 'fint',     'monadic');
-    $add->(~b(0000010), 'fsinh',    'monadic');
-    $add->(~b(0000011), 'fintrz',   'monadic');
-    
-    $add->(~b(1000001), 'fsqrt',    'monadic', 68040);
-    $add->(~b(1000101), 'fsqrt',    'monadic', 68040);
-    $add->(~b(0000100), 'fsqrt',    'monadic');
-    
-    $add->(~b(0000110), 'flognp1',  'monadic');
-    $add->(~b(0001000), 'fetoxm1',  'monadic');
-    $add->(~b(0001001), 'ftanh',    'monadic');
-    $add->(~b(0001010), 'fatan',    'monadic');
-    $add->(~b(0001100), 'fasin',    'monadic');
-    $add->(~b(0001101), 'fatanh',   'monadic');
-    $add->(~b(0001110), 'fsin',     'monadic');
-    $add->(~b(0001111), 'ftan',     'monadic');
-    $add->(~b(0010000), 'fetox',    'monadic');
-    $add->(~b(0010001), 'ftwotox',  'monadic');
-    $add->(~b(0010010), 'ftentox',  'monadic');
-    $add->(~b(0010100), 'flogn',    'monadic');
-    $add->(~b(0010101), 'flog10',   'monadic');
-    $add->(~b(0010110), 'flog2',    'monadic');
-    
-    $add->(~b(1011000), 'fabs',     'monadic', 68040);
-    $add->(~b(1011100), 'fabs',     'monadic', 68040);
-    $add->(~b(0011000), 'fabs',     'monadic');
-    
-    $add->(~b(0011001), 'fcosh',    'monadic');
-    
-    $add->(~b(1011010), 'fneg',     'monadic', 68040);
-    $add->(~b(1011110), 'fneg',     'monadic', 68040);
-    $add->(~b(0011010), 'fneg',     'monadic');
-    
-    $add->(~b(0011100), 'facos',    'monadic');
-    $add->(~b(0011101), 'fcos',     'monadic');
-    $add->(~b(0011110), 'fgetexp',  'monadic');
-    $add->(~b(0011111), 'fgetman',  'monadic');
-    
-    $add->(~b(1100000), 'fdiv',     'dyadic', 68040);
-    $add->(~b(1100100), 'fdiv',     'dyadic', 68040);
-    $add->(~b(0100000), 'fdiv',     'dyadic');
-    
-    $add->(~b(0100001), 'fmod',     'dyadic');
-    
-    $add->(~b(1100010), 'fadd',     'dyadic', 68040);
-    $add->(~b(1100110), 'fadd',     'dyadic', 68040);
-    $add->(~b(0100010), 'fadd',     'dyadic');
-    
-    $add->(~b(1100011), 'fmul',     'dyadic', 68040);
-    $add->(~b(1100111), 'fmul',     'dyadic', 68040);
-    $add->(~b(0100011), 'fmul',     'dyadic');
-    
-    $add->(~b(0100100), 'fsgldiv',  'dyadic');
-    $add->(~b(0100101), 'frem',     'dyadic');
-    $add->(~b(0100110), 'fscale',   'dyadic');
-    $add->(~b(0100111), 'fsglmul',  'dyadic');
-    
-    $add->(~b(1101000), 'fsub',     'dyadic', 68040);
-    $add->(~b(1101100), 'fsub',     'dyadic', 68040);
-    $add->(~b(0101000), 'fsub',     'dyadic');
-    
-    $add->(~b(0110000), 'fsincos',  'monadic');
-    $add->(~b(0110001), 'fsincos',  'monadic');
-    $add->(~b(0110010), 'fsincos',  'monadic');
-    $add->(~b(0110011), 'fsincos',  'monadic');
-    $add->(~b(0110100), 'fsincos',  'monadic');
-    $add->(~b(0110101), 'fsincos',  'monadic');
-    $add->(~b(0110110), 'fsincos',  'monadic');
-    $add->(~b(0110111), 'fsincos',  'monadic');
-    
-    
-    $add->(~b(0111000), 'fcmp',     'dyadic');
-    $add->(~b(0111010), 'ftst',     'monadic');
-    
-    my $map_str = "fmath_impl_t *_fmath_map[128] = {\n";
-    my @inst_flags = (0) x 128;
-    
-    for (my $i=0; $i < 128; $i++) {
-        my $func_ptr = "NULL";
-        if (exists $op_map->{$i}) {
-            $func_ptr = 'inst_fmath_' . $op_map->{$i}->{name};
-            if ($op_map->{$i}->{mode} eq 'dyadic') {
-                $inst_flags[$i] |= 1;
-            }
-            if ($op_map->{$i}->{arch} == 68040) {
-                $inst_flags[$i] |= 2;
-            }
-        }
-
-        $map_str .= "\t" . $func_ptr . ",\n";
-    }
-    $map_str .= "};\n\nuint8_t _fmath_flags[128] = {\n";
-    
-    for (my $i=0; $i < 128; $i++) {
-        $map_str .= "\t" . sprintf('0x%02x', $inst_flags[$i]) . ",\n";
-    }
-    $map_str .= "};\n";
-    
-    $map_str .= "const char *_fmath_names[128] = {\n";
-    for (my $i=0; $i < 128; $i++) {
-        my $name = "f???";
-        if (exists $op_map->{$i}) {
-            $name = $op_map->{$i}->{name};
-        }
-        $map_str .= "\t\"" . $name . "\",\n";
-    }
-    $map_str .= "};\n";
-    
-    return $map_str;
-})
-
-/* True for +0.0 and -0.0: everything except the sign bit is clear. */
-static _Bool _float128_is_zero (float128 f)
-{
-    const uint64_t high_without_sign = f.high & 0x7fffffffffffffffULL;
-    
-    return (high_without_sign == 0) && (f.low == 0);
-}
-
-/* True when the sign bit (bit 63 of the high word) is set. */
-static _Bool _float128_is_neg (float128 f)
-{
-    return (f.high >> 63) != 0;
-}
-
-/* True for +inf and -inf: exponent all ones, mantissa all zeros. */
-static _Bool _float128_is_infinity (float128 f)
-{
-    const uint64_t high_without_sign = f.high & 0x7fffffffffffffffULL;
-    
-    return (high_without_sign == 0x7fff000000000000ULL) && (f.low == 0);
-}
-
-/* True for any NaN: exponent all ones, mantissa non-zero. */
-static _Bool _float128_is_nan (float128 f)
-{
-    const _Bool exp_all_ones = ((f.high >> 48) & 0x7fff) == 0x7fff;
-    const _Bool mantissa_set = ((f.high & 0x0000ffffffffffffULL) != 0) || (f.low != 0);
-    
-    return exp_all_ones && mantissa_set;
-}
-
-/*
- * The NaN returned by the operr paths below: sign bit set, exponent
- * all ones, and only the top mantissa bit (bit 47 of .high) set.
- */
-const float128 _nan128 = {
-    .high = 0xFFFF800000000000ULL,
-    .low = 0
-};
-
-/* +1.0: biased exponent 0x3fff (= 16383) with a zero mantissa */
-const float128 _one128 = {
-    .high = 0x3fff000000000000ULL,
-    .low = 0
-};
-
-/* +0.0: all bits clear */
-const float128 _zero128 = {
-    .high = 0,
-    .low = 0
-};
-
-/* FABS: result = |source| */
-static void inst_fmath_fabs ()
-{
-    fpu_get_state_ptr();
-    
-    /* Copy the operand, then mask off bit 63 (the sign) */
-    float128 magnitude = fpu->source;
-    magnitude.high &= 0x7fffffffffffffffULL;
-    fpu->result = magnitude;
-}
-
-/*
- * FACOS: result = acos(source).
- *   source == +-0     -> +pi/2
- *   |source| > 1      -> NaN, operr set
- *   otherwise         -> low-precision host acos()
- */
-static void inst_fmath_facos ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    
-    /* Find the absolute value of source (clear the sign bit) */
-    float128 tmp = fpu->source;
-    tmp.high <<= 1;
-    tmp.high >>= 1;
-    
-    /* If source is zero, result is +pi/2 */
-    if (source_zero) {
-        fpu->result = _assemble_float128(0, 0x3fff, 0x921fb54442d1, 0x8469898cc51701b8);
-        return;
-    }
-    /* If source isn't in range [-1, 1], return nan, set operr */
-    else if (!float128_le(tmp, _one128)) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return ;
-    }
-    
-    fpu->result = _hack_acos(fpu->source);
-    /* Set inex2?? */
-}
-
-/*
- * FADD: result = dest + source, then mirror the SoftFloat exception
- * flags into the FPU exception status bits.
- */
-static void inst_fmath_fadd ()
-{
-    fpu_get_state_ptr();
-    
-    fpu->result = float128_add(fpu->dest, fpu->source);
-    
-    /* 
-     * Throw operr (and return NaN) if operands are infinities
-     * with opposite signs. (I *think* softfloat is doing this
-     * correctly - the code's hard to read.)
-     */
-    if (float_exception_flags & float_flag_invalid)
-        es_operr = 1;
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-    
-    /* Throw ovfl if the op overflowed */
-    if (float_exception_flags & float_flag_overflow)
-        es_ovfl = 1;
-    
-    /* Throw unfl if the op underflowed */
-    if (float_exception_flags & float_flag_underflow)
-        es_unfl = 1;
-}
-
-/*
- * FASIN: result = asin(source).
- *   source == +-0     -> source (sign preserved)
- *   |source| > 1      -> NaN, operr set
- *   otherwise         -> low-precision host asin()
- */
-static void inst_fmath_fasin ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    
-    /* Find the absolute value of source (clear the sign bit) */
-    float128 tmp = fpu->source;
-    tmp.high <<= 1;
-    tmp.high >>= 1;
-    
-    /* If source is zero, result is source */
-    if (source_zero) {
-        fpu->result = fpu->source;
-        return;
-    }
-    /* If source isn't in range [-1, 1], return nan, set operr */
-    else if (!float128_le(tmp, _one128)) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return ;
-    }
-    
-    fpu->result = _hack_asin(fpu->source);
-    /* Set inex2?? */
-    /* Set unfl?? */
-}
-
-/*
- * FATAN: result = atan(source).
- *   +-0   -> source unchanged
- *   +-inf -> pi/2 carrying the sign of source
- */
-static void inst_fmath_fatan ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool negative = _float128_is_neg(fpu->source);
-    const _Bool is_inf = _float128_is_infinity(fpu->source);
-    const _Bool is_zero = _float128_is_zero(fpu->source);
-    
-    /* A zero passes straight through */
-    if (is_zero) {
-        fpu->result = fpu->source;
-        return;
-    }
-    
-    /* An infinity maps onto +-pi/2 */
-    if (is_inf) {
-        fpu->result = _assemble_float128(negative, 0x3fff, 0x921fb54442d1, 0x8469898cc51701b8);
-        return;
-    }
-    
-    /* Everything else goes to the host's atan() */
-    fpu->result = _hack_atan(fpu->source);
-    /* Set inex2?? */
-    /* Set unfl?? */
-}
-
-/*
- * FATANH: result = atanh(source).
- *   source == +-0     -> source (sign preserved)
- *   |source| == 1     -> +-inf (sign of source), dz set
- *   |source| > 1      -> NaN, operr set
- *   otherwise         -> low-precision host atanh()
- */
-static void inst_fmath_fatanh ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_sign = _float128_is_neg(fpu->source);
-    
-    /* Take the absolute value of source (clear the sign bit) */
-    float128 tmp = fpu->source;
-    tmp.high <<= 1;
-    tmp.high >>= 1;
-    
-    /* If source is 0, return source */
-    if (source_zero) {
-        fpu->result = fpu->source;
-        return;
-    }
-    /* If |source| == 1.0, set dz, return +-inf */
-    else if (float128_eq(tmp, _one128)) {
-        es_dz = 1;
-        fpu->result = _assemble_float128(source_sign, 0x7fff, 0, 0);
-        return;
-    }
-    /* If |source| > 1.0, set operr, return nan */
-    else if (!float128_le(tmp, _one128)) {
-        es_operr = 1;
-        fpu->result = _nan128;
-        return ;
-    }
-    
-    fpu->result = _hack_atanh(fpu->source);
-}
-
-/*
- * FCMP: compute dest - source into fpu->result without storing it back
- * to the destination register.
- */
-static void inst_fmath_fcmp ()
-{
-    fpu_get_state_ptr();
-    
-    const float128 difference = float128_sub(fpu->dest, fpu->source);
-    
-    /* The difference is kept in result only; the register is untouched */
-    fpu->write_back = 0;
-    fpu->result = difference;
-    
-    /*
-     * The 68881 docs say fcmp doesn't throw any exceptions
-     * based on the result, but I'm not sure I believe it.
-     * If it turns out that it does, this is what to enable:
-     
-     if (float_exception_flags & float_flag_invalid)
-        es_operr = 1;
-     
-     if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-     */
-}
-
-/*
- * FCOS: result = cos(source).
- *   +-0   -> +1.0
- *   +-inf -> NaN, operr set
- */
-static void inst_fmath_fcos ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool is_inf = _float128_is_infinity(fpu->source);
-    const _Bool is_zero = _float128_is_zero(fpu->source);
-    
-    /* cos(0) is exactly one */
-    if (is_zero) {
-        fpu->result = _one128;
-        return;
-    }
-    
-    /* cos(inf) is undefined */
-    if (is_inf) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return;
-    }
-    
-    fpu->result = _hack_cos(fpu->source);
-    /* Set inex2?? */
-}
-
-/*
- * FCOSH: result = cosh(source).
- *   +-0   -> +1.0
- *   +-inf -> +inf
- */
-static void inst_fmath_fcosh ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool is_inf = _float128_is_infinity(fpu->source);
-    const _Bool is_zero = _float128_is_zero(fpu->source);
-    
-    /* cosh(0) is exactly one */
-    if (is_zero) {
-        fpu->result = _one128;
-        return;
-    }
-    
-    /* Either infinity gives positive infinity */
-    if (is_inf) {
-        fpu->result = _assemble_float128(0, 0x7fff, 0, 0);
-        return;
-    }
-    
-    fpu->result = _hack_cosh(fpu->source);
-}
-
-/*
- * FDIV: result = dest / source, then mirror the SoftFloat exception
- * flags into the FPU exception status bits.
- */
-static void inst_fmath_fdiv ()
-{
-    fpu_get_state_ptr();
-    
-    fpu->result = float128_div(fpu->dest, fpu->source);
-    
-    /*
-     * Throw operr (and return NaN) when softfloat reports an invalid
-     * operation (presumably 0/0 and inf/inf - confirm against softfloat)
-     */
-    if (float_exception_flags & float_flag_invalid)
-        es_operr = 1;
-    
-    /* Throw divide-by-zero if softfloat flagged a zero divisor (source) */
-    if (float_exception_flags & float_flag_divbyzero)
-        es_dz = 1;
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-    
-    /* Throw ovfl if the op overflowed */
-    if (float_exception_flags & float_flag_overflow)
-        es_ovfl = 1;
-    
-    /* Throw unfl if the op underflowed */
-    if (float_exception_flags & float_flag_underflow)
-        es_unfl = 1;
-}
-
-/*
- * FETOX: result = e^source.
- *   +-0  -> +1.0
- *   -inf -> +0.0
- *   +inf -> +inf
- */
-static void inst_fmath_fetox ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool negative = _float128_is_neg(fpu->source);
-    const _Bool is_inf = _float128_is_infinity(fpu->source);
-    const _Bool is_zero = _float128_is_zero(fpu->source);
-    
-    /* e^0 is exactly one */
-    if (is_zero) {
-        fpu->result = _one128;
-        return;
-    }
-    
-    /* Infinities: -inf collapses to +0.0, +inf stays +inf */
-    if (is_inf) {
-        fpu->result = negative ? _zero128 : fpu->source;
-        return;
-    }
-    
-    fpu->result = _hack_etox(fpu->source);
-}
-
-/*
- * FETOXM1: result = e^source - 1.
- *   +-0  -> source unchanged
- *   -inf -> the value built by _assemble_float128(1, 0x3fff, 0, 0),
- *           i.e. sign set on a 1.0 bit pattern
- *   +inf -> +inf
- */
-static void inst_fmath_fetoxm1 ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool negative = _float128_is_neg(fpu->source);
-    const _Bool is_inf = _float128_is_infinity(fpu->source);
-    const _Bool is_zero = _float128_is_zero(fpu->source);
-    
-    /* A zero passes straight through */
-    if (is_zero) {
-        fpu->result = fpu->source;
-        return;
-    }
-    
-    if (is_inf) {
-        if (negative) {
-            /* e^(-inf) - 1 */
-            const float128 negone = _assemble_float128(1, 0x3fff, 0, 0);
-            fpu->result = negone;
-        }
-        else {
-            /* +inf stays +inf */
-            fpu->result = fpu->source;
-        }
-        return;
-    }
-    
-    fpu->result = _hack_etoxm1(fpu->source);
-}
-
-/*
- * FGETEXP: result = the unbiased exponent of source, as a float.
- *   +-inf -> NaN, operr set
- *   +-0   -> source (sign preserved)
- */
-static void inst_fmath_fgetexp ()
-{
-    fpu_get_state_ptr();
-    
-    /*
-     * If source is INF, set operr and return NaN.  Use the shared
-     * _nan128 like every other operr path; the hand-built pattern used
-     * here before (zero high fraction, non-zero low word) has the top
-     * mantissa bit clear, unlike _nan128.
-     */
-    if (_float128_is_infinity(fpu->source)) {
-        es_operr = 1;
-        fpu->result = _nan128;
-        return ;
-    }
-    
-    /*
-     * If source is 0, return source.
-     * According to 68881 docs, the result needs to have 
-     * the same sign as the source (why?)
-     */
-    if (_float128_is_zero(fpu->source)) {
-        fpu->result = fpu->source;
-        return ;
-    }
-    
-    /*
-     * Otherwise, extract the biased exponent, convert it
-     * to a two's complement integer, and store that value
-     * as a float.
-     */
-    const uint32_t biased = (fpu->source.high >> 48) & 0x7fff;
-    fpu->result = int32_to_float128(((int32_t)biased) - 16383);
-}
-
-/* FGETMAN: not implemented yet - always trips the assert below. */
-static void inst_fmath_fgetman ()
-{
-    fpu_get_state_ptr();
-    
-    assert(!"fmath: fgetman not implemented");
-}
-
-/*
- * FINT: round source to an integral value with float128_round_to_int
- * (which, per the fintrz note below, follows float_rounding_mode).
- */
-static void inst_fmath_fint ()
-{
-    fpu_get_state_ptr();
-    
-    fpu->result = float128_round_to_int(fpu->source);
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-}
-
-/* FINTRZ: round source to an integral value, always toward zero. */
-static void inst_fmath_fintrz ()
-{
-    fpu_get_state_ptr();
-    
-    /* Same as fint, but force the round-to-zero mode */
-    
-    /* Save the global rounding mode so it can be restored afterwards */
-    const signed char old_round_mode = float_rounding_mode;
-    float_rounding_mode = float_round_to_zero;
-    fpu->result = float128_round_to_int(fpu->source);
-    float_rounding_mode = old_round_mode;
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-    
-}
-
-/*
- * FLOG10: result = log10(source).
- *   +-0          -> infinity built with the sign argument set, dz set
- *   negative     -> NaN, operr set
- *   +inf         -> +inf
- */
-static void inst_fmath_flog10 ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool negative = _float128_is_neg(fpu->source);
-    const _Bool is_inf = _float128_is_infinity(fpu->source);
-    const _Bool is_zero = _float128_is_zero(fpu->source);
-    
-    /* Zero is checked first, so -0.0 counts as zero, not as negative */
-    if (is_zero) {
-        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
-        es_dz = 1;
-        return;
-    }
-    
-    /* No logarithm of a negative number (this includes -inf) */
-    if (negative) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return;
-    }
-    
-    /* Only +inf can reach this point */
-    if (is_inf) {
-        fpu->result = fpu->source;
-        return;
-    }
-    
-    fpu->result = _hack_log10(fpu->source);
-}
-
-/*
- * FLOG2: result = log2(source).
- *   +-0          -> infinity built with the sign argument set, dz set
- *   negative     -> NaN, operr set
- *   +inf         -> +inf
- */
-static void inst_fmath_flog2 ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool negative = _float128_is_neg(fpu->source);
-    const _Bool is_inf = _float128_is_infinity(fpu->source);
-    const _Bool is_zero = _float128_is_zero(fpu->source);
-    
-    /* Zero is checked first, so -0.0 counts as zero, not as negative */
-    if (is_zero) {
-        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
-        es_dz = 1;
-        return;
-    }
-    
-    /* No logarithm of a negative number (this includes -inf) */
-    if (negative) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return;
-    }
-    
-    /* Only +inf can reach this point */
-    if (is_inf) {
-        fpu->result = fpu->source;
-        return;
-    }
-    
-    fpu->result = _hack_log2(fpu->source);
-}
-
-/*
- * FLOGNP1: result = ln(source + 1).
- *   source == +-0     -> source (sign preserved)
- *   source == negone  -> infinity built with the sign argument set, dz set
- *   source <  negone  -> NaN, operr set
- *   source == +inf    -> +inf
- *   otherwise         -> low-precision host computation
- */
-static void inst_fmath_flognp1 ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    
-    /* Comparison value: a 1.0 bit pattern assembled with the sign argument set */
-    const float128 negone = _assemble_float128(1, 0x3fff, 0, 0);
-    
-    /* If source is zero, result is source */
-    if (source_zero) {
-        fpu->result = fpu->source;
-        return;
-    }
-    /* If source is -1.0, set dz, result is -inf */
-    else if (float128_eq(negone, fpu->source)) {
-        es_dz = 1;
-        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
-        return;
-    }
-    /* If source < -1.0, set operr, result is nan */
-    else if (float128_lt(fpu->source, negone)) {
-        es_operr = 1;
-        fpu->result = _nan128;
-        return;
-    }
-    /* If source is +inf, result is +inf. (-inf was caught just above) */
-    else if (source_inf) {
-        fpu->result = fpu->source;
-        return;
-    }
-    
-    fpu->result = _hack_lognp1(fpu->source);
-}
-
-/*
- * FLOGN: result = ln(source).
- *   +-0          -> infinity built with the sign argument set, dz set
- *   negative     -> NaN, operr set
- *   +inf         -> +inf
- */
-static void inst_fmath_flogn ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool negative = _float128_is_neg(fpu->source);
-    const _Bool is_inf = _float128_is_infinity(fpu->source);
-    const _Bool is_zero = _float128_is_zero(fpu->source);
-    
-    /* Zero is checked first, so -0.0 counts as zero, not as negative */
-    if (is_zero) {
-        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
-        es_dz = 1;
-        return;
-    }
-    
-    /* No logarithm of a negative number (this includes -inf) */
-    if (negative) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return;
-    }
-    
-    /* Only +inf can reach this point */
-    if (is_inf) {
-        fpu->result = fpu->source;
-        return;
-    }
-    
-    fpu->result = _hack_logn(fpu->source);
-}
-
-/* FMOVE: the result is simply the source operand, copied unchanged. */
-static void inst_fmath_fmove ()
-{
-    fpu_get_state_ptr();
-    
-    fpu->result = fpu->source;
-}
-
-/*
- * FMUL: result = dest * source, then mirror the SoftFloat exception
- * flags into the FPU exception status bits.
- */
-static void inst_fmath_fmul ()
-{
-    fpu_get_state_ptr();
-    
-    fpu->result = float128_mul(fpu->dest, fpu->source);
-    
-    /*
-     * Throw operr (and return NaN) if one operand is infinity
-     * and the other is zero.
-     */
-    if (float_exception_flags & float_flag_invalid)
-        es_operr = 1;
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-    
-    /* Throw ovfl if the op overflowed */
-    if (float_exception_flags & float_flag_overflow)
-        es_ovfl = 1;
-    
-    /* Throw unfl if the op underflowed */
-    if (float_exception_flags & float_flag_underflow)
-        es_unfl = 1;
-}
-
-/* FNEG: result = -source */
-static void inst_fmath_fneg ()
-{
-    fpu_get_state_ptr();
-    
-    /* Copy the operand and invert bit 63 (the sign) */
-    float128 negated = fpu->source;
-    negated.high ^= 0x8000000000000000ULL;
-    fpu->result = negated;
-    
-    /* 
-     * FIXME: UNFL is probably supposed to be thrown when the
-     *        operand is a denormalized number.
-     */
-}
-
-/*
- * frem: IEEE-style remainder of fpu->dest / fpu->source.
- *
- * The quotient is rounded to the nearest integer N, and the result is
- * dest - (source * N). The low 7 bits of |N| and the sign of N are
- * stored in the FPSR quotient byte.
- */
-static void inst_fmath_frem ()
-{
-    fpu_get_state_ptr();
-
-    const _Bool src_is_zero = _float128_is_zero(fpu->source);
-    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
-    const _Bool dst_is_zero = _float128_is_zero(fpu->dest);
-    const _Bool dst_is_inf = _float128_is_infinity(fpu->dest);
-
-    /* The special cases below are assumed to report a zero quotient */
-    qu_s = 0;
-    qu_quotient = 0;
-
-    /* (x rem 0) and (inf rem y) are invalid: NaN, with operr raised */
-    if (src_is_zero || dst_is_inf) {
-        es_operr = 1;
-        fpu->result = _nan128;
-        return ;
-    }
-
-    /* (0 rem y) is that same zero, and (x rem inf) is x */
-    if (dst_is_zero || src_is_inf) {
-        fpu->result = fpu->dest;
-        return ;
-    }
-
-    /* -- Ordinary operands from here on -- */
-
-    /* frem rounds the quotient to nearest, whatever the current mode is */
-    const signed char saved_rounding = float_rounding_mode;
-    float_rounding_mode = float_round_nearest_even;
-
-    float128 quotient = float128_round_to_int(float128_div(fpu->dest, fpu->source));
-
-    float_rounding_mode = saved_rounding;
-
-    const float128 product = float128_mul(fpu->source, quotient);
-    fpu->result = float128_sub(fpu->dest, product);
-
-    /* FIXME: not sure how to set unfl reliably */
-
-    /* Split the quotient into sign and magnitude for the quotient byte */
-    const _Bool quotient_neg = quotient.high >> 63;
-    quotient.high &= ~(1ULL << 63);
-
-    const uint32_t magnitude = float128_to_int32(quotient);
-    qu_quotient = magnitude & 0x7f;
-    qu_s = quotient_neg;
-}
-
-/*
- * fmod: modulo remainder of fpu->dest / fpu->source.
- *
- * The quotient is truncated toward zero to an integer N, and the result
- * is dest - (source * N). The low 7 bits of |N| and the sign of N are
- * stored in the FPSR quotient byte.
- */
-static void inst_fmath_fmod ()
-{
-    fpu_get_state_ptr();
-
-    const _Bool src_is_zero = _float128_is_zero(fpu->source);
-    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
-    const _Bool dst_is_zero = _float128_is_zero(fpu->dest);
-    const _Bool dst_is_inf = _float128_is_infinity(fpu->dest);
-
-    /* The special cases below are assumed to report a zero quotient */
-    qu_s = 0;
-    qu_quotient = 0;
-
-    /* (x mod 0) and (inf mod y) are invalid: NaN, with operr raised */
-    if (src_is_zero || dst_is_inf) {
-        es_operr = 1;
-        fpu->result = _nan128;
-        return ;
-    }
-
-    /* (0 mod y) is that same zero, and (x mod inf) is x */
-    if (dst_is_zero || src_is_inf) {
-        fpu->result = fpu->dest;
-        return ;
-    }
-
-    /* -- Ordinary operands from here on -- */
-
-    /* fmod truncates the quotient, whatever the current mode is */
-    const signed char saved_rounding = float_rounding_mode;
-    float_rounding_mode = float_round_to_zero;
-
-    float128 quotient = float128_round_to_int(float128_div(fpu->dest, fpu->source));
-
-    float_rounding_mode = saved_rounding;
-
-    const float128 product = float128_mul(fpu->source, quotient);
-    fpu->result = float128_sub(fpu->dest, product);
-
-    /* FIXME: not sure how to set unfl reliably */
-
-    /* Split the quotient into sign and magnitude for the quotient byte */
-    const _Bool quotient_neg = quotient.high >> 63;
-    quotient.high &= ~(1ULL << 63);
-
-    const uint32_t magnitude = float128_to_int32(quotient);
-    qu_quotient = magnitude & 0x7f;
-    qu_s = quotient_neg;
-}
-
-/*
- * fscale: placeholder only. Reaching this op trips the assertion below.
- */
-static void inst_fmath_fscale ()
-{
-    fpu_get_state_ptr();
-
-    /* The string literal is non-NULL, so !"..." is always false */
-    assert(!"fmath: fscale not implemented");
-}
-
-/*
- * fsgldiv: single-precision divide. Both operands have their mantissas
- * cut down before fpu->result = dest / source is computed; SoftFloat's
- * exception flags are then mirrored into the FPSR exception bits.
- */
-static void inst_fmath_fsgldiv ()
-{
-    fpu_get_state_ptr();
-
-    /* Clearing .low and the bottom 24 bits of .high drops 88 mantissa bits */
-    const uint64_t kept_high_bits = 0xffffffffff000000ULL;
-
-    float128 divisor = fpu->source;
-    float128 dividend = fpu->dest;
-
-    divisor.high &= kept_high_bits;
-    divisor.low = 0;
-    dividend.high &= kept_high_bits;
-    dividend.low = 0;
-
-    fpu->result = float128_div(dividend, divisor);
-
-    /* operr: SoftFloat flagged the division invalid (result is NaN) */
-    es_operr |= ((float_exception_flags & float_flag_invalid) != 0);
-
-    /* dz: SoftFloat flagged a division by zero */
-    es_dz |= ((float_exception_flags & float_flag_divbyzero) != 0);
-
-    /* inex2: the quotient could not be represented exactly */
-    es_inex2 |= ((float_exception_flags & float_flag_inexact) != 0);
-
-    /* ovfl: the quotient overflowed */
-    es_ovfl |= ((float_exception_flags & float_flag_overflow) != 0);
-
-    /* unfl: the quotient underflowed */
-    es_unfl |= ((float_exception_flags & float_flag_underflow) != 0);
-}
-
-/*
- * fsglmul: single-precision multiply.
- *
- * As far as the original author could tell, fsglmul/fsgldiv use a
- * mantissa ALU that is only 24 bits wide, while everything else runs at
- * the regular internal precision. That is modelled here by cutting both
- * operand mantissas down before multiplying.
- */
-static void inst_fmath_fsglmul ()
-{
-    fpu_get_state_ptr();
-
-    /* Clearing .low and the bottom 24 bits of .high drops 88 mantissa bits */
-    const uint64_t kept_high_bits = 0xffffffffff000000ULL;
-
-    float128 multiplier = fpu->source;
-    float128 multiplicand = fpu->dest;
-
-    multiplier.high &= kept_high_bits;
-    multiplier.low = 0;
-    multiplicand.high &= kept_high_bits;
-    multiplicand.low = 0;
-
-    fpu->result = float128_mul(multiplicand, multiplier);
-
-    /* operr: SoftFloat flagged the operation invalid (inf * 0 -> NaN) */
-    es_operr |= ((float_exception_flags & float_flag_invalid) != 0);
-
-    /* inex2: the product could not be represented exactly */
-    es_inex2 |= ((float_exception_flags & float_flag_inexact) != 0);
-
-    /* ovfl: the product overflowed */
-    es_ovfl |= ((float_exception_flags & float_flag_overflow) != 0);
-
-    /* unfl: the product underflowed */
-    es_unfl |= ((float_exception_flags & float_flag_underflow) != 0);
-}
-
-/*
- * fsin: sine of fpu->source.
- *
- *   zero -> the same zero
- *   inf  -> NaN, operr is raised
- * Anything else is handed to _hack_sin().
- */
-static void inst_fmath_fsin ()
-{
-    fpu_get_state_ptr();
-
-    const _Bool src_is_zero = _float128_is_zero(fpu->source);
-    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
-
-    if (src_is_zero) {
-        /* sin(+/-0) is the same zero */
-        fpu->result = fpu->source;
-    }
-    else if (src_is_inf) {
-        /* sin(inf) is an operand error */
-        es_operr = 1;
-        fpu->result = _nan128;
-    }
-    else {
-        fpu->result = _hack_sin(fpu->source);
-        /* Set inex2?? */
-    }
-}
-
-/*
- * fsincos: placeholder only. Reaching this op trips the assertion below.
- */
-static void inst_fmath_fsincos ()
-{
-    fpu_get_state_ptr();
-
-    /* The string literal is non-NULL, so !"..." is always false */
-    assert(!"fmath: fsincos not implemented");
-}
-
-/*
- * fsinh: hyperbolic sine of fpu->source. Zeros and infinities pass
- * through unchanged; anything else is handed to _hack_sinh().
- */
-static void inst_fmath_fsinh ()
-{
-    fpu_get_state_ptr();
-
-    const _Bool src_is_zero = _float128_is_zero(fpu->source);
-    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
-    const _Bool passthrough = (src_is_zero || src_is_inf);
-
-    if (passthrough)
-        fpu->result = fpu->source;
-    else
-        fpu->result = _hack_sinh(fpu->source);
-}
-
-/*
- * fsqrt: fpu->result = sqrt(fpu->source), with SoftFloat's invalid and
- * inexact flags mirrored into operr and inex2.
- */
-static void inst_fmath_fsqrt ()
-{
-    fpu_get_state_ptr();
-
-    fpu->result = float128_sqrt(fpu->source);
-
-    /* operr: SoftFloat flagged the operand invalid (source < 0 -> NaN) */
-    es_operr |= ((float_exception_flags & float_flag_invalid) != 0);
-
-    /* inex2: the root could not be represented exactly */
-    es_inex2 |= ((float_exception_flags & float_flag_inexact) != 0);
-}
-
-/*
- * fsub: fpu->result = fpu->dest - fpu->source, then mirror the SoftFloat
- * exception flags raised by the subtraction into the FPSR exception bits.
- */
-static void inst_fmath_fsub ()
-{
-    fpu_get_state_ptr();
-
-    fpu->result = float128_sub(fpu->dest, fpu->source);
-
-    /*
-     * operr: subtracting infinities of equal sign is invalid and gives
-     * NaN. (SoftFloat is believed to handle this correctly, but the code
-     * is hard to read.)
-     *
-     * Both the 68kprm and the 68881 docs say (+inf) - (-inf) = (-inf);
-     * that is presumed to be a typo for (+inf).
-     */
-    es_operr |= ((float_exception_flags & float_flag_invalid) != 0);
-
-    /* inex2: the difference could not be represented exactly */
-    es_inex2 |= ((float_exception_flags & float_flag_inexact) != 0);
-
-    /* ovfl: the difference overflowed */
-    es_ovfl |= ((float_exception_flags & float_flag_overflow) != 0);
-
-    /* unfl: the difference underflowed */
-    es_unfl |= ((float_exception_flags & float_flag_underflow) != 0);
-}
-
-/*
- * ftan: tangent of fpu->source.
- *
- *   zero -> the same zero
- *   inf  -> NaN, operr is raised
- * Anything else is handed to _hack_tan().
- */
-static void inst_fmath_ftan ()
-{
-    fpu_get_state_ptr();
-
-    const _Bool src_is_zero = _float128_is_zero(fpu->source);
-    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
-
-    if (src_is_zero) {
-        /* tan(+/-0) is the same zero */
-        fpu->result = fpu->source;
-    }
-    else if (src_is_inf) {
-        /* tan(inf) is an operand error */
-        es_operr = 1;
-        fpu->result = _nan128;
-    }
-    else {
-        fpu->result = _hack_tan(fpu->source);
-        /* Set inex2?? */
-    }
-}
-
-/*
- * ftanh: hyperbolic tangent of fpu->source.
- *
- *   zero    -> the same zero
- *   +/- inf -> +/- 1.0
- * Anything else is handed to _hack_tanh().
- */
-static void inst_fmath_ftanh ()
-{
-    fpu_get_state_ptr();
-
-    const _Bool src_is_zero = _float128_is_zero(fpu->source);
-    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
-    const _Bool src_is_neg = _float128_is_neg(fpu->source);
-
-    if (src_is_zero) {
-        fpu->result = fpu->source;
-    }
-    else if (src_is_inf) {
-        /* Exponent 0x3fff with a zero fraction is 1.0; keep the sign */
-        fpu->result = _assemble_float128(src_is_neg, 0x3fff, 0, 0);
-    }
-    else {
-        fpu->result = _hack_tanh(fpu->source);
-    }
-}
-
-/*
- * ftentox (10^x): currently disabled. The only live statement is the
- * assertion at the bottom; the intended implementation is kept below as
- * commented-out code.
- */
-static void inst_fmath_ftentox ()
-{
-    fpu_get_state_ptr();
-    
-    // The implementation is disabled because _hack_ftentox() is broken
-    // with clang 3.5 on OS X 10.10: the compiler tries to optimize
-    // pow(10.0, x) into __exp10(x), and __exp10 isn't implemented in
-    // the 10.8 SDK.
-//    const _Bool source_zero = _float128_is_zero(fpu->source);
-//    const _Bool source_inf = _float128_is_infinity(fpu->source);
-//    const _Bool source_sign = _float128_is_neg(fpu->source);
-//    
-//    /* If source is zero, result is +1.0 */
-//    if (source_zero) {
-//        fpu->result = _one128;
-//        return ;
-//    }
-//    /* if source is -inf, result is +0.0 */
-//    else if (source_inf && source_sign) {
-//        fpu->result = _zero128;
-//        return ;
-//    }
-//    /* if source is +inf, result is +inf */
-//    else if (source_inf) {
-//        fpu->result = fpu->source;
-//        return ;
-//    }
-//    
-//    fpu->result = _hack_tentox(fpu->source);
-//    
-    /* The string literal is non-NULL, so !"..." is always false */
-    assert(!"fmath: ftentox not implemented");
-}
-
-/*
- * ftst: the source becomes the "result" so it can drive the condition
- * codes, and register write-back is switched off.
- */
-static void inst_fmath_ftst ()
-{
-    fpu_get_state_ptr();
-
-    /* Only the condition codes are wanted, derived from the source */
-    fpu->result = fpu->source;
-
-    /* The destination register must be left untouched */
-    fpu->write_back = 0;
-}
-
-/*
- * ftwotox: 2 raised to the power fpu->source.
- *
- *   zero -> +1.0
- *   -inf -> +0.0
- *   +inf -> +inf
- * Anything else is handed to _hack_twotox().
- */
-static void inst_fmath_ftwotox ()
-{
-    fpu_get_state_ptr();
-
-    const _Bool src_is_zero = _float128_is_zero(fpu->source);
-    const _Bool src_is_inf = _float128_is_infinity(fpu->source);
-    const _Bool src_is_neg = _float128_is_neg(fpu->source);
-
-    if (src_is_zero) {
-        fpu->result = _one128;
-    }
-    else if (src_is_inf) {
-        /* 2^(-inf) collapses to +0.0; 2^(+inf) stays +inf */
-        if (src_is_neg)
-            fpu->result = _zero128;
-        else
-            fpu->result = fpu->source;
-    }
-    else {
-        fpu->result = _hack_twotox(fpu->source);
-    }
-}
-
-/* Signature shared by every inst_fmath_*() helper: no arguments, no return */
-typedef void (fmath_impl_t)(void);
-/* Bits tested against _fmath_flags[] entries elsewhere in this file */
-#define FMATH_TYPE_DYADIC 1 /* op also reads the destination register (fpu->dest) */
-#define FMATH_TYPE_68040 2 /* 68040-only op; inst_fmath() rejects it as illegal */
-/*
- * macro.pl directive. NOTE(review): presumably this expands to the
- * _fmath_map[], _fmath_flags[] and _fmath_names[] tables indexed by the
- * 7-bit extension field -- confirm against macro.pl.
- */
-~create_fmath_jump_table()
-
-
-/*
- * Round fpu->result to the precision selected by mc_prec, using the
- * rounding mode selected by mc_rnd, and return it as a floatx80.
- *
- * SoftFloat's exception flags are cleared first, so any inexact,
- * underflow or overflow flag seen after the narrowing conversion comes
- * from this rounding step; those are OR-ed into inex2, unfl and ovfl.
- */
-static floatx80 _fmath_round_intermediate_result ()
-{
-    fpu_get_state_ptr();
-    floatx80 rounded;
-
-    /* Start from a clean slate so the flags describe this step only */
-    float_exception_flags = 0;
-    _set_rounding_mode(mc_rnd);
-
-    if (mc_prec == prec_extended) {
-        /* Extended precision: narrow straight to floatx80 */
-        rounded = float128_to_floatx80(fpu->result);
-        if (float_exception_flags & float_flag_inexact)
-            es_inex2 = 1;
-        if (float_exception_flags & float_flag_underflow)
-            es_unfl = 1;
-        if (float_exception_flags & float_flag_overflow)
-            es_ovfl = 1;
-    }
-    else if (mc_prec == prec_double) {
-        /* Double precision: narrow to float64, then widen again */
-        const float64 narrowed = float128_to_float64(fpu->result);
-        if (float_exception_flags & float_flag_inexact)
-            es_inex2 = 1;
-        if (float_exception_flags & float_flag_underflow)
-            es_unfl = 1;
-        if (float_exception_flags & float_flag_overflow)
-            es_ovfl = 1;
-        rounded = float64_to_floatx80(narrowed);
-    }
-    else if (mc_prec == prec_single) {
-        /* Single precision: narrow to float32, then widen again */
-        const float32 narrowed = float128_to_float32(fpu->result);
-        if (float_exception_flags & float_flag_inexact)
-            es_inex2 = 1;
-        if (float_exception_flags & float_flag_underflow)
-            es_unfl = 1;
-        if (float_exception_flags & float_flag_overflow)
-            es_ovfl = 1;
-        rounded = float32_to_floatx80(narrowed);
-    }
-    else
-        assert(!"bogus precision mode???");
-
-    return rounded;
-}
-
-/*
- * Recompute the FPSR condition codes (N, Z, I, NAN) from an
- * extended-precision value.
- */
-static void _fmath_set_condition_codes (floatx80 val)
-{
-    fpu_get_state_ptr();
-
-    const uint64_t mantissa = val.low;
-    const uint32_t biased_exp = val.high & 0x7fff;
-    /* Shifting out the top mantissa bit leaves only the bits below it */
-    const uint64_t below_top_bit = mantissa << 1;
-    const _Bool exp_all_ones = (biased_exp == 0x7fff);
-
-    /* Wipe the top byte of the FPSR (the condition-code byte) */
-    fpu->fpsr.raw &= 0x00ffffff;
-
-    /* N: bit 15 of the high word is set */
-    cc_n = ((val.high >> 15) != 0);
-
-    /* Z: exponent and mantissa are both zero */
-    cc_z = ((biased_exp == 0) && (mantissa == 0));
-
-    /* I: maximum exponent, nothing set below the top mantissa bit */
-    cc_i = (exp_all_ones && (below_top_bit == 0));
-
-    /* NAN: maximum exponent, something set below the top mantissa bit */
-    cc_nan = (exp_all_ones && (below_top_bit != 0));
-}
-
-/*
- * Produce the result for an operation in which at least one operand is
- * a NaN. A NaN destination (dyadic ops only) takes priority over a NaN
- * source. snan records whether any participating operand was signaling,
- * and the returned NaN is always quieted.
- */
-static void _fmath_handle_nans ()
-{
-    fpu_get_state_ptr();
-
-    const _Bool dyadic = _fmath_flags[fpu->fmath_op] & FMATH_TYPE_DYADIC;
-    const _Bool any_signaling = float128_is_signaling_nan(fpu->source) ||
-                                (dyadic && float128_is_signaling_nan(fpu->dest));
-    const _Bool src_nan = float128_is_nan(fpu->source);
-    const _Bool dst_nan = dyadic && float128_is_nan(fpu->dest);
-
-    /* The destination wins whenever it is a NaN, even if both are */
-    if (!dst_nan) {
-        assert(src_nan);
-        fpu->result = fpu->source;
-    }
-    else
-        fpu->result = fpu->dest;
-
-    /* Report a signaling NaN through the snan exception bit */
-    es_snan = any_signaling;
-
-    /* Quiet the result: this bit is 0 for signaling, 1 for non-signaling */
-    fpu->result.high |= 0x800000000000;
-}
-
-/*
- * Disassemble a general FPU arithmetic ("fmath") instruction, or an
- * fmovecr, into `output`.
- *
- *   op     - first instruction word (holds the effective-address field M)
- *   ext    - extension word: source-in-EA flag, source specifier,
- *            destination register and 7-bit opcode extension
- *   output - caller-supplied buffer; written with sprintf(), so the
- *            caller must make it large enough (no bound is checked here)
- */
-void dis_fmath (uint16_t op, uint16_t ext, char *output)
-{
-    ~decompose(op, 1111 001 000 MMMMMM);
-    ~decompose(ext, 0 a 0 sss ddd eeeeeee);
-    
-    const uint8_t src_in_ea = a;
-    const uint8_t source_specifier = s;
-    const uint8_t dest_register = d;
-    const uint8_t extension = e;
-    
-    /* If this is fmovecr (source-in-EA flag set, source specifier 7) */
-    if (src_in_ea && (source_specifier == 7)) {
-        /* Symbolic names for the constant-ROM offsets known here */
-        const char *name = NULL;
-        switch (extension) {
-            case 0x00: name = "pi"; break;
-            case 0x0b: name = "log_10(2)"; break;
-            case 0x0c: name = "c"; break;
-            case 0x0d: name = "log_2(e)"; break;
-            case 0x0e: name = "log_10(e)"; break;
-            case 0x0f: name = "0.0"; break;
-            case 0x30: name = "ln(2)"; break;
-            case 0x31: name = "ln(10)"; break;
-            case 0x32: name = "1.0"; break;
-            case 0x33: name = "10.0"; break;
-            case 0x34: name = "10.0^2"; break;
-            case 0x35: name = "10.0^4"; break;
-            case 0x36: name = "10.0^8"; break;
-            case 0x37: name = "10.0^16"; break;
-            case 0x38: name = "10.0^32"; break;
-            case 0x39: name = "10.0^64"; break;
-            case 0x3a: name = "10.0^128"; break;
-            case 0x3b: name = "10.0^256"; break;
-            case 0x3c: name = "10.0^512"; break;
-            case 0x3d: name = "10.0^1024"; break;
-            case 0x3e: name = "10.0^2048"; break;
-            case 0x3f: name = "10.0^4096"; break;
-        }
-        /* Offsets without a symbolic name are printed numerically */
-        if (name == NULL)
-            sprintf(output, "fmovecr.x #%u,fp%u", extension, dest_register);
-        else
-            sprintf(output, "fmovecr.x %s,fp%u", name, dest_register);
-        return;
-    }
-    
-    if (_fmath_map[e] == NULL) {
-        /* This instruction isn't defined */
-        sprintf(output, "fmath???");
-    }
-    else if (_fmath_map[e] == inst_fmath_fsincos) {
-        /* fsincos.<fmt> <ea>,FPc:FPs (FPc is the low 3 bits of e) */
-        if (src_in_ea)
-            sprintf(output, "fsincos.%c %s,fp%u:fp%u", "lsxpwdb?"[source_specifier],
-                    decode_ea_rw(M, _format_sizes[source_specifier]), e & 7, dest_register);
-        else
-            sprintf(output, "fsincos.x fp%u,fp%u:fp%u", source_specifier, e & 7, dest_register);
-    }
-    else if (_fmath_map[e] == inst_fmath_ftst) {
-        /* ftst.<fmt> <source> */
-        if (src_in_ea)
-            sprintf(output, "ftst.%c %s", "lsxpwdb?"[source_specifier],
-                    decode_ea_rw(M, _format_sizes[source_specifier]));
-        else
-            /*
-             * The register being tested is the source operand, which is
-             * held in the source-specifier field (as inst_fmath() reads
-             * it); the destination field plays no part in ftst.
-             */
-            sprintf(output, "ftst.x fp%u", source_specifier);
-    }
-    else {
-        /* f<inst>.<fmt> <source>,<dest> */
-        if (src_in_ea)
-            sprintf(output, "%s.%c %s,fp%u", _fmath_names[e], "lsxpwdb?"[source_specifier],
-                    decode_ea_rw(M, _format_sizes[source_specifier]), dest_register);
-        else
-            sprintf(output, "%s.x fp%u,fp%u", _fmath_names[e],
-                    source_specifier, dest_register);
-    }
-}
-
-/*
- * Convert a float128 to a host long double (used for log output) by
- * narrowing it to floatx80 and copying the 10 bytes of that value into
- * the long double's storage, least-significant byte first.
- * NOTE(review): this presumes the host long double is a little-endian
- * 80-bit extended value -- confirm for non-x86 hosts.
- *
- * SoftFloat's exception flags are saved and restored around the
- * narrowing, so the caller's flags are left as they were.
- */
-static long double _float128_to_long_double(float128 f128)
-{
-    long double host_value;
-    uint8_t *bytes = (uint8_t*)&host_value;
-    uint32_t i;
-
-    const int8_t saved_flags = float_exception_flags;
-    const floatx80 f80 = float128_to_floatx80(f128);
-    float_exception_flags = saved_flags;
-
-    /* Bytes 0..7: the 64-bit mantissa word, low byte first */
-    for (i = 0; i < 8; i++)
-        bytes[i] = (f80.low >> (8 * i)) & 0xff;
-
-    /* Bytes 8..9: the 16-bit sign/exponent word */
-    bytes[8] = (f80.high >> 0) & 0xff;
-    bytes[9] = (f80.high >> 8) & 0xff;
-
-    return host_value;
-}
-
-/*
- * Reinterpret a floatx80 as a host long double (used for log output) by
- * copying its 10 bytes into the long double's storage,
- * least-significant byte first.
- * NOTE(review): this presumes the host long double is a little-endian
- * 80-bit extended value -- confirm for non-x86 hosts.
- */
-static long double _floatx80_to_long_double(floatx80 f80)
-{
-    long double host_value;
-    uint8_t *bytes = (uint8_t*)&host_value;
-    uint32_t i;
-
-    /* Bytes 0..7: the 64-bit mantissa word, low byte first */
-    for (i = 0; i < 8; i++)
-        bytes[i] = (f80.low >> (8 * i)) & 0xff;
-
-    /* Bytes 8..9: the 16-bit sign/exponent word */
-    bytes[8] = (f80.high >> 0) & 0xff;
-    bytes[9] = (f80.high >> 8) & 0xff;
-
-    return host_value;
-}
-
-/*
- * Execute one general FPU arithmetic instruction (or fmovecr).
- *
- *   ext - the extension word: source-in-EA flag, source specifier,
- *         destination register and 7-bit opcode extension
- *
- * Steps: reject illegal opcodes, load the operands as float128, clear
- * the exception state, short-circuit NaN operands, run the per-op helper
- * from _fmath_map[], round the result to the selected precision, update
- * the accrued exception bits, and then either throw an enabled
- * exception or set the condition codes and write the result back.
- */
-static void inst_fmath (const uint16_t ext)
-{
-    fpu_get_state_ptr();
-    
-    floatx80 rounded_result;
-    
-    ~decompose(shoe.op, 1111 001 000 MMMMMM);
-    ~decompose(ext, 0 a 0 sss ddd eeeeeee);
-    
-    const uint8_t src_in_ea = a;
-    const uint8_t source_specifier = s;
-    const uint8_t dest_register = d;
-    const uint8_t extension = e;
-    
-    slog("FPU:---\n");
-    
-    /* Throw illegal instruction for 040-only ops */
-    if (_fmath_flags[e] & FMATH_TYPE_68040) {
-        _throw_illegal_instruction();
-        return;
-    }
-    
-    /*
-     * All the documented fmath ops have an implementation in
-     * _fmath_map[]. If it's NULL, it's not documented; throw
-     * an exception.
-     * This probably matches the 68040's behavior (I haven't
-     * checked), but the 68881 doesn't do this.
-     * 68881 throws an illegal instruction exception for all
-     * opcodes where the high (6th) bit of e is set.
-     * All other instructions seem to short circuit to the 
-     * nearest documented instruction.
-     * FIXME: consider implementing this behavior.
-     */
-    if (_fmath_map[e] == NULL) {
-        /* Unless this is fmovecr, where the extension doesn't matter */
-        if (!(src_in_ea && (source_specifier == 7))) {
-            _throw_illegal_instruction();
-            return ;
-        }
-    }
-    
-    /* We only need to load the dest reg for dyadic ops */
-    if (_fmath_flags[e] & FMATH_TYPE_DYADIC)
-        fpu->dest = floatx80_to_float128(fpu->fp[dest_register]);
-    
-    /*
-     * We'll shrink the precision and perform rounding
-     * just prior to writing back the result.
-     * Certain instructions override the precision
-     * in fpcr, so keep track of the preferred prec here.
-     */
-    enum rounding_precision_t rounding_prec = mc_prec;
-    
-    /*
-     * For all the intermediate calculations, we
-     * probably want to use nearest-rounding mode.
-     */
-    _set_rounding_mode(mode_nearest);
-    
-    /* Reset softfloat's exception flags */
-    float_exception_flags = 0;
-    
-    /* Reset fpsr's exception flags */
-    es_inex1 = 0; // this is only set for imprecisely-rounded packed inputs (not implemented)
-    es_inex2 = 0; // set if we ever lose precision (during the op or during rounding)
-    es_dz = 0;    // set if we divided by zero
-    es_unfl = 0;  // set if we underflowed (inex2 should be set too, I think)
-    es_ovfl = 0;  // set if we overflowed (inex2 should be set too, I think)
-    es_operr = 0; // set if there was an instruction specific operand error
-    es_snan = 0;  // Set if one of the inputs was a signaling NaN
-    es_bsun = 0;  // never set here
-    
-    fpu->write_back = 1; // let "do-write-back" be the default behavior
-    fpu->fmath_op = e;
-    
-    /* Handle fmovecr */
-    if (src_in_ea && (source_specifier == 7)) { // fmovecr
-        /* 
-         * 68kprm says M should be ~b(000000), but apparently
-         * any value will work for fmovecr
-         */
-        slog("FPU: fmovecr %u,fp%u\n", e, dest_register);
-        inst_fmath_fmovecr();
-        goto computation_done;
-    }
-    
-    /*
-     * Read in the source from the EA or from a register.
-     * In either case, convert the value to a float128,
-     * (that's our version of the 85-bit "intermediate" format)
-     */
-    if (src_in_ea) {
-        if (!_fpu_read_ea(source_specifier, &fpu->source))
-            return ;
-        slog("FPU: %s.%c ", _fmath_names[e], "lsxpwdb?"[source_specifier]);
-    }
-    else {
-        fpu->source = floatx80_to_float128(fpu->fp[source_specifier]);
-        /* Register sources are always extended (.x); no format char needed */
-        slog("FPU: %s.x ", _fmath_names[e]);
-    }
-
-    
-    {
-        /*
-         * Finish the trace line started above. This goes through slog()
-         * like all the other tracing here, rather than straight to stdout.
-         */
-        long double tmp = _float128_to_long_double(fpu->source);
-        slog("%Lf,fp%u\n", tmp, dest_register);
-    }
-    
-    /*
-     * If the source is NaN, or this is a dyadic (two-operand)
-     * instruction, and the second operand (fpu->dest) is NaN,
-     * then the result is predetermined: NaN
-     */
-    if (float128_is_nan(fpu->source) ||
-             (((_fmath_flags[e] & FMATH_TYPE_DYADIC) &&
-               float128_is_nan(fpu->dest)))) {
-        _fmath_handle_nans();
-        goto computation_done;
-    }
-    
-    /* 
-     * Otherwise, call the extension-specific helper function.
-     * Guarantees: Neither source nor dest are NaN
-     *             SoftFloat's exception flags have been cleared
-     */
-    _fmath_map[e]();
-    
-    /* 
-     * At this point, the "computation"-phase (I forget what the correct
-     * 6888x term is) is over. Now we check exception bits, throw exceptions,
-     * compute condition codes, and round and store the result.
-     */
-computation_done:
-    
-    /* Convert the 128-bit result to the specified precision */
-    /*
-     * FIXME: If fpu->write_back==0, should we still go through rounding?
-     *        The condition codes will still need to be set. Should they
-     *        be set based on the intermediate result or rounded result?
-     */
-    rounded_result = _fmath_round_intermediate_result();
-    
-    
-    /* Update the accrued exception bits */
-    
-    assert(!es_bsun); // no fmath op can throw es_bsun
-    
-    ae_iop |= es_bsun | es_snan | es_operr;
-    ae_ovfl |= es_ovfl;
-    ae_unfl |= (es_unfl & es_inex2); // yes, &
-    ae_dz |= es_dz;
-    ae_inex |= es_inex1 | es_inex2 | es_ovfl;
-    
-    slog("FPU: bsun=%u snan=%u operr=%u ovfl=%u unfl=%u dz=%u inex1=%u inex2=%u\n",
-           es_bsun, es_snan, es_operr, es_ovfl, es_unfl, es_dz, es_inex1, es_inex2);
-    
-    /* Are any exceptions both set and enabled? */
-    if (fpu->fpsr.raw & fpu->fpcr.raw & 0x0000ff00) {
-        /* 
-         * Then we need to throw an exception.
-         * The exception is sent to the vector for
-         * the highest priority exception, and the priority
-         * order is (high->low) bsun, snan, operr, ovfl, unfl, dz, inex2/1
-         * (which is the order of the bits in fpsr/fpcr).
-         * Iterate over the bits in order, and throw the
-         * exception to whichever bit is set first.
-         */
-        uint8_t i, throwable = (fpu->fpsr.raw & fpu->fpcr.raw) >> 8;
-        
-        slog("FPU: throw exception! 0x%08x\n", throwable);
-        
-        /* The loop below only terminates if at least one bit is set */
-        assert(throwable);
-        for (i=0; 1; i++) {
-            if (throwable & 0x80)
-                break;
-            throwable <<= 1;
-        }
-        
-        /*
-         * Convert the exception bit position
-         * to the correct vector number, and throw
-         * a (pre-instruction) exception.
-         */
-        throw_fpu_pre_instruction_exception(_exception_bit_to_vector[i]);
-        
-        return ;
-    }
-    
-    /*
-     * Otherwise, no exceptions to throw!
-     * Calculate the condition codes from the result.
-     */
-    _fmath_set_condition_codes(rounded_result);
-    
-    /*
-     * We're definitely running to completion now,
-     * so commit ea-read changes
-     */
-    _fpu_read_ea_commit(source_specifier);
-    
-    /* Write back the result, and we're done! */
-    if (fpu->write_back) {
-        fpu->fp[dest_register] = rounded_result;
-        
-        long double tmp = _floatx80_to_long_double(rounded_result);
-        slog("FPU: result = %Lf\n", tmp);
-    }
-}
-
-#pragma mark Second-hop non-fmath instructions
-
-/*
- * reg->mem fmove (inst_fmath() handles all the other fmoves).
- *
- * Stores FP register `s` to the effective address M using destination
- * format `f`; K is the extension word's low 7-bit field, passed through
- * to _fpu_write_ea().
- */
-static void inst_fmove (const uint16_t ext)
-{
-    fpu_get_state_ptr();
-
-    ~decompose(ext, 011 fff sss KKKKKKK);
-    ~decompose(shoe.op, 1111 001 000 mmmrrr);
-    ~decompose(shoe.op, 1111 001 000 MMMMMM);
-
-    _fpu_write_ea(M, f, &fpu->fp[s], K);
-}
-
-/*
- * fmove/fmovem to or from the FPU control registers (fpcr, fpsr, fpiar).
- *
- *   ext - the extension word: direction bit d (1 = registers to
- *         memory) and the C/S/I register-select bits
- *
- * Selected registers are transferred as consecutive 32-bit values in
- * the order fpcr, fpsr, fpiar. Data- and address-register operands are
- * only allowed for a single register, and an address register only for
- * fpiar.
- */
-static void inst_fmovem_control (const uint16_t ext)
-{
-    fpu_get_state_ptr();
-    
-    ~decompose(shoe.op,  1111 001 000 mmmrrr);
-    ~decompose(shoe.op,  1111 001 000 MMMMMM);
-    ~decompose(ext, 10d CSI 0000000000);
-    
-    const uint32_t count = C + S + I;
-    const uint32_t size = count * 4;
-    uint32_t addr, buf[3];
-    uint32_t i;
-    
-    /* I don't know if this is even a valid instruction */
-    if (count == 0)
-        return ;
-    
-    /* data and addr reg modes are valid, but only if count==1 */
-    if ((m == 0 || m == 1) && (count > 1)) {
-        _throw_illegal_instruction();
-        return ;
-    }
-    
-    if (d) { // reg to memory
-        /* Gather the selected registers in transfer order */
-        i=0;
-        if (C) buf[i++] = fpu->fpcr.raw;
-        if (S) buf[i++] = fpu->fpsr.raw;
-        if (I) buf[i++] = fpu->fpiar;
-        
-        if (m == 0) {
-            if (count == 1)
-                shoe.d[r] = buf[0];
-            else
-                _throw_illegal_instruction();
-            return ;
-        }
-        else if (m == 1) {
-            /* Only fpiar may be moved to an address register */
-            if ((count == 1) && I)
-                shoe.a[r] = buf[0];
-            else
-                _throw_illegal_instruction();
-            return ;
-        }
-        else if (m == 3)
-            addr = shoe.a[r];
-        else if (m == 4)
-            addr = shoe.a[r] - size;
-        else {
-            /*
-             * Of the mode-7 encodings, only absolute short (r==0) and
-             * absolute long (r==1) are writable destinations.
-             * (This test used ||, which is true for every r and so
-             * rejected the two valid absolute modes as well.)
-             */
-            if ((m==7) && (r!=0 && r!=1)) {
-                /* Not allowed for reg->mem */
-                _throw_illegal_instruction();
-                return;
-            }
-            call_ea_addr(M);
-            addr = shoe.dat;
-        }
-        
-        for (i=0; i<count; i++) {
-            lset(addr + (i*4), 4, buf[i]);
-            if (shoe.abort)
-                return ;
-        }
-    }
-    else { // mem to reg
-        /*
-         * The register-direct cases must fall through to the
-         * control-register assignment below. (They used to return right
-         * after filling buf[], so the control register was never loaded.)
-         */
-        if (m == 0) {// data reg
-            if (count != 1) {
-                _throw_illegal_instruction();
-                return;
-            }
-            buf[0] = shoe.d[r];
-        }
-        else if (m == 1) {// addr reg
-            /* Only fpiar may be loaded from an address register */
-            if (!((count == 1) && I)) {
-                _throw_illegal_instruction();
-                return;
-            }
-            buf[0] = shoe.a[r];
-        }
-        else {
-            if (m == 3) // post-increment
-                addr = shoe.a[r];
-            else if (m == 4) // pre-decrement
-                addr = shoe.a[r] - size;
-            else if (M == 0x3c) // immediate
-                addr = shoe.pc;
-            else {
-                call_ea_addr(M); // call_ea_addr() should work for all other modes
-                addr = shoe.dat;
-            }
-            
-            for (i=0; i<count; i++) {
-                buf[i] = lget(addr + (i*4), 4);
-                if (shoe.abort)
-                    return ;
-            }
-        }
-        
-        /* Distribute the fetched values in transfer order */
-        i = 0;
-        
-        if (C) {
-            uint8_t round = fpu->fpcr.b._mc_rnd;
-            fpu->fpcr.raw = buf[i++];
-            uint8_t newround = fpu->fpcr.b._mc_rnd;
-            
-            /* Log rounding-mode changes; useful when debugging */
-            if (round != newround) {
-                slog("inst_fmovem_control: HEY: round %u -> %u\n", round, newround);
-            }
-        }
-        if (S) fpu->fpsr.raw = buf[i++];
-        if (I) fpu->fpiar = buf[i++];
-        
-        // Commit immediate-EA-mode PC change
-        if (M == 0x3c)
-            shoe.pc += size;
-    }
-    
-    // Commit pre/post-inc/decrement
-    
-    if (m == 3)
-        shoe.a[r] += size;
-    if (m == 4)
-        shoe.a[r] -= size;
-    
-    /* Note: shoe.dat is only fresh here if call_ea_addr() ran above */
-    slog("inst_fmove_control: notice: (EA = %u/%u %08x CSI = %u%u%u)\n", m, r, (uint32_t)shoe.dat, C, S, I);
-}
-
-/*
- * fmovem: move a set of FP data registers to or from memory, 12 bytes
- * per register.
- *
- *   ext - the extension word: direction bit d (1 = registers to
- *         memory), the p/s mode bits, and either a static 8-bit
- *         register mask or the number of the data register holding it
- */
-static void inst_fmovem (const uint16_t ext)
-{
-    fpu_get_state_ptr();
-
-    ~decompose(shoe.op,  1111 001 000 mmmrrr);
-    ~decompose(shoe.op,  1111 001 000 MMMMMM);
-    ~decompose(ext, 11 d ps 000 LLLLLLLL); // Static register mask
-    ~decompose(ext, 11 0 00 000 0yyy0000); // Register for dynamic mode
-
-    /* The mask as encoded: from data register y (dynamic) or L (static) */
-    const uint8_t pre_mask = s ? shoe.d[y] : L;
-
-    uint32_t i, addr;
-
-    /* One register is transferred per set mask bit */
-    uint32_t count = 0;
-    for (i = 0; i < 8; i++)
-        count += ((pre_mask >> i) & 1);
-
-    const uint32_t size = count * 12;
-
-    /* Predecrement mode encodes the mask bit-reversed; undo that here */
-    uint8_t mask = pre_mask;
-    if (m == 4) {
-        mask = 0;
-        for (i = 0; i < 8; i++) {
-            if (pre_mask & (1 << i))
-                mask |= (0x80 >> i);
-        }
-    }
-
-    /* Work out the starting address of the transfer */
-    switch (m) {
-        case 3:
-            addr = shoe.a[r];
-            assert(p); // assert post-increment mask
-            break;
-        case 4:
-            addr = shoe.a[r] - size;
-            assert(!p); // assert pre-decrement mask
-            break;
-        default:
-            call_ea_addr(M);
-            addr = shoe.dat;
-            assert(p); // assert post-increment mask
-            break;
-    }
-
-    slog("inst_fmovem: pre=%08x mask=%08x EA=%u/%u addr=0x%08x size=%u %s\n", pre_mask, mask, m, r, addr, size, d?"to mem":"from mem");
-
-    /* Bit 0x80 selects fp0, 0x40 selects fp1, ... 0x01 selects fp7 */
-    for (i = 0; i < 8; i++) {
-        uint8_t buf[12];
-        uint32_t j;
-
-        if (!(mask & (0x80 >> i)))
-            continue;
-
-        if (d) {
-            /* Register -> memory, one byte at a time */
-            _floatx80_to_extended(&fpu->fp[i], buf);
-
-            for (j = 0; j < 12; j++) {
-                slog(" %02x", buf[j]);
-                lset(addr, 1, buf[j]);
-                addr++;
-                if (shoe.abort)
-                    return ;
-            }
-            slog("\n");
-        }
-        else {
-            /* Memory -> register, one byte at a time */
-            for (j = 0; j < 12; j++) {
-                buf[j] = lget(addr, 1);
-                addr++;
-                if (shoe.abort)
-                    return ;
-            }
-            _extended_to_floatx80(buf, &fpu->fp[i]);
-        }
-    }
-
-    /* Only now, with every access done, update the address register */
-    if (m == 3)
-        shoe.a[r] += size;
-    else if (m == 4)
-        shoe.a[r] -= size;
-}
-
-
-#pragma mark First-hop decoder table inst implementations
-/* 
- * The table generated by decoder_gen.c will refer directly
- * to these instructions. inst_fpu_other() will handle all
- * other FPU instructions.
- */
-
-static _Bool fpu_test_cc(uint8_t cc)
-{
-    fpu_get_state_ptr();
-    const _Bool z = cc_z;
-    const _Bool n = cc_n;
-    const _Bool nan = cc_nan;
-    
-    switch (cc & 0x0f) {
-        case 0: // false
-            return 0;
-        case 1: // equal
-            return z;
-        case 2: // greater than
-            return !(nan | z | n);
-        case 3: // greater than or equal
-            return z | !(nan | n);
-        case 4: // less than
-            return n & !(nan | z);
-        case 5: // less than or equal
-            return z | (n & !nan);
-        case 6: // greater or less than
-            return !(nan | z);
-        case 7: // ordered
-            return !nan;
-        case 8: // unordered
-            return nan;
-        case 9: // not (greater or less than)
-            return nan | z;
-        case 10: // not (less than or equal)
-            return nan | !(n | z);
-        case 11: // not (less than)
-            return nan | (z | !n);
-        case 12: // not (greater than or equal)
-            return nan | (n & !z);
-        case 13: // not (greater than)
-            return nan | z | n;
-        case 14: // not equal
-            return !z;
-        case 15: // true
-            return 1;
-    }
-    
-    assert(0);
-    return 0;
-}
-
-void inst_fscc () {
-    fpu_get_state_ptr();
-    
-    // fscc can throw an exception
-    fpu->fpiar = shoe.orig_pc;
-    
-    const uint16_t ext = nextword();
-    
-    ~decompose(shoe.op, 1111 001 001 MMMMMM);
-    ~decompose(ext, 0000 0000 000 b cccc);
-    
-    /*
-     * inst_f*cc instructions throw a pre-instruction exception
-     * if b && cc_nan
-     */
-    if (b && _bsun_test())
-        return ;
-    
-    shoe.dat = fpu_test_cc(c) ? 0xff : 0;
-    
-    call_ea_write(M, 1);
-}
-
-void inst_fbcc () {
-    fpu_get_state_ptr();
-    
-    // fbcc can throw an exception
-    fpu->fpiar = shoe.orig_pc;
-    
-    ~decompose(shoe.op, 1111 001 01 s 0bcccc); // b => raise BSUN if NaN
-    const uint8_t sz = 2 << s;
-    
-    /*
-     * inst_f*cc instructions throw a pre-instruction exception 
-     * if b && cc_nan
-     */
-    if (b && _bsun_test())
-        return ;
-    
-    if (fpu_test_cc(c)) {
-        const uint16_t ext = nextword();
-        uint32_t displacement;
-    
-        if (s) {
-            const uint16_t ext2 = nextword();
-            displacement = (ext << 16) | ext2;
-        }
-        else
-            displacement = (int16_t)ext;
-        
-        shoe.pc = shoe.orig_pc + 2 + displacement;
-    }
-    else
-        shoe.pc += sz;
-}
-
-void inst_fsave () {
-    fpu_get_state_ptr();
-    verify_supervisor();
-    
-    // Don't modify fpiar for fsave
-    
-    ~decompose(shoe.op, 1111 001 100 MMMMMM);
-    ~decompose(shoe.op, 1111 001 100 mmmrrr);
-    
-    const uint32_t size = 0x1c; // IDLE frame
-    const uint16_t frame_header = 0xfd18;
-    uint32_t addr;
-    
-    if (m == 4)
-        addr = shoe.a[r] - size;
-    else {
-        call_ea_addr(M);
-        addr = shoe.dat;
-    }
-    
-    lset(addr, 2, frame_header);
-    if (shoe.abort)
-        return ;
-    
-    if (m == 4)
-        shoe.a[r] = addr;
-    
-}
-
-void inst_frestore () {
-    fpu_get_state_ptr();
-    verify_supervisor();
-    
-    // Don't modify fpiar for frestore
-    
-    ~decompose(shoe.op, 1111 001 101 MMMMMM);
-    ~decompose(shoe.op, 1111 001 101 mmmrrr);
-    
-    uint32_t addr, size;
-    
-    if (m == 3)
-        addr = shoe.a[r];
-    else {
-        call_ea_addr(M);
-        addr = shoe.dat;
-    }
-    
-    const uint16_t word = lget(addr, 2);
-    if (shoe.abort) return ;
-    
-    // XXX: These frame sizes are different on 68881/68882/68040
-    if ((word & 0xff00) == 0x0000)
-        size = 4; // NULL state frame
-    else if ((word & 0xff) == 0x0018)
-        size = 0x1c; // IDLE state frame
-    else if ((word & 0xff) == 0x00b4)
-        size = 0xb8; // BUSY state frame
-    else {
-        slog("Frestore encountered an unknown state frame 0x%04x\n", word);
-        assert(!"inst_frestore: bad state frame");
-        return ;
-    }
-    
-    if (m==3) {
-        shoe.a[r] += size;
-        slog("frestore: changing shoe.a[%u] += %u\n", r, size);
-    }
-}
-
-void inst_fdbcc () {
-    fpu_get_state_ptr();
-    ~decompose(shoe.op, 1111 001 001 001 rrr);
-    
-    // fdbcc can throw an exception
-    fpu->fpiar = shoe.orig_pc;
-    
-    const uint16_t ext = nextword();
-    ~decompose(ext, 0000 0000 000 b cccc);
-    
-    /*
-     * inst_f*cc instructions throw a pre-instruction exception
-     * if b && cc_nan
-     */
-    if (b && _bsun_test())
-        return ;
-    
-    if (fpu_test_cc(c)) {
-        shoe.pc += 2;
-    }
-    else {
-        const int16_t disp = nextword();
-        const uint16_t newd = get_d(r, 2) - 1;
-        set_d(r, newd, 2);
-        if (newd != 0xffff)
-            shoe.pc = shoe.orig_pc + 2 + disp;
-    }
-}
-
-void inst_ftrapcc () {
-    fpu_get_state_ptr();
-    ~decompose(shoe.op, 1111 001 001 111 xyz);
-    
-    // ftrapcc can throw an exception
-    fpu->fpiar = shoe.orig_pc;
-    
-    // (xyz) == (100) -> sz=0
-    // (xyz) == (010) -> sz=2
-    // (xyz) == (011) -> sz=4
-    const uint32_t sz = y << (z+1);
-    const uint32_t next_pc = shoe.orig_pc + 2 + sz;
-    
-    const uint16_t ext = nextword();
-    ~decompose(ext, 0000 0000 000 b cccc);
-    
-    /*
-     * inst_f*cc instructions throw a pre-instruction exception
-     * if b && cc_nan
-     */
-    if (b && _bsun_test())
-        return ;
-    
-    if (fpu_test_cc(c))
-        throw_frame_two(shoe.sr, next_pc, 7, shoe.orig_pc);
-    else
-        shoe.pc = next_pc;
-}
-
-void inst_fnop() {
-    // This is technically fbcc
-    inst_fbcc();
-}
-
-void inst_fpu_other () {
-    fpu_get_state_ptr();
-    ~decompose(shoe.op, 1111 001 000 MMMMMM);
-    
-    const uint16_t ext = nextword();
-    ~decompose(ext, ccc xxx yyy eeeeeee);
-    
-    switch (c) {
-        case 0: // Reg to reg
-            fpu->fpiar = shoe.orig_pc; // fmath() can throw an exception
-            inst_fmath(ext);
-            return;
-            
-        case 1: // unused
-            _throw_illegal_instruction();
-            return;
-            
-        case 2: // Memory->reg & movec
-            fpu->fpiar = shoe.orig_pc; // fmath() can throw an exception
-            inst_fmath(ext);
-            return;
-            
-        case 3: // reg->mem
-            fpu->fpiar = shoe.orig_pc; // fmove() can throw an exception
-            inst_fmove(ext);
-            return;
-            
-        case 4: // mem -> sys ctl registers
-        case 5: // sys ctl registers -> mem
-            // fmovem_control() cannot throw an FPU exception (don't modify fpiar)
-            inst_fmovem_control(ext);
-            return;
-            
-        case 6: // movem to fp registers
-        case 7: // movem to memory
-            // fmovem() cannot throw an FPU exception (don't modify fpiar)
-            inst_fmovem(ext);
-            return;
-    }
-    
-    assert(0); // never get here
-    return;
-}
-
-#pragma mark FPU-state initialization and reset
-
-void fpu_initialize()
-{
-    fpu_state_t *fpu = (fpu_state_t*)p_alloc(shoe.pool, sizeof(fpu_state_t));
-    memset(fpu, sizeof(fpu_state_t), 0);
-    shoe.fpu_state = fpu;
-}
-
-void fpu_reset()
-{
-    p_free(shoe.fpu_state);
-    fpu_initialize();
-}
diff --git a/core/oldfpu.c b/core/oldfpu.c
new file mode 100644
index 0000000..03b4e73
--- /dev/null
+++ b/core/oldfpu.c
@@ -0,0 +1,1539 @@
+/*
+ * Copyright (c) 2013, Peter Rutenbar <pruten@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <stdio.h>
+#include <assert.h>
+#include <fenv.h>
+#include <float.h>
+#include <math.h>
+#include "../core/shoebill.h"
+
+extern struct dis_t dis;
+extern uint16_t dis_op;
+
+#define FPU_JUMP_EMU 0
+#define FPU_JUMP_DIS 1
+typedef void (fpu_func_t)(uint16_t, uint16_t); // handler signature; the callers in this file pass (opcode word, extension word)
+// One row per FPU instruction in fpu_inst_table:
+typedef struct {
+    fpu_func_t *emu, *dis; // emulation handler / disassembly handler; callers test for NULL before calling
+    const char *name; // mnemonic, printed in log and disassembly output
+} fpu_inst_t;
+
+~newmacro(create_fpu_jump_table, 0, {
+    my $names = [
+        'unknown', 'fabs', 'facos', 'fadd', 'fasin', 'fatan', 'fatanh', 'fbcc', 'fcmp', 'fcos', 'fcosh',
+        'fdbcc', 'fdiv', 'fetox', 'fetoxm1', 'fgetexp', 'fgetman', 'fint', 'fintrz', 'flog10', 'flog2',
+        'flogn', 'flognp1', 'fmod', 'fmove', 'fmovecr', 'fmovem', 'fmovem_control', 'fmul', 'fneg', 'fnop',
+        'frem', 'frestore', 'fsave', 'fscale', 'fscc', 'fsgldiv', 'fsglmul', 'fsin', 'fsincos', 'fsinh',
+        'fsqrt', 'fsub', 'ftan', 'ftanh', 'ftentox', 'ftrapcc', 'ftst', 'ftwotox'
+    ];
+    my $fpu_enum = "typedef enum {\n";
+    foreach my $n (@$names) {
+        $fpu_enum .= "\tfpu_inst_$n,\n";
+    }
+    $fpu_enum .= "\nfpu_inst_max} fpu_inst_name_t;";
+    
+    my $fpu_table = "fpu_inst_t fpu_inst_table[fpu_inst_max] = {\n";
+    foreach my $n (@$names) {
+        $fpu_table .= "\t{NULL, NULL, \"" . $n . "\"},\n";
+    }
+    $fpu_table = substr($fpu_table, 0, -2);
+    $fpu_table .= "\n};";
+    
+    my $out = "$fpu_enum \n $fpu_table \n";
+    return $out;
+})
+// The call below emits the fpu_inst_name_t enum (fpu_inst_<name>..., then fpu_inst_max) and fpu_inst_table[], one {NULL, NULL, "<name>"} row per name.
+~create_fpu_jump_table()
+
+static fpu_inst_name_t fpu_decode_op(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 ttt MMMMMM);
+    // Classify an FPU opcode word (op) and its extension word (ext) as an fpu_inst_* value; fpu_inst_unknown if nothing matches.
+    if (t) { // non-zero ttt: decided from the opcode word (plus ext only for the fnop check)
+        switch (t) {
+            case 1:
+                if ((M>>3) == 1) // upper three bits of the 6-bit M field
+                    return fpu_inst_fdbcc;
+                else if ((M>>3) == 7)
+                    return fpu_inst_ftrapcc;
+                return fpu_inst_fscc;
+            case 2:
+                if (M==0 && ext == 0)
+                    return fpu_inst_fnop; // same as fbf.w
+                // fall through
+            case 3:
+                return fpu_inst_fbcc;
+            case 4:
+                return fpu_inst_fsave;
+            case 5:
+                return fpu_inst_frestore;
+        }
+        return fpu_inst_unknown; // ttt == 6 or 7
+    }
+    
+    ~decompose(ext, ccc xxx yyy eeeeeee)
+    
+    switch (c) { // top three bits of the extension word
+        case 0: // Reg to reg
+            break;
+        case 1: // unused
+            return fpu_inst_unknown;
+        case 2: // Memory->reg & movec
+            break;
+            
+        case 3: // reg->mem
+            return fpu_inst_fmove;
+            
+        case 4: // mem -> sys ctl registers
+        case 5: // sys ctl registers -> mem
+            return fpu_inst_fmovem_control;
+            
+        case 6: // movem to fp registers
+        case 7: // movem to memory
+            return fpu_inst_fmovem;
+    }
+    
+    // Here c == 0b000 or 010
+    
+    if (M == 0 && ~bmatch(ext, 010 111 xxx xxxxxxx))
+        return fpu_inst_fmovecr;
+    
+    if ((e>>3) == ~b(0110)) // all eight 0110xxx encodings are fsincos
+        return fpu_inst_fsincos;
+    
+    switch (e) { // low seven bits of the extension word select the operation
+            
+        case ~b(0000000):
+        case ~b(1000000):
+        case ~b(1000100):
+            return fpu_inst_fmove;
+            
+        case ~b(0000001): return fpu_inst_fint;
+        case ~b(0000010): return fpu_inst_fsinh;
+        case ~b(0000011): return fpu_inst_fintrz;
+        case ~b(0000110): return fpu_inst_flognp1;
+        case ~b(0001000): return fpu_inst_fetoxm1;
+        case ~b(0001001): return fpu_inst_ftanh;
+        case ~b(0001010): return fpu_inst_fatan;
+        case ~b(0001100): return fpu_inst_fasin;
+        case ~b(0001101): return fpu_inst_fatanh;
+        case ~b(0001110): return fpu_inst_fsin;
+        case ~b(0001111): return fpu_inst_ftan;
+        case ~b(0010000): return fpu_inst_fetox;
+        case ~b(0010001): return fpu_inst_ftwotox;
+        case ~b(0010010): return fpu_inst_ftentox;
+        case ~b(0010100): return fpu_inst_flogn;
+        case ~b(0010101): return fpu_inst_flog10;
+        case ~b(0010110): return fpu_inst_flog2;
+        case ~b(0011001): return fpu_inst_fcosh;
+        case ~b(0011100): return fpu_inst_facos;
+        case ~b(0011101): return fpu_inst_fcos;
+        case ~b(0011110): return fpu_inst_fgetexp;
+        case ~b(0011111): return fpu_inst_fgetman;
+        case ~b(0100001): return fpu_inst_fmod;
+        case ~b(0100100): return fpu_inst_fsgldiv;
+        case ~b(0100111): return fpu_inst_fsglmul;
+        case ~b(0100101): return fpu_inst_frem;
+        case ~b(0100110): return fpu_inst_fscale;
+        case ~b(0111000): return fpu_inst_fcmp;
+        case ~b(0111010): return fpu_inst_ftst;
+            
+        case ~b(0011000):
+        case ~b(1011000):
+        case ~b(1011100):
+            return fpu_inst_fabs;
+            
+        case ~b(0100010):
+        case ~b(1100010):
+        case ~b(1100110):
+            return fpu_inst_fadd;
+            
+        case ~b(0100000):
+        case ~b(1100000):
+        case ~b(1100100):
+            return fpu_inst_fdiv;
+            
+            
+        case ~b(0100011):
+        case ~b(1100011):
+        case ~b(1100111):
+            return fpu_inst_fmul;
+            
+        case ~b(0011010):
+        case ~b(1011010):
+        case ~b(1011110):
+            return fpu_inst_fneg;
+            
+        case ~b(0000100):
+        case ~b(1000001):
+        case ~b(1000101):
+            return fpu_inst_fsqrt;
+            
+        case ~b(0101000):
+        case ~b(1101000):
+        case ~b(1101100):
+            return fpu_inst_fsub;
+    }
+    
+    return fpu_inst_unknown; // no opmode matched
+    
+}
+// nextword(): read the 16-bit word at shoe.pc and advance pc by 2; if shoe.abort is set after the read it returns from the *calling* function instead. verify_supervisor(): unless sr_s() holds, throw a privilege violation and return from the caller.
+#define nextword() ({const uint16_t w=lget(shoe.pc,2); if (shoe.abort) {return;}; shoe.pc+=2; w;})
+#define verify_supervisor() {if (!sr_s()) {throw_privilege_violation(); return;}}
+
+void dis_fpu_decode ()
+{
+    ~decompose(dis_op, 1111 001 xxx 000000);
+    // Disassemble the FPU instruction in dis_op: look up its table row and run the row's .dis handler.
+    fpu_inst_name_t name;
+    uint16_t ext = 0;
+    // fsave/frestore are recognised from the opcode word alone, so no extension word is fetched (ext stays 0)
+    if (x == 4)
+        name = fpu_inst_fsave;
+    else if (x == 5)
+        name = fpu_inst_frestore;
+    else {
+        ext = dis_next_word();
+        name = fpu_decode_op(dis_op, ext);
+    }
+    
+    if (fpu_inst_table[name].dis) {
+        (*fpu_inst_table[name].dis)(dis_op, ext);
+        return ;
+    }
+    // No disassembly handler in the table row: print the mnemonic followed by "???"
+    sprintf(dis.str, "%s ???", fpu_inst_table[name].name);
+}
+
+void inst_fpu_decode ()
+{
+    ~decompose(shoe.op, 1111 001 xxx 000000);
+    // Execute the FPU instruction in shoe.op: look up its table row and run the row's .emu handler.
+    fpu_inst_name_t name;
+    uint16_t ext = 0;
+    // fsave/frestore are recognised from the opcode word alone: no extension word is fetched and fpiar is left untouched
+    if (x == 4)
+        name = fpu_inst_fsave;
+    else if (x == 5)
+        name = fpu_inst_frestore;
+    else {
+        ext = nextword(); // returns from this function if the fetch aborts
+        name = fpu_decode_op(shoe.op, ext);
+        // "For FPCP instructions that generate FPU exceptions,
+        //  FPIAR is loaded with the address of an instruction before it's executed,
+        //  unless all arithmetic exceptions are disabled."
+        // My take: set fpiar for all instructions except fsave, frestore, and fmovem_control
+        if (name != fpu_inst_fmovem_control)
+            shoe.fpiar = shoe.orig_pc;
+    }
+    
+    if (fpu_inst_table[name].emu) {
+        (*fpu_inst_table[name].emu)(shoe.op, ext);
+        return ;
+    }
+    // No emulation handler in the table row: log the instruction, then fail the assertion
+    slog("inst_fpu_decode: unhandled instruction: %s op=0x%04x ext = 0x%04x pc=0x%08x\n", fpu_inst_table[name].name, shoe.op, ext, shoe.orig_pc);
+    assert(!"unknown fpu inst");
+    //dbg_state.running = 0;
+    
+}
+
+
+void dis_fsave(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 100 MMMMMM);
+    ~decompose(op, 1111 001 100 mmmrrr);
+    // Write the fsave disassembly into dis.str: mode 4 prints as -(aN), every other mode goes through decode_ea_addr(). ext is not used.
+    if (m == 4)
+        sprintf(dis.str, "fsave -(a%u)", r);
+    else
+        sprintf(dis.str, "fsave %s", decode_ea_addr(M));
+}
+
+void inst_fsave(uint16_t op, uint16_t ext)
+{
+    verify_supervisor();
+    // Emulate fsave: store a frame header word at the destination. Only the 2-byte header is written; ext is not used.
+    ~decompose(op, 1111 001 100 MMMMMM);
+    ~decompose(op, 1111 001 100 mmmrrr);
+    
+    const uint32_t size = 0x1c; // IDLE frame
+    const uint16_t frame_header = 0xfd18;
+    uint32_t addr;
+    // Mode 4 (pre-decrement): the frame starts `size` bytes below aN; otherwise use the computed EA
+    if (m == 4)
+        addr = shoe.a[r] - size;
+    else {
+        call_ea_addr(M);
+        addr = shoe.dat;
+    }
+    
+    lset(addr, 2, frame_header);
+    if (shoe.abort)
+        return ;
+    // The write went through, so the pre-decrement of aN can be committed
+    if (m == 4)
+        shoe.a[r] = addr;
+}
+
+void dis_frestore(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 101 MMMMMM);
+    ~decompose(op, 1111 001 101 mmmrrr);
+    // Write the frestore disassembly into dis.str: mode 3 prints as (aN)+, every other mode goes through decode_ea_addr(). ext is not used.
+    if (m == 3)
+        sprintf(dis.str, "frestore (a%u)+", r);
+    else
+        sprintf(dis.str, "frestore %s", decode_ea_addr(M));
+}
+
+void inst_frestore(uint16_t op, uint16_t ext)
+{
+    verify_supervisor();
+    // Emulate frestore: read the first word of the state frame and classify it by size. Nothing else is read from the frame; ext is not used.
+    ~decompose(op, 1111 001 101 MMMMMM);
+    ~decompose(op, 1111 001 101 mmmrrr);
+    
+    uint32_t addr, size;
+    // Mode 3 (post-increment) reads the frame at aN; otherwise use the computed EA
+    if (m == 3)
+        addr = shoe.a[r];
+    else {
+        call_ea_addr(M);
+        addr = shoe.dat;
+    }
+    
+    const uint16_t word = lget(addr, 2);
+    if (shoe.abort) return ;
+    
+    // XXX: These frame sizes are different on 68881/68882/68040
+    if ((word & 0xff00) == 0x0000)
+        size = 4; // NULL state frame
+    else if ((word & 0xff) == 0x0018)
+        size = 0x1c; // IDLE state frame
+    else if ((word & 0xff) == 0x00b4)
+        size = 0xb8; // BUSY state frame
+    else {
+        slog("Frestore encountered an unknown state frame 0x%04x\n", word);
+        assert(!"inst_frestore: bad state frame"); // negated on purpose: a bare string literal is non-null, so without '!' this assert could never fire
+        return ;
+    }
+    // Commit the post-increment of aN by the size of the frame that was recognised
+    if (m==3) {
+        shoe.a[r] += size;
+        slog("frestore: changing shoe.a[%u] += %u\n", r, size);
+    }
+}
+
+typedef struct {
+    uint8_t inexact; // first initializer of every fmovecr_* constant; not read in this part of the file — presumably marks a constant that needs rounding (TODO confirm)
+    uint8_t dat[4][12]; // four 12-byte images of the constant; inst_fmovecr() picks one with fpcr's mc_rnd field
+} fmovecr_t;
+
+fmovecr_t fmovecr_pi = {1, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x35, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x34, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x34, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x35, };
+fmovecr_t fmovecr_log10_2 = {1, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x98, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x98, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x98, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x99, };
+fmovecr_t fmovecr_e = {1, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9a, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9a, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9a, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9b, };
+fmovecr_t fmovecr_log2_e = {1, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbc, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbb, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbb, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbc, };
+fmovecr_t fmovecr_log10_e = {0, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, };
+fmovecr_t fmovecr_zero = {0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_ln_2 = {1, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xac, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xab, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xab, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xac, };
+fmovecr_t fmovecr_ln_10 = {1, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x17, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x16, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x16, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x17, };
+fmovecr_t fmovecr_10_0 = {0, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_10_1 = {0, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_10_2 = {0, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_10_4 = {0, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_10_8 = {0, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_10_16 = {0, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_10_32 = {1, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9e, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9d, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9d, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9e, };
+fmovecr_t fmovecr_10_64 = {1, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd5, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd5, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd5, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd6, };
+fmovecr_t fmovecr_10_128 = {1, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xe0, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xdf, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xdf, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xe0, };
+fmovecr_t fmovecr_10_256 = {1, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8e, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8d, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8d, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8e, };
+fmovecr_t fmovecr_10_512 = {1, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc7, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc6, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc6, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc7, };
+fmovecr_t fmovecr_10_1024 = {1, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x17, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x17, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x17, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x18, };
+fmovecr_t fmovecr_10_2048 = {1, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe5, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe5, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe5, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe6, };
+fmovecr_t fmovecr_10_4096 = {1, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9b, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9a, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9a, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9b, };
+// Rounding-mode helpers. fesetround() is evaluated outside assert() so the mode still changes when NDEBUG compiles assert() away; both macros remain void expressions.
+const int _fpu_round_map[4] = {FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD};
+#define fpu_set_round() ((0 == fesetround(_fpu_round_map[shoe.fpcr.b.mc_rnd])) ? (void)0 : assert(!"fpu_set_round: fesetround() failed"))
+#define fpu_reset_round() ((0 == fesetround(FE_TONEAREST)) ? (void)0 : assert(!"fpu_reset_round: fesetround() failed"))
+
+static void fpu_set_cc(long double f)
+{
+    // Clear the top byte of fpsr, then derive the condition-code bits from f
+    shoe.fpsr.raw &= 0x00ffffff;
+    if (isnan(f)) {
+        shoe.fpsr.b.cc_nan = 1; // NaN: none of the other condition bits gets set
+        return ;
+    }
+    shoe.fpsr.b.cc_nan = 0;
+    shoe.fpsr.b.cc_n = signbit(f) ? 1 : 0;
+    if (isinf(f)) shoe.fpsr.b.cc_i = 1; // infinity: cc_z is left cleared
+    else shoe.fpsr.b.cc_z = (f == 0.0) ? 1 : 0;
+}
+
+static long double fpu_set_reg(long double f, uint8_t r)
+{
+    // Narrow f to the precision selected by fpcr's mc_prec field, with the
+    // rounding direction from mc_rnd in force during the conversion.
+    fpu_set_round();
+    switch (shoe.fpcr.b.mc_prec) {
+        case 1: { // round through float
+            const float narrowed = (float)f;
+            f = narrowed;
+            break;
+        }
+        case 2: { // round through double; any other mc_prec value leaves f untouched
+            const double narrowed = (double)f;
+            f = narrowed;
+            break;
+        }
+    }
+    fpu_reset_round();
+    shoe.fp[r] = f; // commit to the register file
+    return f; // hand back the value that was actually stored
+}
+
+// fpu_set_reg_cc(): store f in fp[r] via fpu_set_reg(), then set the condition codes from the value that was stored. (Author's note: fpu_set_ea() sets the condition codes too - what else should they set?)
+static void fpu_set_reg_cc(long double f, uint8_t r)
+{
+    fpu_set_cc(fpu_set_reg(f, r));
+}
+
+static void x87_to_motorola(long double x87, uint8_t motorola[12])
+{
+    // Repack the first ten bytes of the host long double into the 12-byte
+    // layout used by the caller: byte order reversed, two zero bytes after the first two.
+    const uint8_t *src = (const uint8_t*)&x87;
+    uint32_t i;
+    
+    // Output bytes 0-1 come from host bytes 9 and 8
+    motorola[0] = src[9];
+    motorola[1] = src[8];
+    // Output bytes 2-3 are always zero
+    motorola[2] = motorola[3] = 0;
+    // Output bytes 4-11 are host bytes 7 down to 0
+    for (i = 0; i < 8; i++)
+        motorola[4 + i] = src[7 - i];
+}
+
+static long double motorola_to_x87(const uint8_t motorola[12])
+{
+    // Assemble the bytes inside a union: the old cast of a 12-byte array to long double* over-read (and could be misaligned) when sizeof(long double) > 12.
+    union {
+        long double value;
+        uint8_t raw[sizeof(long double) > 12 ? sizeof(long double) : 12]; // always large enough for raw[0..9]
+    } x87 = { 0.0L }; // NOTE(review): assumes a little-endian host whose long double is the 80-bit x87 format - confirm
+    x87.raw[9] = motorola[0];
+    x87.raw[8] = motorola[1]; // motorola[2] and motorola[3] are not copied
+    x87.raw[7] = motorola[4];
+    x87.raw[6] = motorola[5];
+    x87.raw[5] = motorola[6];
+    x87.raw[4] = motorola[7];
+    x87.raw[3] = motorola[8];
+    x87.raw[2] = motorola[9];
+    x87.raw[1] = motorola[10];
+    x87.raw[0] = motorola[11];
+    return x87.value;
+}
+
+void inst_fmovecr(uint16_t op, uint16_t ext)
+{
+    ~decompose(ext, 010111 rrr xxxxxxx);
+    // Emulate fmovecr: load the constant selected by x into fp[r] and set the condition codes. op is not used.
+    fmovecr_t *c = &fmovecr_zero;
+    // Offsets not listed in the switch keep the zero constant selected above
+    switch (x) {
+        case 0x00: c = &fmovecr_pi; break;
+        case 0x0b: c = &fmovecr_log10_2; break;
+        case 0x0c: c = &fmovecr_e; break;
+        case 0x0d: c = &fmovecr_log2_e; break;
+        case 0x0e: c = &fmovecr_log10_e; break;
+        case 0x0f: c = &fmovecr_zero; break;
+        case 0x30: c = &fmovecr_ln_2; break;
+        case 0x31: c = &fmovecr_ln_10; break;
+        case 0x32: c = &fmovecr_10_0; break;
+        case 0x33: c = &fmovecr_10_1; break;
+        case 0x34: c = &fmovecr_10_2; break;
+        case 0x35: c = &fmovecr_10_4; break;
+        case 0x36: c = &fmovecr_10_8; break;
+        case 0x37: c = &fmovecr_10_16; break;
+        case 0x38: c = &fmovecr_10_32; break;
+        case 0x39: c = &fmovecr_10_64; break;
+        case 0x3a: c = &fmovecr_10_128; break;
+        case 0x3b: c = &fmovecr_10_256; break;
+        case 0x3c: c = &fmovecr_10_512; break;
+        case 0x3d: c = &fmovecr_10_1024; break;
+        case 0x3e: c = &fmovecr_10_2048; break;
+        case 0x3f: c = &fmovecr_10_4096; break;
+    }
+    
+    // The constants in the 68881's ROM must be in the "intermediate" format, because they're rounded differently based on fpcr.rnd
+    const long double f = motorola_to_x87(c->dat[shoe.fpcr.b.mc_rnd]); // one stored image per rounding mode; c->inexact is not consulted here
+    
+    fpu_set_reg_cc(f, r);
+    
+    slog("inst_fmovecr: set fp%u=%.30Lg\n", r, shoe.fp[r]);
+    
+    // fpu_finalize_exceptions();
+}
+
+void dis_fmovecr(uint16_t op, uint16_t ext)
+{
+    ~decompose(ext, 010111 rrr xxxxxxx);
+    // Write the fmovecr disassembly into dis.str: the constant offset x in hex, then the destination register. op is not used.
+    sprintf(dis.str, "fmovecr.x 0x%02x,fp%u", x, r);
+}
+// Emulate FMOVEM.L of the FP control registers: move any subset of fpcr/fpsr/fpiar (ext bits C/S/I) between the registers and the EA encoded in op.
+void inst_fmovem_control(uint16_t op, uint16_t ext)
+{
+    ~decompose(op,  1111 001 000 mmmrrr); // m = EA mode, r = EA register
+    ~decompose(op,  1111 001 000 MMMMMM); // M = the same six EA bits as a single field
+    ~decompose(ext, 10d CSI 0000000000); // d = direction (1: registers -> EA), C/S/I select fpcr/fpsr/fpiar
+    
+    const uint32_t count = C + S + I; // number of control registers selected
+    const uint32_t size = count * 4; // each selected register is transferred as one 4-byte long
+    
+    if (count == 0) // I don't know if this is even a valid instruction
+        return ;
+    
+    if ((m == 0 || m == 1) && (count > 1)) { // data and addr reg modes are valid, but only if count==1
+        throw_illegal_instruction();
+        return ;
+    }
+    
+    uint32_t addr, buf[3]; // buf stages the selected registers in fpcr, fpsr, fpiar order
+    uint32_t i;
+    
+    if (d) { // reg to memory
+        i=0;
+        if (C) buf[i++] = shoe.fpcr.raw;
+        if (S) buf[i++] = shoe.fpsr.raw;
+        if (I) buf[i++] = shoe.fpiar;
+
+        if (m == 0) { // data register destination (count is 1 here, so buf[0] is the only value)
+            shoe.d[r] = buf[0];
+            return ;
+        }
+        else if (m == 1) { // address register destination
+            shoe.a[r] = buf[0];
+            return ;
+        }
+        else if (m == 3) // post-increment: write at a[r]; a[r] itself is only updated at the end
+            addr = shoe.a[r];
+        else if (m == 4) // pre-decrement: write at a[r]-size; a[r] itself is only updated at the end
+            addr = shoe.a[r] - size;
+        else {
+            call_ea_addr(M); // NOTE(review): presumably leaves the computed address in shoe.dat (read on the next line) - confirm
+            addr = shoe.dat;
+        }
+        
+        for (i=0; i<count; i++) {
+            lset(addr + (i*4), 4, buf[i]);
+            if (shoe.abort) // bail out before any address-register update is committed
+                return ;
+        }
+    }
+    else { // mem to reg
+        if (m == 0) // data reg
+            buf[0] = shoe.d[r];
+        else if (m == 1) // addr reg
+            buf[0] = shoe.a[r];
+        else {
+            if (m == 3) // post-increment
+                addr = shoe.a[r];
+            else if (m == 4) // pre-decrement
+                addr = shoe.a[r] - size;
+            else if (M == 0x3c) // immediate
+                addr = shoe.pc;
+            else {
+                call_ea_addr(M); // call_ea_addr() should work for all other modes
+                addr = shoe.dat;
+            }
+            
+            for (i=0; i<count; i++) {
+                buf[i] = lget(addr + (i*4), 4);
+                if (shoe.abort) // nothing has been written to the control registers yet
+                    return ;
+            }
+        }
+        
+        i = 0; // consume buf in the same fpcr, fpsr, fpiar order used when storing
+        
+        if (C) {
+            uint8_t round = shoe.fpcr.b.mc_rnd; // rounding mode before the write, kept only for the log below
+            shoe.fpcr.raw = buf[i++];
+            uint8_t newround = shoe.fpcr.b.mc_rnd;
+            
+            if (round != newround) {
+                slog("inst_fmovem_control: HEY: round %u -> %u\n", round, newround);
+            }
+        }
+        if (S) shoe.fpsr.raw = buf[i++];
+        if (I) shoe.fpiar = buf[i++];
+        
+        // Commit immediate-EA-mode PC change
+        if (M == 0x3c)
+            shoe.pc += size;
+    }
+        
+    // Commit pre/post-inc/decrement
+    
+    if (m == 3)
+        shoe.a[r] += size;
+    if (m == 4)
+        shoe.a[r] -= size;
+    
+    // NOTE(review): the message below is tagged "inst_fmove_control" although this function is inst_fmovem_control
+    
+    slog("inst_fmove_control: notice: (EA = %u/%u %08x CSI = %u%u%u)\n", m, r, (uint32_t)shoe.dat, C, S, I);
+}
+
+// Disassemble FMOVEM.L to/from the FP control registers into dis.str.
+//   op  - first instruction word; its low 6 bits are the EA field
+//   ext - extension word: d = direction (1: registers -> EA),
+//         C/S/I = fpcr/fpsr/fpiar select bits
+// The selected registers are printed as the bracketed bit triple [CSI].
+// No trailing newline is written, matching the other dis_* routines in this
+// file (dis_fmovecr, dis_fbcc, dis_fnop).
+void dis_fmovem_control(uint16_t op, uint16_t ext)
+{
+    ~decompose(op,  1111 001 000 MMMMMM);
+    ~decompose(ext, 10d CSI 0000000000);
+    
+    if (d)
+        sprintf(dis.str, "fmovem.l [%u%u%u],%s", C, S, I, decode_ea_addr(M)); // <- XXX: decode_ea_addr() is the wrong function to use
+    else
+        sprintf(dis.str, "fmovem.l %s,[%u%u%u]", decode_ea_addr(M), C, S, I); // <- XXX: decode_ea_addr() is the wrong function to use
+}
+
+
+// Evaluate an FPU conditional predicate against the FPSR condition-code bits
+// (zero, negative, NaN). Only the low four bits of `cc` are examined.
+// Returns non-zero when the predicate holds, zero otherwise.
+static uint8_t fpu_test_cc(uint8_t cc)
+{
+    const uint8_t is_zero = shoe.fpsr.b.cc_z;
+    const uint8_t is_neg = shoe.fpsr.b.cc_n;
+    const uint8_t is_nan = shoe.fpsr.b.cc_nan;
+    const uint8_t predicate = cc & 0x0f;
+
+    if (predicate == 0)  // false
+        return 0;
+    if (predicate == 1)  // equal
+        return is_zero;
+    if (predicate == 2)  // greater than
+        return !is_nan && !is_zero && !is_neg;
+    if (predicate == 3)  // greater than or equal
+        return is_zero || (!is_nan && !is_neg);
+    if (predicate == 4)  // less than
+        return is_neg && !is_nan && !is_zero;
+    if (predicate == 5)  // less than or equal
+        return is_zero || (is_neg && !is_nan);
+    if (predicate == 6)  // greater or less than
+        return !is_nan && !is_zero;
+    if (predicate == 7)  // ordered
+        return !is_nan;
+    if (predicate == 8)  // unordered
+        return is_nan;
+    if (predicate == 9)  // not (greater or less than)
+        return is_nan || is_zero;
+    if (predicate == 10) // not (less than or equal)
+        return is_nan || (!is_neg && !is_zero);
+    if (predicate == 11) // not (less than)
+        return is_nan || is_zero || !is_neg;
+    if (predicate == 12) // not (greater than or equal)
+        return is_nan || (is_neg && !is_zero);
+    if (predicate == 13) // not (greater than)
+        return is_nan || is_zero || is_neg;
+    if (predicate == 14) // not equal
+        return !is_zero;
+    if (predicate == 15) // true
+        return 1;
+
+    // A 4-bit value always matches one of the tests above
+    assert(0);
+    return 0;
+}
+
+// Emulate FBcc: branch when the FPU predicate in the opcode holds.
+//   op  - first instruction word: s selects a 32-bit displacement,
+//         b is the predicate bit that asks for BSUN on NaN, cccc is the predicate
+//   ext - the 16-bit displacement, or the high word of a 32-bit displacement
+// The branch target is computed relative to shoe.orig_pc + 2.
+void inst_fbcc(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 01 s 0bcccc); // b => raise BSUN if NaN
+    
+    uint32_t displacement;
+    if (s) {
+        // 32-bit displacement: ext is the high word, the next word is the low word.
+        // Widen before shifting: a uint16_t promotes to signed int, and shifting a
+        // value >= 0x8000 left by 16 overflows it (undefined behaviour).
+        const uint16_t ext2 = nextword();
+        displacement = ((uint32_t)ext << 16) | ext2;
+    }
+    else {
+        // 16-bit displacement: sign-extend to 32 bits
+        const int16_t tmp = ext;
+        const int32_t tmp2 = tmp;
+        displacement = tmp2;
+    }
+    
+    if (b) {
+        slog("inst_fbcc: fixme: Got a CC that wants to set BSUN, not implemented\n");
+        //assert(0); // FIXME: implement BSUN, or uncomment this
+    }
+    
+    if (fpu_test_cc(c)) {
+        const uint32_t addr = shoe.orig_pc + 2 + displacement;
+        shoe.pc = addr;
+    }
+}
+
+const char *fpu_cc_names[32] = { // predicate mnemonics, indexed by the 5-bit condition field (dis_fbcc appends one to "fb")
+    "f", "eq", "ogt", "oge", "olt", "ole", "ogl", "or", // 0-7
+    "un", "ueq", "ugt", "uge", "ult", "ule", "ne", "t", // 8-15
+    "sf", "seq", "gt", "ge", "lt", "le", "gl", "gle", // 16-23
+    "ngle", "ngl", "nle", "nlt", "nge", "ngt", "sne", "st" // 24-31
+};
+
+// Disassemble FBcc into dis.str as "fb<cc>.<w|l> *0x<target>".
+//   op  - first instruction word: s selects a 32-bit displacement,
+//         ccccc is the 5-bit predicate used to index fpu_cc_names
+//   ext - the 16-bit displacement, or the high word of a 32-bit displacement
+// The target is computed relative to dis.orig_pc + 2.
+void dis_fbcc(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 01 s 0ccccc); // only the low 5 bits of cc are significant
+    
+    uint32_t displacement;
+    if (s) {
+        // Widen before shifting: a uint16_t promotes to signed int, and shifting a
+        // value >= 0x8000 left by 16 overflows it (undefined behaviour).
+        const uint16_t ext2 = dis_next_word();
+        displacement = ((uint32_t)ext << 16) | ext2;
+    }
+    else {
+        // 16-bit displacement: sign-extend to 32 bits
+        const int16_t tmp = ext;
+        const int32_t tmp2 = tmp;
+        displacement = tmp2;
+    }
+    
+    const uint32_t addr = dis.orig_pc + 2 + displacement;
+    
+    sprintf(dis.str, "fb%s.%c *0x%08x", fpu_cc_names[c], "wl"[s], addr);
+}
+
+// Reverse the first `size` bytes of `buf` in place.
+// Buffers of zero or one byte are left untouched.
+static void reverse_order(uint8_t *buf, const uint32_t size)
+{
+    if (size < 2)
+        return ;
+
+    // Walk two cursors toward each other, swapping as they go
+    uint8_t *front = buf;
+    uint8_t *back = buf + (size - 1);
+    while (front < back) {
+        const uint8_t held = *front;
+        *front = *back;
+        *back = held;
+        front++;
+        back--;
+    }
+}
+// Emulate FMOVEM for the data registers fp0-fp7: move the registers selected by a static or dynamic mask to/from memory, 12 bytes per register.
+void inst_fmovem(uint16_t op, uint16_t ext)
+{
+    ~decompose(op,  1111 001 000 mmmrrr);
+    ~decompose(op,  1111 001 000 MMMMMM);
+    ~decompose(ext, 11 d ps 000 LLLLLLLL); // Static register mask
+    ~decompose(ext, 11 0 00 000 0yyy0000); // Register for dynamic mode
+    
+    const uint8_t pre_mask = s ? shoe.d[y] : L; // pre-adjusted mask
+    
+    // Count the number of bits in the mask
+    uint32_t count, maskcpy = pre_mask;
+    for (count=0; maskcpy; maskcpy >>= 1)
+        count += (maskcpy & 1);
+    
+    const uint32_t size = count * 12; // total bytes transferred
+    
+    // for predecrement mode, the mask is reversed
+    uint8_t mask = 0;
+    if (m == 4) {
+        uint32_t i;
+        for (i=0; i < 8; i++) { // bit-reverse pre_mask: its bit 7 ends up in bit 0 of mask and vice versa
+            const uint8_t bit = (pre_mask << i) & 0x80;
+            mask = (mask >> 1) | bit;
+        }
+    }
+    else
+        mask = pre_mask;
+    
+    uint32_t i, addr;
+    
+    // Find the EA
+    if (m == 3) {
+        addr = shoe.a[r];
+        assert(p); // assert post-increment mask
+    }
+    else if (m == 4) {
+        addr = shoe.a[r] - size; // transfer starts at the decremented address; a[r] is only updated at the end
+        assert(!p); // assert pre-decrement mask
+    }
+    else {
+        call_ea_addr(M); // NOTE(review): presumably leaves the computed address in shoe.dat (read on the next line) - confirm
+        addr = shoe.dat;
+        assert(p); // assert post-increment mask
+    }
+    
+    slog("inst_fmovem: pre=%08x mask=%08x EA=%u/%u addr=0x%08x size=%u %s\n", pre_mask, mask, m, r, addr, size, d?"to mem":"from mem");
+    
+    if (d) {
+        // Write those registers
+        for (i=0; i<8; i++) {
+            if (!(mask & (0x80 >> i))) // bit 7 of mask selects fp0, bit 0 selects fp7
+                continue;
+            
+            uint8_t buf[12];
+            x87_to_motorola(shoe.fp[i], buf);
+            
+            slog("inst_fmovem: writing %Lf from fp%u", shoe.fp[i], i);
+            uint32_t j;
+            for (j=0; j<12; j++) { // store byte by byte at increasing addresses
+                slog(" %02x", buf[j]);
+                lset(addr, 1, buf[j]);
+                addr++;
+                if (shoe.abort) // leave a[r] untouched on a failed access
+                    return ;
+            }
+            slog("\n");
+        }
+    }
+    else {
+        // Read those registers
+        for (i=0; i<8; i++) {
+            if (!(mask & (0x80 >> i)))
+                continue;
+            
+            uint8_t buf[12];
+            uint32_t j;
+            for (j=0; j<12; j++) { // fetch byte by byte at increasing addresses
+                buf[j] = lget(addr, 1);
+                addr++;
+                if (shoe.abort) // leave a[r] untouched on a failed access
+                    return ;
+            }
+            shoe.fp[i] = motorola_to_x87(buf);
+            
+            slog("inst_fmovem: read %Lf to fp%u\n", shoe.fp[i], i);
+        }
+    }
+    
+    // Commit the write for pre/post-inc/decrement
+    if (m == 3)
+        shoe.a[r] += size;
+    else if (m == 4)
+        shoe.a[r] -= size;
+    
+    //slog("inst_fmovem: notice: not implemented (EA = %u/%u, mask=0x%02x)\n", m, r, mask);
+    
+}
+
+// Disassembler stub for FMOVEM (data registers): the instruction fields are
+// decoded but not rendered yet; a fixed placeholder is written to dis.str.
+void dis_fmovem(uint16_t op, uint16_t ext)
+{
+    ~decompose(op,  1111 001 000 mmmrrr);
+    ~decompose(op,  1111 001 000 MMMMMM);
+    ~decompose(ext, 11 d ps 000 LLLLLLLL); // Static register mask
+    ~decompose(ext, 11 0 00 000 0yyy0000); // Register for dynamic mode
+
+    sprintf(dis.str, "%s", "fmovem ???");
+}
+
+enum { // operand format codes; the values index the sizes[] tables used by fpu_read_ea()/fpu_write_ea()
+    format_L = 0, // 4 bytes
+    format_S = 1, // 4 bytes
+    format_X = 2, // 12 bytes
+    format_Ps = 3, // 12 bytes
+    format_W = 4, // 2 bytes
+    format_D = 5, // 8 bytes
+    format_B = 6, // 1 byte
+    format_Pd = 7 // 12 bytes
+} fpu_formats; // NOTE(review): this declares a variable named fpu_formats of an unnamed enum type, not a type name - confirm that is intended
+/*
+ * 0 L     long word integer
+ * 1 S     single precision real
+ * 2 X     extended precision real
+ * 3 P{#k} packed decimal real with static k factor
+ * 4 W     word integer
+ * 5 D     double precision real
+ * 6 B     byte integer
+ * 7 P{Dn} packed decimal real with dynamic k factor
+ */
+
+// Apply the address-register update of an (An)+ or -(An) source operand.
+//   mr     - 6-bit EA field (mode in bits 5-3, register in bits 2-0)
+//   format - operand format code, used to look up the operand size in bytes
+// Every other addressing mode leaves the registers alone.
+static void fpu_read_ea_commit(uint8_t mr, uint8_t format)
+{
+    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+    const uint8_t mode = mr >> 3;
+    const uint8_t reg = mr & 7;
+
+    switch (mode) {
+        case 3: // post-increment
+            shoe.a[reg] += sizes[format];
+            break;
+        case 4: // pre-decrement
+            shoe.a[reg] -= sizes[format];
+            break;
+        default:
+            break;
+    }
+}
+
+// Read a source operand of the given format from the effective address `mr`
+// and return it widened to long double.
+//   mr     - 6-bit EA field (mode in bits 5-3, register in bits 2-0)
+//   format - operand format code (format_L, format_S, ...); packed decimal is
+//            not supported and trips an assert
+// Returns 0.0 after raising an illegal-instruction exception (address register
+// direct, or data register direct with an operand wider than 4 bytes) and when
+// a memory access aborts (shoe.abort is set in that case).
+// Note: fpu_read_ea modifies shoe.pc (immediate operands); fpu_read_ea_commit
+// modifies shoe.a[r] for pre/post-inc/decrement.
+// Memory operands are fetched byte-reversed and then read as host values, so
+// this assumes a little-endian host.
+static long double fpu_read_ea(uint8_t mr, uint8_t format)
+{
+    const uint8_t m = mr >> 3;
+    const uint8_t r = mr & 7;
+    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+    
+    long double data = 0.0, result;
+    uint32_t addr;
+    
+    // If mode==a-reg, or mode==data reg and the size is > 4 bytes, no dice
+    if ((m == 1) ||
+        ((m == 0) && (sizes[format] > 4))) {
+        throw_illegal_instruction();
+        return 0.0;
+    }
+    
+    switch (m) {
+        case 0: {
+            if (format == format_S) {
+                // Dn holds the raw IEEE single-precision bit pattern: reinterpret
+                // the bits (as fpu_write_ea does when storing), don't convert the
+                // integer value numerically.
+                union { uint32_t bits; float value; } pun;
+                pun.bits = shoe.d[r];
+                data = pun.value;
+            }
+            else if (format == format_B) {
+                int8_t tmp = shoe.d[r];
+                data = tmp;
+            }
+            else if (format == format_W) {
+                int16_t tmp = shoe.d[r];
+                data = tmp;
+            }
+            else if (format == format_L) {
+                int32_t tmp = shoe.d[r];
+                data = tmp;
+            }
+            
+            goto got_data;
+        }
+            
+        case 3:
+            addr = shoe.a[r];
+            assert(!( r==7 && sizes[format]==1));
+            goto got_address;
+            
+        case 4:
+            addr = shoe.a[r] - sizes[format];
+            assert(!( r==7 && sizes[format]==1));
+            goto got_address;
+            
+        case 7:
+            if (r == 4) {
+                // Immediate operand, stored in the instruction stream. A byte
+                // immediate occupies the low byte of a full extension word, so
+                // read it from pc+1 and keep the PC word-aligned.
+                if (sizes[format] == 1) {
+                    addr = shoe.pc + 1;
+                    shoe.pc += 2;
+                }
+                else {
+                    addr = shoe.pc;
+                    shoe.pc += sizes[format];
+                }
+                goto got_address;
+            }
+            
+            // fall through to default:
+            
+        default: {
+            
+            shoe.mr=mr;
+            ea_addr();
+            if (shoe.abort)
+                return 0.0;
+            
+            addr = (uint32_t)shoe.dat;
+            goto got_address;
+        }
+    }
+    
+got_address:
+    
+    {
+        // Staging buffer. A union (instead of casting a uint8_t pointer) keeps
+        // the typed loads below aligned and free of strict-aliasing violations.
+        union {
+            uint8_t bytes[12];
+            int16_t i16;
+            int32_t i32;
+            float f32;
+            double f64;
+        } raw;
+        uint8_t *ptr = &raw.bytes[sizes[format]];
+        uint32_t i;
+        
+        // Fill the buffer back to front, which byte-reverses the operand;
+        // ptr ends up at &raw.bytes[0].
+        slog("inst_f fpu_read_ea: format=%u, data =", format);
+        for (i=0; i<sizes[format]; i++) {
+            ptr--;
+            *ptr = lget(addr+i, 1);
+            slog(" %02x", *ptr);
+            if (shoe.abort)
+                return 0.0;
+        }
+        
+        switch (format) {
+            case format_B: {
+                int8_t tmp = ptr[0];
+                data = tmp;
+                break;
+            }
+            case format_W: {
+                data = raw.i16;
+                break;
+            }
+            case format_L: {
+                data = raw.i32;
+                break;
+            }
+            case format_S: {
+                data = raw.f32;
+                break;
+            }
+            case format_D: {
+                data = raw.f64;
+                break;
+            }
+            case format_X: {
+                // motorola_to_x87() is handed the bytes in their original memory order
+                reverse_order(ptr, 12);
+                data = motorola_to_x87(ptr);
+                break;
+            }
+            default: {
+                assert(!"unsupported format (packed something)");
+            }
+        }
+    }
+    
+got_data:
+    
+    fpu_set_round();
+    result = data;
+    fpu_reset_round();
+    slog(" data=%Lf result=%Lf\n", data, result);
+    return result;
+}
+
+
+// Convert `orig_data` to the requested operand format, store it at the
+// effective address `mr`, set the FPSR condition codes from the value and
+// apply any (An)+ / -(An) address-register update.
+//   mr        - 6-bit EA field (mode in bits 5-3, register in bits 2-0)
+//   format    - operand format code (format_L, format_S, ...); packed decimal
+//               is not supported and trips an assert
+//   orig_data - value to store
+// Raises an illegal-instruction exception for address register direct, and for
+// data register direct with an operand wider than 4 bytes. If a memory access
+// aborts, the function returns before touching the condition codes or a[r].
+// Integer and single/double values are produced in host order and then
+// byte-reversed, so this assumes a little-endian host.
+// NOTE(review): whether a register-to-memory FMOVE should modify the condition
+// codes at all should be checked against the 68881 documentation; the existing
+// behaviour is kept here.
+static void fpu_write_ea(uint8_t mr, uint8_t format, long double orig_data)
+{
+    fpu_set_round();
+    const long double data = orig_data;
+    fpu_reset_round();
+    
+    const uint8_t m = mr >> 3;
+    const uint8_t r = mr & 7;
+    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+    // Staging buffer. A union (instead of casting a uint8_t pointer) keeps the
+    // typed stores below aligned and free of strict-aliasing violations.
+    union {
+        uint8_t bytes[12];
+        int16_t i16;
+        int32_t i32;
+        float f32;
+        double f64;
+    } raw;
+    uint32_t addr, i;
+    
+    // If mode==a-reg, or mode==data reg and the size is > 4 bytes, no dice
+    if ((m == 1) ||
+        ((m == 0) && (sizes[format] > 4))) {
+        throw_illegal_instruction();
+        return ;
+    }
+    
+    slog("inst_f fpu_write_ea EA=%u/%u data=%Lf format=%u\n", m, r, data, format);
+    
+    // Convert to the appropriate format
+    
+    switch (format) {
+        case format_B: {
+            int8_t tmp = data;
+            raw.bytes[0] = (uint8_t)tmp;
+            goto write_to_mem; // a single byte needs no swapping
+        }
+        case format_W: {
+            int16_t tmp = data;
+            raw.i16 = tmp;
+            slog("inst_f fpu_write_ea formatted=%u (0x%04x)\n", raw.i16, (uint16_t)raw.i16);
+            break;
+        }
+        case format_L: {
+            int32_t tmp = data;
+            raw.i32 = tmp;
+            break;
+        }
+        case format_S: {
+            float tmp = data;
+            raw.f32 = tmp;
+            break;
+        }
+        case format_D: {
+            double tmp = data;
+            raw.f64 = tmp;
+            break;
+        }
+        case format_X: {
+            x87_to_motorola(data, raw.bytes);
+            goto write_to_mem; // ptr is already big endian
+        }
+        default: {
+            assert(!"unsupported format (packed something)");
+        }
+    }
+
+    // Host-order value -> big-endian byte sequence
+    reverse_order(raw.bytes, sizes[format]);
+    
+    
+write_to_mem:
+    // Lookup the EA
+
+    switch (m) {
+        case 0: {
+            if (format == format_B) {
+                int8_t tmp = data;
+                set_d(r, tmp, 1);
+            }
+            else if (format == format_W) {
+                int16_t tmp = data;
+                set_d(r, tmp, 2);
+            }
+            else if (format == format_L) {
+                int32_t tmp = data;
+                shoe.d[r] = tmp;
+            }
+            else if (format == format_S) {
+                // Dn receives the raw IEEE single-precision bit pattern
+                union { uint32_t bits; float value; } pun;
+                pun.value = (float)data;
+                shoe.d[r] = pun.bits;
+            }
+            
+            goto done;
+        }
+        case 3: // post-increment
+            addr = shoe.a[r];
+            assert(!( r==7 && sizes[format]==1));
+            break;
+        case 4: // pre-decrement
+            addr = shoe.a[r] - sizes[format];
+            assert(!( r==7 && sizes[format]==1));
+            break;
+        default:
+            call_ea_addr(mr);
+            addr = (uint32_t)shoe.dat;
+            break;
+    }
+    
+    // Copy the formatted data into the EA
+    slog("inst_f fpu_write_ea: addr=0x%08x\n", addr);
+    for (i=0; i < sizes[format]; i++) {
+        lset(addr + i, 1, raw.bytes[i]);
+        if (shoe.abort)
+            return ;
+    }
+
+done: // set condition codes and update pre/post-inc/decrement registers
+    
+    // Set condition codes (the top byte of fpsr is cleared first)
+    shoe.fpsr.raw &= 0x00ffffff;
+    shoe.fpsr.b.cc_nan = (0 != isnan(data));
+    if (!shoe.fpsr.b.cc_nan) {
+        shoe.fpsr.b.cc_n = (0 != signbit(data));
+        if (isinf(data))
+            shoe.fpsr.b.cc_i = 1;
+        else
+            shoe.fpsr.b.cc_z = (data == 0.0);
+    }
+    
+    if (m == 3)
+        shoe.a[r] += sizes[format];
+    else if (m == 4)
+        shoe.a[r] -= sizes[format];
+}
+
+// Emulate FMOVE in its three forms, selected by the E and V bits of ext:
+//   E==0        register -> register : fp[a] is copied to fp[z]
+//   E==1, V==0  memory   -> register : the EA operand (format a) is loaded into fp[z]
+//   E==1, V==1  register -> memory   : fp[z] is stored to the EA in format a
+//   op  - first instruction word; its low 6 bits are the EA field
+//   ext - extension word (E, V, source specifier a, register z, K field)
+// Loads into a register go through fpu_set_reg_cc(); stores go through
+// fpu_write_ea(). Returns early, without further side effects, if a memory
+// access aborts.
+void inst_fmove(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 000 MMMMMM);
+    ~decompose(op, 1111 001 000 mmmrrr);
+    ~decompose(ext, 0 E V aaa zzz KKKKKKK);
+    
+    // Only meaningful as an operand format when E==1; for register -> register
+    // moves `a` is the source register number instead.
+    const uint8_t format = a;
+    
+    if (K == ~b(1000100) || K == ~b(1000000)) {
+        assert(!"inst_fmove: This is either a K-value, or somebody called fmove and specified the secret precision bits");
+    }
+    
+    const uint8_t mem_to_reg = (E && !V);
+    const uint8_t reg_to_mem = (E && V);
+    
+    // Load the source value into 'data'
+    
+    long double data;
+    
+    if (mem_to_reg) {
+        data = fpu_read_ea(M, format);
+        if (shoe.abort)
+            return ;
+    }
+    else if (reg_to_mem) {
+        data = shoe.fp[z];
+    }
+    else { // E==0: register source
+        data = shoe.fp[a];
+    }
+    
+    
+    // XXX: Check for exceptions?
+    
+    // Write the result
+    
+    if (reg_to_mem) {
+        fpu_write_ea(M, format, data);
+        if (shoe.abort)
+            return ;
+    }
+    else {
+        fpu_set_reg_cc(data, z);
+        
+        // Only a real memory read has an (An)+ / -(An) side effect to commit.
+        // For register -> register moves the EA bits are not an operand and
+        // `format` is a register number, so committing would be wrong.
+        if (mem_to_reg)
+            fpu_read_ea_commit(M, format);
+    }
+    
+    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+    slog("inst_fmove src=%Lf size=%u a=%u z=%u to-mem=%u useEA=%u EA = %u/%u\n", data, sizes[format], a, z, V, E, m, r);
+}
+
+// Disassemble FNOP: writes the fixed mnemonic to dis.str; op and ext are unused.
+void dis_fnop(uint16_t op, uint16_t ext)
+{
+    sprintf(dis.str, "%s", "fnop");
+}
+
+void inst_fnop(uint16_t op, uint16_t ext) // FNOP emulation handler
+{ // empty body: op and ext are ignored and no state is modified here
+}
+
+// Disassembler stub for FMOVE: the instruction fields are decoded but not
+// rendered yet; a fixed placeholder is written to dis.str.
+void dis_fmove(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 000 MMMMMM);
+    ~decompose(op, 1111 001 000 mmmrrr);
+    ~decompose(ext, 0 E V aaa bbb KKKKKKK);
+
+    // Field meaning (not used for output yet):
+    //   E clear : register to register
+    //   E set   : memory to register, or register to memory
+    //   V clear : destination is a register (reg->reg or mem->reg)
+    //   V set   : destination is memory (reg->mem)
+
+    sprintf(dis.str, "%s", "fmove ???");
+}
+
+// Disassembler stub shared by the arithmetic FPU instructions: writes a fixed
+// placeholder to dis.str; op and ext are unused.
+void dis_fmath(uint16_t op, uint16_t ext)
+{
+    sprintf(dis.str, "%s", "fmath ??");
+}
+
+// Fill in the FPSR quotient fields after a remainder-style operation.
+//   a      - dividend
+//   b      - divisor
+//   result - the remainder that was computed for a / b
+// The integer quotient is recovered as (a - result) / b. Its low 7 bits go to
+// fpsr.qu_quotient and its sign to fpsr.qu_s. A non-finite quotient (for
+// example b == 0) stores 0 in the quotient bits.
+static void fpu_set_fpsr_quotient(long double a, long double b, long double result)
+{
+    // Thanks for being super vague on the meaning of this register, 68881 documentation
+    
+    // (a - result) / b is an integer mathematically; round to nearest so that a
+    // last-bit error in the division cannot knock it down to the next integer.
+    const long double quo = roundl((a - result) / b);
+    
+    // Take the low 7 bits without converting the whole value to an integer
+    // type, which would be undefined for magnitudes that don't fit.
+    uint8_t low_bits = 0;
+    if (isfinite(quo))
+        low_bits = (uint8_t)fmodl(fabsl(quo), 128.0L);
+    
+    shoe.fpsr.b.qu_quotient = low_bits & 0x7f;
+    // signbit() only promises "non-zero" for negative values; normalise to 0/1
+    // before storing so the flag cannot be lost to truncation.
+    shoe.fpsr.b.qu_s = (0 != signbit(quo));
+}
+
+// Emulate the arithmetic FPU instructions that share one encoding: fetch the
+// source operand, apply the operation selected by the 7-bit extension field
+// against the destination register, and write the result back.
+//   op  - first instruction word; its low 6 bits are the EA used when the
+//         source operand is not an FP register
+//   ext - extension word: a = source is in the EA, sss = source specifier
+//         (operand format or FP register), ddd = destination FP register,
+//         eeeeeee = operation
+// Results are stored with fpu_set_reg_cc(); fcmp and ftst only call
+// fpu_set_cc() and leave the destination register alone. Operations that are
+// not implemented stop the emulator with an assert. Returns early if reading
+// the source operand aborts.
+void inst_fmath(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 000 MMMMMM);
+    ~decompose(ext, 0 a 0 sss ddd eeeeeee);
+    
+    const uint8_t src_in_ea = a;
+    const uint8_t source_specifier = s;
+    const uint8_t dest_register = d;
+    const uint8_t extension = e;
+    
+    uint8_t do_write_back_result = 1;
+    
+    long double source, dest, result;
+    
+    if (src_in_ea) {
+        source = fpu_read_ea(M, source_specifier);
+        slog("inst_fmath: source = %u/%u = %Lf", M>>3, M&7, source);
+        if ((M>>3) == 3)
+            slog(" a[%u]=0x%08x", M&7, shoe.a[M&7]);
+        
+        if (shoe.abort)
+            return ;
+    }
+    else {
+        source = shoe.fp[source_specifier];
+        slog("inst_fmath: source = fp%u = %Lf", source_specifier, source);
+    }
+    
+    dest = shoe.fp[dest_register];
+    slog("  dest = fp%u = %Lf\n", dest_register, dest);
+    
+    switch (extension) {
+        case ~b(0000001): {// fpu_inst_fint
+            const uint8_t dir = shoe.fpcr.b.mc_rnd;
+            
+            // {FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD};
+            
+            if (dir == 0) {
+                result = roundl(source);
+                // roundl() sends exact halves away from zero, but round-to-nearest
+                // must send them to the even neighbour (2.5 -> 2, 3.5 -> 4).
+                if (fabsl(source - truncl(source)) == 0.5L)
+                    result = 2.0L * roundl(source / 2.0L);
+            }
+            else if (dir == 1)
+                result = truncl(source);
+            else if (dir == 2)
+                result = floorl(source);
+            else
+                result = ceill(source);
+            
+            slog("inst_fint: source = %Lf result = %Lf round=%u\n", source, result, dir);
+            
+            break;
+        }
+        case ~b(0000010): assert(!"fpu_inst_fsinh;");
+        case ~b(0000011): // fpu_inst_fintrz
+            slog("inst_fintrz dest = %Lf source = %Lf\n", dest, source);
+            result = truncl(source);
+            break;
+            
+        case ~b(0000110): // flognp1
+            slog("inst_flognp1 dest = %Lf source = %Lf\n", dest, source);
+            assert(source > -1.0);
+            result = log1pl(source);
+            break;
+        case ~b(0001000): assert(!"fpu_inst_fetoxm1;");
+        case ~b(0001001): assert(!"fpu_inst_ftanh;");
+        case ~b(0001010): // fatan
+            slog("inst_fatan dest = %Lf source = %Lf\n", dest, source);
+            result = atanl(source);
+            break;
+            
+        case ~b(0001100): assert(!"fpu_inst_fasin;");
+        case ~b(0001101): assert(!"fpu_inst_fatanh;");
+        case ~b(0001110): // fsin
+            slog("inst_fsin dest = %Lf source = %Lf\n", dest, source);
+            result = sinl(source);
+            break;
+        case ~b(0001111): assert(!"fpu_inst_ftan;");
+        case ~b(0010000): // fetox
+            slog("inst_fetox dest = %Lf source = %Lf\n", dest, source);
+            result = expl(source);
+            break;
+        case ~b(0010001): assert(!"fpu_inst_ftwotox;");
+        case ~b(0010010): assert(!"fpu_inst_ftentox;");
+        case ~b(0010100): assert(!"fpu_inst_flogn;");
+        case ~b(0010101): assert(!"fpu_inst_flog10;");
+        case ~b(0010110): assert(!"fpu_inst_flog2;");
+        case ~b(0011001): assert(!"fpu_inst_fcosh;");
+        case ~b(0011100): assert(!"fpu_inst_facos;");
+        case ~b(0011101): // fcos
+            slog("fpu_inst_fcos dest = %Lf source = %Lf\n", dest, source);
+            result = cosl(source);
+            break;
+            
+        case ~b(0011110): {// fpu_inst_fgetexp
+            if (!((source > 0) || (source < 0)))
+                result = source; // positive or negative zero
+            else if (!isfinite(source)) {
+                assert(!"fgetexp: isinfl(source)");
+                // returns NAN and an exception bit - not implemented for the moment
+            }
+            else {
+                // Extract the debiased exponent from the 80-bit source
+                uint8_t motorola[12];
+                x87_to_motorola(source, motorola);
+                int32_t exp = (motorola[0] & 0x7f) << 8;
+                exp |= motorola[1];
+                exp -= 16383; // debias
+                result = exp;
+            }
+            break;
+        }
+        case ~b(0011111): assert(!"fpu_inst_fgetman;");
+        case ~b(0100001):
+            // don't forget to set fpu_set_fpsr_quotient();
+            assert(!"fpu_inst_fmod;");
+        
+        case ~b(0100100): assert(!"fpu_inst_fsgldiv");
+            
+        case ~b(0100101): { // fpu_inst_frem
+            assert(source != 0.0);
+            result = remainderl(dest, source);
+            fpu_set_fpsr_quotient(dest, source, result);
+            slog("inst_frem: dest = %Lf source = %Lf quot = %u result = %Lf\n", dest, source, shoe.fpsr.b.qu_quotient, result);
+            break;
+        }
+        case ~b(0100110): assert(!"fpu_inst_fscale;");
+            
+        case ~b(0111000): { // fpu_inst_fcmp
+            const long double diff = dest - source;
+            slog("inst_fcmp: dest = %Lf source = %Lf\n", dest, source);
+            fpu_set_cc(diff);
+            do_write_back_result = 0; // don't write result back to register
+            break;
+        }
+        case ~b(0111010): { // fpu_inst_ftst
+            // The operand being tested is the source; pass it so the %Lf in the
+            // format has a matching argument.
+            slog("fpu_inst_ftst: source = %Lf\n", source);
+            fpu_set_cc(source);
+            do_write_back_result = 0; // don't write result back to register
+            break;
+        }
+            
+        case ~b(1011100):
+        case ~b(1011000):
+            assert(!"inst_fabs: can't handle");
+        case ~b(0011000):// fpu_inst_fabs
+            result = fabsl(source);
+            slog("inst_fabs: source=%Lf result=%Lf\n", source, result);
+            break;
+            
+        case ~b(1100010):
+        case ~b(1100110):
+            assert(!"can't handle");
+        case ~b(0100010): { // fpu_inst_fadd
+            slog("inst_fadd dest = %Lf source = %Lf\n", dest, source);
+            result = dest + source;
+            break;
+        }
+            
+        case ~b(1100000):
+        case ~b(1100100):
+            assert(!"can't handle");
+        case ~b(0100000): { // fpu_inst_fdiv
+            assert(source != 0.0);
+            slog("inst_fdiv dest = %Lf source = %Lf\n", dest, source);
+            
+            result = dest / source;
+            break;
+        }
+            
+            
+        case ~b(1100011):
+        case ~b(1100111):
+            assert(!"can't handle");
+        case ~b(0100011): { // fpu_inst_fmul
+            slog("inst_fmul dest = %Lf source = %Lf\n", dest, source);
+            result = source * dest;
+            break;
+        }
+            
+        case ~b(1011010):
+        case ~b(1011110):
+            assert(!"fneg: can't handle");
+        case ~b(0011010): // fneg
+            slog("inst_fneg dest = %Lf source = %Lf\n", dest, source);
+            result = -source;
+            break;
+            
+        case ~b(1000001):
+        case ~b(1000101):
+            assert(!"can't handle");
+        case ~b(0000100): { // fpu_inst_fsqrt
+            slog("inst_fsqrt dest = %Lf source = %Lf\n", dest, source);
+            result = sqrtl(source);
+            break;
+        }
+            
+        case ~b(1101000):
+        case ~b(1101100):
+            assert(!"can't handle");
+        case ~b(0101000): { // fpu_inst_fsub
+            slog("inst_fsub dest = %Lf source = %Lf\n", dest, source);
+            result = dest - source;
+            break;
+        }
+            
+        case ~b(0110000) ... ~b(0110111):
+            assert(!"fpu_inst_fsincos;");
+        
+        default:
+            assert(!"inst_fmath: unknown instruction");
+    }
+    
+    // Finalize the read, if source was in memory
+    if (src_in_ea) {
+        fpu_read_ea_commit(M, source_specifier);
+    }
+    
+    // Only write back the result if necessary (fcmp doesn't do this)
+    if (do_write_back_result) {
+        slog("inst_fmath: result = %Lf\n", result);
+        fpu_set_reg_cc(result, dest_register);
+    }
+}
+
+
+
+
+// Setup the jump table for fpu instructions
+// XXX: come up with a better, unified system for decoding instructions
+void fpu_setup_jump_table()
+{
+    uint32_t i;
+    
+    
+    fpu_inst_table[fpu_inst_fnop].emu = inst_fnop;
+    fpu_inst_table[fpu_inst_fnop].dis = dis_fnop;
+    
+    fpu_inst_table[fpu_inst_fbcc].emu = inst_fbcc;
+    fpu_inst_table[fpu_inst_fbcc].dis = dis_fbcc;
+    
+    fpu_inst_table[fpu_inst_fmovecr].emu = inst_fmovecr;
+    fpu_inst_table[fpu_inst_fmovecr].dis = dis_fmovecr;
+    
+    fpu_inst_table[fpu_inst_fmove].emu = inst_fmove;
+    fpu_inst_table[fpu_inst_fmove].dis = dis_fmove;
+    
+    fpu_inst_table[fpu_inst_fmovem].emu = inst_fmovem;
+    fpu_inst_table[fpu_inst_fmovem].dis = dis_fmovem;
+    
+    fpu_inst_table[fpu_inst_fmovem_control].emu = inst_fmovem_control;
+    fpu_inst_table[fpu_inst_fmovem_control].dis = dis_fmovem_control;
+    
+    fpu_inst_table[fpu_inst_frestore].emu = inst_frestore;
+    fpu_inst_table[fpu_inst_frestore].dis = dis_frestore;
+
+    fpu_inst_table[fpu_inst_fsave].emu = inst_fsave;
+    fpu_inst_table[fpu_inst_fsave].dis = dis_fsave;
+    
+    const fpu_inst_name_t _fmath[] = {
+        fpu_inst_fsincos,
+        fpu_inst_fint,
+        fpu_inst_fsinh,
+        fpu_inst_fintrz,
+        fpu_inst_flognp1,
+        fpu_inst_fetoxm1,
+        fpu_inst_ftanh,
+        fpu_inst_fatan,
+        fpu_inst_fatanh,
+        fpu_inst_fsin,
+        fpu_inst_ftan,
+        fpu_inst_fetox,
+        fpu_inst_ftwotox,
+        fpu_inst_ftentox,
+        fpu_inst_flogn,
+        fpu_inst_flog10,
+        fpu_inst_flog2,
+        fpu_inst_fcosh,
+        fpu_inst_facos,
+        fpu_inst_fcos,
+        fpu_inst_fgetexp,
+        fpu_inst_fgetman,
+        fpu_inst_fmod,
+        fpu_inst_fsgldiv,
+        fpu_inst_fsglmul,
+        fpu_inst_frem,
+        fpu_inst_fscale,
+        fpu_inst_fcmp,
+        fpu_inst_ftst,
+        fpu_inst_fabs,
+        fpu_inst_fadd,
+        fpu_inst_fdiv,
+        fpu_inst_fmul,
+        fpu_inst_fneg,
+        fpu_inst_fsqrt,
+        fpu_inst_fsub
+    };
+    
+    for (i=0; i < sizeof(_fmath) / sizeof(fpu_inst_name_t); i++) {
+        fpu_inst_table[_fmath[i]].emu = inst_fmath;
+        fpu_inst_table[_fmath[i]].dis = dis_fmath;
+    }
+}
+
+
+