diff --git a/core/Makefile b/core/Makefile
index 975d695..cd2e969 100644
--- a/core/Makefile
+++ b/core/Makefile
@@ -1,85 +1,18 @@
 
 CC = clang
-CFLAGS = -O3 -ggdb -flto -Wno-deprecated-declarations
-# CFLAGS = -O0 -ggdb -Wno-deprecated-declarations
 
+all: shoebill
 
-DEPS = mc68851.h shoebill.h Makefile macro.pl
-NEED_DECODER = cpu dis
-NEED_PREPROCESSING = adb mc68851 mem via floppy core_api newfpu
-NEED_NOTHING = atrap_tab coff exception macii_symbols redblack scsi video filesystem alloc_pool toby_frame_buffer sound ethernet SoftFloat/softfloat
+shoebill: make_gui debugger
 
-# Object files that can be compiled directly from the source
-OBJ_NEED_NOTHING = $(patsubst %,$(TEMP)/%.o,$(NEED_NOTHING))
+make_gui: make_core
+	xcodebuild -project gui/Shoebill.xcodeproj SYMROOT=build
 
-# Object files than need preprocessing with macro.pl
-OBJ_NEED_PREPROCESSING = $(patsubst %,$(TEMP)/%.o,$(NEED_PREPROCESSING))
+debugger: make_core
+	$(MAKE) -C debugger
 
-# Object files that depend on the instruction decoder
-OBJ_NEED_DECODER = $(patsubst %,$(TEMP)/%.o,$(NEED_DECODER))
-
-# Files that NEED_DECODER also NEED_PREPROCESSING
-POST_PREPROCESSING = $(patsubst %,$(TEMP)/%.post.c,$(NEED_PREPROCESSING)) $(patsubst %,$(TEMP)/%.post.c,$(NEED_DECODER))
-
-
-
-# All the object files compiled for x86_64
-OBJ_x86_64 = $(OBJ_NEED_NOTHING) $(OBJ_NEED_PREPROCESSING) $(OBJ_NEED_DECODER)
-
-# The object files compiled for i386 (the same as x86_64 files, but with .i386 appended)
-OBJ_i386 = $(patsubst %,%.i386,$(OBJ_x86_64))
-
-
-MACRO = perl macro.pl
-
-TEMP = ../intermediates
-
-
-all: $(TEMP)/libshoebill_core.a 
-
-$(TEMP)/libshoebill_core.a: $(TEMP) $(DEPS) $(OBJ_x86_64)
-	libtool -static -v -o $(TEMP)/libshoebill_core.a.x86_64 $(OBJ_x86_64)
-	libtool -static -v -o $(TEMP)/libshoebill_core.a.i386 $(OBJ_i386)
-	lipo -create -output $(TEMP)/libshoebill_core.a $(TEMP)/libshoebill_core.a.x86_64 $(TEMP)/libshoebill_core.a.i386
-
-
-# Split object files into i386/x86_64 versions, since it seems that libtool is unable to 
-# link a static universal library for -O4 object files.
-# x86_64 object files have the form "intermediates/<file_name>.o
-# i386 object files have the form "intermediates/<file_name>.o.i386
-
-# Build object files
-$(OBJ_NEED_NOTHING): $(TEMP)/%.o: %.c $(DEPS)
-	$(CC) -c -arch x86_64 $(CFLAGS) $< -o $@
-	$(CC) -c -arch i386 $(CFLAGS) $< -o $@.i386
-
-$(OBJ_NEED_PREPROCESSING): $(TEMP)/%.o: $(TEMP)/%.post.c $(DEPS)
-	$(CC) -c -arch x86_64 $(CFLAGS) $< -o $@
-	$(CC) -c -arch i386 $(CFLAGS) $< -o $@.i386
-
-$(OBJ_NEED_DECODER): $(TEMP)/%.o: $(TEMP)/%.post.c $(DEPS) $(TEMP)/dis_decoder_guts.c $(TEMP)/inst_decoder_guts.c
-	$(CC) -c -arch x86_64 $(CFLAGS) $< -o $@
-	$(CC) -c -arch i386 $(CFLAGS) $< -o $@.i386
-
-# Preprocess C files 
-$(POST_PREPROCESSING): $(TEMP)/%.post.c: %.c $(DEPS)
-	$(MACRO) $< $@
-
-# Generate instruction decoders
-$(TEMP)/inst_decoder_guts.c: $(TEMP)/decoder_gen $(DEPS)
-	$(TEMP)/decoder_gen inst $(TEMP)/
-$(TEMP)/dis_decoder_guts.c: $(TEMP)/decoder_gen $(DEPS)
-	$(TEMP)/decoder_gen dis $(TEMP)/
-
-# Compile the decoder generator
-$(TEMP)/decoder_gen: decoder_gen.c $(DEPS)
-	$(CC) decoder_gen.c -o $(TEMP)/decoder_gen
-
-
-$(TEMP):
-	mkdir -p $(TEMP)
-	mkdir -p $(TEMP)/SoftFloat
+make_core:
+	$(MAKE) -C core -j 4
 
 clean:
-	rm -rf $(TEMP)
-
+	rm -rf intermediates gui/build
diff --git a/core/fpu.c b/core/fpu.c
index 03b4e73..da7e072 100644
--- a/core/fpu.c
+++ b/core/fpu.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Peter Rutenbar <pruten@gmail.com>
+ * Copyright (c) 2013-2014, Peter Rutenbar <pruten@gmail.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -23,525 +23,2600 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-
 #include <stdio.h>
+#include <string.h>
 #include <assert.h>
-#include <fenv.h>
-#include <float.h>
-#include <math.h>
 #include "../core/shoebill.h"
+#include "../core/SoftFloat/softfloat.h"
 
-extern struct dis_t dis;
-extern uint16_t dis_op;
+#pragma mark Structures and macros
+
+// Mode control byte
+#define mc_rnd  (fpu->fpcr.b._mc_rnd)
+#define mc_prec (fpu->fpcr.b._mc_prec)
+
+// Exception enable byte
+#define ee_inex1 (fpu->fpcr.b._ee_inex1)
+#define ee_inex2 (fpu->fpcr.b._ee_inex2)
+#define ee_dz    (fpu->fpcr.b._ee_dz)
+#define ee_unfl  (fpu->fpcr.b._ee_unfl)
+#define ee_ovfl  (fpu->fpcr.b._ee_ovfl)
+#define ee_operr (fpu->fpcr.b._ee_operr)
+#define ee_snan  (fpu->fpcr.b._ee_snan)
+#define ee_bsun  (fpu->fpcr.b._ee_bsun)
+
+// Accrued exception byte
+#define ae_inex (fpu->fpsr.b._ae_inex)
+#define ae_dz   (fpu->fpsr.b._ae_dz)
+#define ae_unfl (fpu->fpsr.b._ae_unfl)
+#define ae_ovfl (fpu->fpsr.b._ae_ovfl)
+#define ae_iop  (fpu->fpsr.b._ae_iop)
+
+// Exception status byte
+#define es_inex1 (fpu->fpsr.b._es_inex1)
+#define es_inex2 (fpu->fpsr.b._es_inex2)
+#define es_dz    (fpu->fpsr.b._es_dz)
+#define es_unfl  (fpu->fpsr.b._es_unfl)
+#define es_ovfl  (fpu->fpsr.b._es_ovfl)
+#define es_operr (fpu->fpsr.b._es_operr)
+#define es_snan  (fpu->fpsr.b._es_snan)
+#define es_bsun  (fpu->fpsr.b._es_bsun)
+
+// Quotient byte
+#define qu_quotient (fpu->fpsr.b._qu_quotient)
+#define qu_s        (fpu->fpsr.b._qu_s) /* quotient sign */
+
+// Condition codes
+#define cc_nan (fpu->fpsr.b._cc_nan)
+#define cc_i (fpu->fpsr.b._cc_i)
+#define cc_z (fpu->fpsr.b._cc_z)
+#define cc_n (fpu->fpsr.b._cc_n)
 
-#define FPU_JUMP_EMU 0
-#define FPU_JUMP_DIS 1
-typedef void (fpu_func_t)(uint16_t, uint16_t);
 
 typedef struct {
-    fpu_func_t *emu, *dis;
-    const char *name;
-} fpu_inst_t;
+    uint32_t fpiar; // FPU iaddr
+    
+    union { // fpcr, fpu control register
+        struct {
+            // Mode control byte
+            uint16_t _mc_zero : 4; // zero/dummy
+            uint16_t _mc_rnd  : 2; // rounding mode
+            uint16_t _mc_prec : 2; // rounding precision
+            // Exception enable byte
+            uint16_t _ee_inex1 : 1; // inexact decimal input
+            uint16_t _ee_inex2 : 1; // inxact operation
+            uint16_t _ee_dz    : 1; // divide by zero
+            uint16_t _ee_unfl  : 1; // underflow
+            uint16_t _ee_ovfl  : 1; // overflow
+            uint16_t _ee_operr : 1; // operand error
+            uint16_t _ee_snan  : 1; // signalling not a number
+            uint16_t _ee_bsun  : 1; // branch/set on unordered
+        } b;
+        
+        uint16_t raw;
+    } fpcr;
+    
+    union { // fpsr, fpu status register
+        struct {
+            // Accrued exception byte
+            uint32_t _dummy1  : 3; // dummy/zero
+            uint32_t _ae_inex : 1; // inexact
+            uint32_t _ae_dz   : 1; // divide by zero
+            uint32_t _ae_unfl : 1; // underflow
+            uint32_t _ae_ovfl : 1; // overflow
+            uint32_t _ae_iop  : 1; // invalid operation
+            // Exception status byte
+            uint32_t _es_inex1 : 1; // inexact decimal input
+            uint32_t _es_inex2 : 1; // inxact operation
+            uint32_t _es_dz    : 1; // divide by zero
+            uint32_t _es_unfl  : 1; // underflow
+            uint32_t _es_ovfl  : 1; // overflow
+            uint32_t _es_operr : 1; // operand error
+            uint32_t _es_snan  : 1; // signalling not a number
+            uint32_t _es_bsun  : 1; // branch/set on unordered
+            // Quotient byte
+            uint32_t _qu_quotient : 7;
+            uint32_t _qu_s        : 1;
+            // Condition code byte
+            uint32_t _cc_nan  : 1; // not a number
+            uint32_t _cc_i    : 1; // infinity
+            uint32_t _cc_z    : 1; // zero
+            uint32_t _cc_n    : 1; // negative
+            uint32_t _dummy2  : 4; // dummy/zero
+        } b;
+        uint32_t raw;
+    } fpsr;
+    
+    floatx80 fp[8]; // 80 bit floating point general registers
+    
+    // State for the static fmath instruction implementations
+    float128 source, dest, result;
+    _Bool write_back;
+    uint8_t fmath_op;
+} fpu_state_t;
 
-~newmacro(create_fpu_jump_table, 0, {
-    my $names = [
-        'unknown', 'fabs', 'facos', 'fadd', 'fasin', 'fatan', 'fatanh', 'fbcc', 'fcmp', 'fcos', 'fcosh',
-        'fdbcc', 'fdiv', 'fetox', 'fetoxm1', 'fgetexp', 'fgetman', 'fint', 'fintrz', 'flog10', 'flog2',
-        'flogn', 'flognp1', 'fmod', 'fmove', 'fmovecr', 'fmovem', 'fmovem_control', 'fmul', 'fneg', 'fnop',
-        'frem', 'frestore', 'fsave', 'fscale', 'fscc', 'fsgldiv', 'fsglmul', 'fsin', 'fsincos', 'fsinh',
-        'fsqrt', 'fsub', 'ftan', 'ftanh', 'ftentox', 'ftrapcc', 'ftst', 'ftwotox'
-    ];
-    my $fpu_enum = "typedef enum {\n";
-    foreach my $n (@$names) {
-        $fpu_enum .= "\tfpu_inst_$n,\n";
-    }
-    $fpu_enum .= "\nfpu_inst_max} fpu_inst_name_t;";
-    
-    my $fpu_table = "fpu_inst_t fpu_inst_table[fpu_inst_max] = {\n";
-    foreach my $n (@$names) {
-        $fpu_table .= "\t{NULL, NULL, \"" . $n . "\"},\n";
-    }
-    $fpu_table = substr($fpu_table, 0, -2);
-    $fpu_table .= "\n};";
-    
-    my $out = "$fpu_enum \n $fpu_table \n";
-    return $out;
-})
+enum rounding_precision_t {
+    prec_extended = 0,
+    prec_single = 1,
+    prec_double = 2,
+};
 
-~create_fpu_jump_table()
+enum rounding_mode_t {
+    mode_nearest = 0,
+    mode_zero = 1,
+    mode_neg = 2,
+    mode_pos = 3
+};
 
-static fpu_inst_name_t fpu_decode_op(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 ttt MMMMMM);
-    
-    if (t) {
-        switch (t) {
-            case 1:
-                if ((M>>3) == 1)
-                    return fpu_inst_fdbcc;
-                else if ((M>>3) == 7)
-                    return fpu_inst_ftrapcc;
-                return fpu_inst_fscc;
-            case 2:
-                if (M==0 && ext == 0)
-                    return fpu_inst_fnop; // same as fbf.w
-                // fall through
-            case 3:
-                return fpu_inst_fbcc;
-            case 4:
-                return fpu_inst_fsave;
-            case 5:
-                return fpu_inst_frestore;
-        }
-        return fpu_inst_unknown;
-    }
-    
-    ~decompose(ext, ccc xxx yyy eeeeeee)
-    
-    switch (c) {
-        case 0: // Reg to reg
-            break;
-        case 1: // unused
-            return fpu_inst_unknown;
-        case 2: // Memory->reg & movec
-            break;
-            
-        case 3: // reg->mem
-            return fpu_inst_fmove;
-            
-        case 4: // mem -> sys ctl registers
-        case 5: // sys ctl registers -> mem
-            return fpu_inst_fmovem_control;
-            
-        case 6: // movem to fp registers
-        case 7: // movem to memory
-            return fpu_inst_fmovem;
-    }
-    
-    // Here c == 0b000 or 010
-    
-    if (M == 0 && ~bmatch(ext, 010 111 xxx xxxxxxx))
-        return fpu_inst_fmovecr;
-    
-    if ((e>>3) == ~b(0110))
-        return fpu_inst_fsincos;
-    
-    switch (e) {
-            
-        case ~b(0000000):
-        case ~b(1000000):
-        case ~b(1000100):
-            return fpu_inst_fmove;
-            
-        case ~b(0000001): return fpu_inst_fint;
-        case ~b(0000010): return fpu_inst_fsinh;
-        case ~b(0000011): return fpu_inst_fintrz;
-        case ~b(0000110): return fpu_inst_flognp1;
-        case ~b(0001000): return fpu_inst_fetoxm1;
-        case ~b(0001001): return fpu_inst_ftanh;
-        case ~b(0001010): return fpu_inst_fatan;
-        case ~b(0001100): return fpu_inst_fasin;
-        case ~b(0001101): return fpu_inst_fatanh;
-        case ~b(0001110): return fpu_inst_fsin;
-        case ~b(0001111): return fpu_inst_ftan;
-        case ~b(0010000): return fpu_inst_fetox;
-        case ~b(0010001): return fpu_inst_ftwotox;
-        case ~b(0010010): return fpu_inst_ftentox;
-        case ~b(0010100): return fpu_inst_flogn;
-        case ~b(0010101): return fpu_inst_flog10;
-        case ~b(0010110): return fpu_inst_flog2;
-        case ~b(0011001): return fpu_inst_fcosh;
-        case ~b(0011100): return fpu_inst_facos;
-        case ~b(0011101): return fpu_inst_fcos;
-        case ~b(0011110): return fpu_inst_fgetexp;
-        case ~b(0011111): return fpu_inst_fgetman;
-        case ~b(0100001): return fpu_inst_fmod;
-        case ~b(0100100): return fpu_inst_fsgldiv;
-        case ~b(0100111): return fpu_inst_fsglmul;
-        case ~b(0100101): return fpu_inst_frem;
-        case ~b(0100110): return fpu_inst_fscale;
-        case ~b(0111000): return fpu_inst_fcmp;
-        case ~b(0111010): return fpu_inst_ftst;
-            
-        case ~b(0011000):
-        case ~b(1011000):
-        case ~b(1011100):
-            return fpu_inst_fabs;
-            
-        case ~b(0100010):
-        case ~b(1100010):
-        case ~b(1100110):
-            return fpu_inst_fadd;
-            
-        case ~b(0100000):
-        case ~b(1100000):
-        case ~b(1100100):
-            return fpu_inst_fdiv;
-            
-            
-        case ~b(0100011):
-        case ~b(1100011):
-        case ~b(1100111):
-            return fpu_inst_fmul;
-            
-        case ~b(0011010):
-        case ~b(1011010):
-        case ~b(1011110):
-            return fpu_inst_fneg;
-            
-        case ~b(0000100):
-        case ~b(1000001):
-        case ~b(1000101):
-            return fpu_inst_fsqrt;
-            
-        case ~b(0101000):
-        case ~b(1101000):
-        case ~b(1101100):
-            return fpu_inst_fsub;
-    }
-    
-    return fpu_inst_unknown;
-    
-}
+/*
+ * 0 L     long word integer
+ * 1 S     single precision real
+ * 2 X     extended precision real
+ * 3 P{#k} packed decimal real with static k factor
+ * 4 W     word integer
+ * 5 D     double precision real
+ * 6 B     byte integer
+ * 7 P{Dn} packed decimal real with dynamic k factor
+ */
+static const uint8_t _format_sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+enum {
+    format_L = 0,
+    format_S = 1,
+    format_X = 2,
+    format_Ps = 3,
+    format_W = 4,
+    format_D = 5,
+    format_B = 6,
+    format_Pd = 7
+} fpu_formats;
 
+#define fpu_get_state_ptr() fpu_state_t *fpu = (fpu_state_t*)shoe.fpu_state
 #define nextword() ({const uint16_t w=lget(shoe.pc,2); if (shoe.abort) {return;}; shoe.pc+=2; w;})
+#define nextlong() ({const uint32_t L=lget(shoe.pc,4); if (shoe.abort) {return;}; shoe.pc+=4; L;})
 #define verify_supervisor() {if (!sr_s()) {throw_privilege_violation(); return;}}
 
-void dis_fpu_decode ()
+#pragma mark FPU exception stuff
+enum fpu_vector_t {
+    fpu_vector_ftrapcc = 7,
+    fpu_vector_fline = 11,
+    fpu_vector_coprocessor_protocol_violation = 13, // won't be using this one
+    fpu_vector_bsun = 48,
+    fpu_vector_inexact = 49,
+    fpu_vector_divide_by_zero = 50,
+    fpu_vector_underflow = 51,
+    fpu_vector_operr = 52,
+    fpu_vector_overflow = 53,
+    fpu_vector_snan = 54
+};
+
+/*
+ * Map the exception bit positions (in fpsr and fpcr)
+ * to their corresponding exception vector numbers.
+ */
+const uint8_t _exception_bit_to_vector[8] = {
+    48, // bsun
+    54, // snan
+    52, // operr
+    53, // ovfl
+    51, // unfl
+    50, // dz
+    49, // inex2
+    49, // inex1
+};
+
+static void throw_fpu_pre_instruction_exception(enum fpu_vector_t vector)
 {
-    ~decompose(dis_op, 1111 001 xxx 000000);
+    throw_frame_zero(shoe.orig_sr, shoe.orig_pc, vector);
+}
+/*
+ * Note: I may be able to get away without implementing the
+ *       mid-instruction exception.
+ */
+
+/*
+ * _bsun_test() is called by every inst_f*cc instruction
+ * to test whether the bsun exception is enabled, throw an
+ * exception if so, and otherwise just set the appropriate
+ * bit in fpsr, and update the accrued exception byte.
+ */
+static _Bool _bsun_test()
+{
+    fpu_get_state_ptr();
     
-    fpu_inst_name_t name;
-    uint16_t ext = 0;
+    // BSUN counts against the IOP accrued exception bit
+    ae_iop = 1;
     
-    if (x == 4)
-        name = fpu_inst_fsave;
-    else if (x == 5)
-        name = fpu_inst_frestore;
-    else {
-        ext = dis_next_word();
-        name = fpu_decode_op(dis_op, ext);
+    // Set the BSUN exception status bit
+    es_bsun = 1;
+    
+    // If the BSUN exception isn't enabled, then we can just return
+    if (!ee_bsun)
+        return 0; // 0 -> elected not to throw an exception
+    
+    throw_fpu_pre_instruction_exception(fpu_vector_bsun);
+    return 1;
+}
+
+static void _throw_illegal_instruction()
+{
+    assert(!"throw_illegal_instruction!");
+}
+
+#pragma mark Float format translators (to/from big-endian motorola format)
+
+static void _floatx80_to_int8(floatx80 *f, uint8_t *ptr)
+{
+    uint32_t tmp = floatx80_to_int32(*f);
+    ptr[0] = tmp & 0xff;
+}
+
+static void _floatx80_to_int16(floatx80 *f, uint8_t *ptr)
+{
+    uint32_t tmp = floatx80_to_int32(*f);
+    ptr[0] = (tmp >> 8) & 0xff;
+    ptr[1] = (tmp >> 0) & 0xff;
+}
+
+static void _floatx80_to_int32(floatx80 *f, uint8_t *ptr)
+{
+    uint32_t tmp = floatx80_to_int32(*f);
+    ptr[0] = (tmp >> 24) & 0xff;
+    ptr[1] = (tmp >> 16) & 0xff;
+    ptr[2] = (tmp >> 8) & 0xff;
+    ptr[3] = (tmp >> 0) & 0xff;
+}
+
+static void _floatx80_to_single(floatx80 *f, uint8_t *ptr)
+{
+    const float32 tmp = floatx80_to_float32(*f);
+    ptr[0] = (tmp >> 24) & 0xff;
+    ptr[1] = (tmp >> 16) & 0xff;
+    ptr[2] = (tmp >> 8) & 0xff;
+    ptr[3] = (tmp >> 0) & 0xff;
+}
+
+static void _floatx80_to_double(floatx80 *f, uint8_t *ptr)
+{
+    const float64 tmp = floatx80_to_float64(*f);
+    ptr[0] = (tmp >> 56) & 0xff;
+    ptr[1] = (tmp >> 48) & 0xff;
+    ptr[2] = (tmp >> 40) & 0xff;
+    ptr[3] = (tmp >> 32) & 0xff;
+    ptr[4] = (tmp >> 24) & 0xff;
+    ptr[5] = (tmp >> 16) & 0xff;
+    ptr[6] = (tmp >> 8) & 0xff;
+    ptr[7] = (tmp >> 0) & 0xff;
+}
+
+static void _floatx80_to_extended(floatx80 *f, uint8_t *ptr)
+{
+    ptr[0] = (f->high >> 8) & 0xff;
+    ptr[1] = (f->high >> 0) & 0xff;
+    ptr[2] = 0;
+    ptr[3] = 0;
+    ptr[4] = (f->low >> 56) & 0xff;
+    ptr[5] = (f->low >> 48) & 0xff;
+    ptr[6] = (f->low >> 40) & 0xff;
+    ptr[7] = (f->low >> 32) & 0xff;
+    ptr[8] = (f->low >> 24) & 0xff;
+    ptr[9] = (f->low >> 16) & 0xff;
+    ptr[10] = (f->low >> 8) & 0xff;
+    ptr[11] = (f->low >> 0) & 0xff;
+}
+
+static float128 _int8_to_intermediate(int8_t byte)
+{
+    return int32_to_float128((int32_t)byte);
+}
+
+static float128 _int16_to_intermediate(int16_t sh)
+{
+    return int32_to_float128((int32_t)sh);
+}
+
+static float128 _int32_to_intermediate(int32_t in)
+{
+    return int32_to_float128(in);
+}
+
+static float128 _single_to_intermediate(uint32_t f)
+{
+    assert(sizeof(uint32_t) == sizeof(float32));
+    return float32_to_float128((float32)f);
+}
+
+/*
+ * _double_to_intermediate(d): d needs to be 68k-native order (8 bytes)
+ */
+static float128 _double_to_intermediate(uint8_t *d)
+{
+    assert(sizeof(uint64_t) == sizeof(float64));
+    
+    return float64_to_float128((float64) ntohll(*(uint64_t*)d));
+}
+
+/*
+ * _extended_to_intermediate(e): e needs to be 68k-native order (12 bytes)
+ */
+static float128 _extended_to_intermediate(uint8_t *e)
+{
+    /*
+     * softfloat floatx80 format:
+     * uint64_t low; // the low part of the extended float (significand, low exponent bits)
+     * uint16_t high; // the high part, sign, high exponent bits
+     */
+    floatx80 x80 = {
+        .high = (e[0] << 8) | e[1],
+        .low = ntohll(*(uint64_t*)&e[4])
+    };
+    return floatx80_to_float128(x80);
+}
+
+static void _extended_to_floatx80(uint8_t *bytes, floatx80 *f)
+{
+    f->high = (bytes[0] << 8) | bytes[1];
+    f->low = ntohll(*(uint64_t*)&bytes[4]);
+}
+
+/*
+ * Set softfloat's rounding mode
+ * (fpcr.mc_rnd and softfloat use different values for these modes)
+ */
+static void _set_rounding_mode(enum rounding_mode_t mode)
+{
+    const int8_t rounding_map[4] = {
+        float_round_nearest_even, float_round_to_zero,
+        float_round_up, float_round_down
+    };
+    
+    float_rounding_mode = rounding_map[mode];
+}
+
+#pragma mark EA routines
+
+/*
+ * Read-commit merely updates the address register
+ * for pre/post-inc/decrement
+ */
+static void _fpu_read_ea_commit(const uint8_t format)
+{
+    ~decompose(shoe.op, 0000 0000 00 mmmrrr);
+    
+    if (m == 3) // post-increment
+        shoe.a[r] += _format_sizes[format];
+    else if (m == 4) // pre-decrement
+        shoe.a[r] -= _format_sizes[format];
+    
+    /* 
+     * Note: still unsure about what happens when
+     *       mode=pre/postincdecrement, size==1, and register==a7
+     *       (is the change +-2 bytes? or 1?)
+     */
+    if (((m == 3) || (m == 4)) && (_format_sizes[format] == 1) && (r == 7))
+        assert(!"size==1, reg==a7");
+}
+
+static void _fpu_write_ea(uint8_t mr, uint8_t format, floatx80 *f, uint8_t K)
+{
+    fpu_get_state_ptr();
+    
+    const uint8_t m = mr >> 3;
+    const uint8_t r = mr & 7;
+    const uint8_t size = _format_sizes[format];
+    uint8_t buf[12], *ptr = &buf[0];
+    uint32_t addr, i;
+    
+    if ((m == 1) ||
+        ((m == 0) && (size > 4))) {
+        /* If mode==a-reg, or mode==data reg and the size is > 4 bytes, no dice */
+        _throw_illegal_instruction();
+        return ;
     }
-    
-    if (fpu_inst_table[name].dis) {
-        (*fpu_inst_table[name].dis)(dis_op, ext);
+    else if ((m == 7) && (r > 1)) {
+        /* If this is otherwise an illegal addr mode... */
+        _throw_illegal_instruction();
         return ;
     }
     
-    sprintf(dis.str, "%s ???", fpu_inst_table[name].name);
-}
-
-void inst_fpu_decode ()
-{
-    ~decompose(shoe.op, 1111 001 xxx 000000);
+    const _Bool is_nan = ((f->high << 1) == 0xfffe) && f->low;
     
-    fpu_inst_name_t name;
-    uint16_t ext = 0;
+    slog("inst_f fpu_write_ea EA=%u/%u data=%Lf format=%u\n", m, r, 666.0L, format);
     
-    if (x == 4)
-        name = fpu_inst_fsave;
-    else if (x == 5)
-        name = fpu_inst_frestore;
-    else {
-        ext = nextword();
-        name = fpu_decode_op(shoe.op, ext);
-        // "For FPCP instructions that generate FPU exceptions,
-        //  FPIAR is loaded with the address of an instruction before it's executed,
-        //  unless all arithmetic exceptions are disabled."
-        // My take: set fpiar for all instructions except fsave, frestore, and fmovem_control
-        if (name != fpu_inst_fmovem_control)
-            shoe.fpiar = shoe.orig_pc;
+    /* Initialize softfloat's exceptions bits/rounding mode */
+    
+    float_exception_flags = 0;
+    _set_rounding_mode(mc_rnd);
+    
+    /* Convert to the appropriate format */
+    
+    switch (format) {
+        case format_B: {
+            _floatx80_to_int8(f, ptr);
+            break;
+        }
+        case format_W: {
+            _floatx80_to_int16(f, ptr);
+            break;
+        }
+        case format_L: {
+            _floatx80_to_int32(f, ptr);
+            break;
+        }
+        case format_S: {
+            _floatx80_to_single(f, ptr);
+            break;
+        }
+        case format_D: {
+            _floatx80_to_double(f, ptr);
+            break;
+        }
+        case format_X: {
+            _floatx80_to_extended(f, ptr);
+            break;
+        }
+        default: {
+            assert(!"unsupported format (packed something!)");
+        }
     }
     
-    if (fpu_inst_table[name].emu) {
-        (*fpu_inst_table[name].emu)(shoe.op, ext);
-        return ;
+    /* Write to memory */
+    
+    switch (m) {
+        case 0: {
+            if (format == format_B)
+                set_d(r, ptr[0], 1);
+            else if (format == format_W)
+                set_d(r, ntohs(*(uint16_t*)ptr), 2);
+            else if ((format == format_L) || (format == format_S))
+                set_d(r, ntohl(*(uint32_t*)ptr), 4);
+            else
+                assert(!"how did I get here?");
+            goto done;
+        }
+        case 1:
+            assert(!"how did I get here again!");
+            
+        case 2:
+            addr = shoe.a[r];
+            break;
+        case 3:
+            addr = shoe.a[r];
+            assert(!( r==7 && size==1));
+            break;
+        case 4: // pre-decrement
+            addr = shoe.a[r] - size;
+            assert(!( r==7 && size==1));
+            break;
+        default:
+            call_ea_addr(mr);
+            addr = (uint32_t)shoe.dat;
+            break;
     }
     
-    slog("inst_fpu_decode: unhandled instruction: %s op=0x%04x ext = 0x%04x pc=0x%08x\n", fpu_inst_table[name].name, shoe.op, ext, shoe.orig_pc);
-    assert(!"unknown fpu inst");
-    //dbg_state.running = 0;
+    /* Copy the formatted data into *addr */
     
-}
-
-
-void dis_fsave(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 100 MMMMMM);
-    ~decompose(op, 1111 001 100 mmmrrr);
+    slog("inst_f  fpu_write_ea: addr=0x08x data=0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+         addr,
+         ptr[0], ptr[1], ptr[2], ptr[3],
+         ptr[4], ptr[5], ptr[6], ptr[7],
+         ptr[8], ptr[9], ptr[10], ptr[11]);
     
-    if (m == 4)
-        sprintf(dis.str, "fsave -(a%u)", r);
-    else
-        sprintf(dis.str, "fsave %s", decode_ea_addr(M));
-}
-
-void inst_fsave(uint16_t op, uint16_t ext)
-{
-    verify_supervisor();
-    
-    ~decompose(op, 1111 001 100 MMMMMM);
-    ~decompose(op, 1111 001 100 mmmrrr);
-    
-    const uint32_t size = 0x1c; // IDLE frame
-    const uint16_t frame_header = 0xfd18;
-    uint32_t addr;
-    
-    if (m == 4)
-        addr = shoe.a[r] - size;
-    else {
-        call_ea_addr(M);
-        addr = shoe.dat;
+    for (i=0; i<size; i++) {
+        lset(addr + i, 1, buf[i]);
+        if (shoe.abort) return ;
     }
     
-    lset(addr, 2, frame_header);
-    if (shoe.abort)
-        return ;
     
-    if (m == 4)
-        shoe.a[r] = addr;
-}
-
-void dis_frestore(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 101 MMMMMM);
-    ~decompose(op, 1111 001 101 mmmrrr);
+done:
+    /* 
+     * Set exception bits and update pre/post/blah registers.
+     * note: condition codes are not modified
+     */
     
-    if (m == 3)
-        sprintf(dis.str, "frestore (a%u)+", r);
-    else
-        sprintf(dis.str, "frestore %s", decode_ea_addr(M));
-}
-
-void inst_frestore(uint16_t op, uint16_t ext)
-{
-    verify_supervisor();
+    es_bsun = 0;
+    es_snan = 0;
+    es_operr = 0;
+    es_ovfl = 0;
+    es_unfl = 0;
+    es_dz = 0;
+    es_inex2 = 0;
+    es_inex1 = 0;
     
-    ~decompose(op, 1111 001 101 MMMMMM);
-    ~decompose(op, 1111 001 101 mmmrrr);
-    
-    uint32_t addr, size;
-    
-    if (m == 3)
-        addr = shoe.a[r];
-    else {
-        call_ea_addr(M);
-        addr = shoe.dat;
-    }
-    
-    const uint16_t word = lget(addr, 2);
-    if (shoe.abort) return ;
-    
-    // XXX: These frame sizes are different on 68881/68882/68040
-    if ((word & 0xff00) == 0x0000)
-        size = 4; // NULL state frame
-    else if ((word & 0xff) == 0x0018)
-        size = 0x1c; // IDLE state frame
-    else if ((word & 0xff) == 0x00b4)
-        size = 0xb8; // BUSY state frame
-    else {
-        slog("Frestore encountered an unknown state frame 0x%04x\n", word);
-        assert("inst_frestore: bad state frame");
-        return ;
-    }
-    
-    if (m==3) {
-        shoe.a[r] += size;
-        slog("frestore: changing shoe.a[%u] += %u\n", r, size);
-    }
-}
-
-typedef struct {
-    uint8_t inexact;
-    uint8_t dat[4][12];
-} fmovecr_t;
-
-fmovecr_t fmovecr_pi = {1, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x35, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x34, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x34, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x35, };
-fmovecr_t fmovecr_log10_2 = {1, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x98, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x98, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x98, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x99, };
-fmovecr_t fmovecr_e = {1, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9a, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9a, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9a, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9b, };
-fmovecr_t fmovecr_log2_e = {1, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbc, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbb, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbb, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbc, };
-fmovecr_t fmovecr_log10_e = {0, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, };
-fmovecr_t fmovecr_zero = {0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_ln_2 = {1, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xac, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xab, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xab, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xac, };
-fmovecr_t fmovecr_ln_10 = {1, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x17, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x16, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x16, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x17, };
-fmovecr_t fmovecr_10_0 = {0, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_10_1 = {0, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_10_2 = {0, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_10_4 = {0, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_10_8 = {0, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_10_16 = {0, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, };
-fmovecr_t fmovecr_10_32 = {1, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9e, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9d, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9d, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9e, };
-fmovecr_t fmovecr_10_64 = {1, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd5, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd5, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd5, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd6, };
-fmovecr_t fmovecr_10_128 = {1, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xe0, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xdf, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xdf, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xe0, };
-fmovecr_t fmovecr_10_256 = {1, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8e, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8d, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8d, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8e, };
-fmovecr_t fmovecr_10_512 = {1, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc7, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc6, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc6, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc7, };
-fmovecr_t fmovecr_10_1024 = {1, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x17, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x17, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x17, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x18, };
-fmovecr_t fmovecr_10_2048 = {1, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe5, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe5, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe5, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe6, };
-fmovecr_t fmovecr_10_4096 = {1, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9b, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9a, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9a, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9b, };
-
-const int _fpu_round_map[4] = {FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD};
-#define fpu_set_round() assert(0 == fesetround(_fpu_round_map[shoe.fpcr.b.mc_rnd]))
-#define fpu_reset_round() assert(0 == fesetround(FE_TONEAREST))
-
-static void fpu_set_cc(long double f)
-{
-    // Set condition codes
-    shoe.fpsr.raw &= 0x00ffffff;
-    shoe.fpsr.b.cc_nan = (0 != isnan(f));
-    if (!shoe.fpsr.b.cc_nan) {
-        shoe.fpsr.b.cc_n = (0 != signbit(f));
-        if (isinf(f))
-            shoe.fpsr.b.cc_i = 1;
-        else
-            shoe.fpsr.b.cc_z = (f == 0.0);
-    }
-}
-
-static long double fpu_set_reg(long double f, uint8_t r)
-{
-    // Round the number according to the mode control byte
-    {
-        fpu_set_round();
+    switch (format) {
+        format_B:
+        format_W:
+        format_L:
+            /* Set snan, operr, and/or inex2 */
+            es_snan = is_nan;
+            es_operr = ((float_exception_flags & float_flag_invalid) != 0);
+            es_inex2 = ((float_exception_flags & float_flag_inexact) != 0);
+            break;
         
-        if (shoe.fpcr.b.mc_prec == 1) {
-            const float tmp = (float)f;
-            f = tmp;
-        } else if (shoe.fpcr.b.mc_prec == 2) {
-            const double tmp = (double)f;
-            f = tmp;
+        format_S:
+        format_D:
+        format_X:
+            /* Set snan, ovfl, unfl, and/or inex2 */
+            es_snan = is_nan;
+            es_ovfl = ((float_exception_flags & float_flag_overflow) != 0);
+            es_unfl = ((float_exception_flags & float_flag_underflow) != 0);
+            es_inex2 = ((float_exception_flags & float_flag_inexact) != 0);
+            break;
+            
+        format_Pd:
+        format_Ps:
+            /* Set snan, operr, and/or inex2 */
+            assert(!"you better implement packed formats");
+            break;
+    }
+    
+    /* Update the accrued exception bits */
+    ae_iop |= es_bsun | es_snan | es_operr;
+    ae_ovfl |= es_ovfl;
+    ae_unfl |= (es_unfl & es_inex2); // yes, &
+    ae_dz |= es_dz;
+    ae_inex |= es_inex1 | es_inex2 | es_ovfl;
+    
+    /* Are any exceptions both set and enabled? */
+    if (fpu->fpsr.raw & fpu->fpcr.raw & 0x0000ff00) {
+        /*
+         * Then we need to throw an exception.
+         * The exception is sent to the vector for
+         * the highest priority exception, and the priority
+         * order is (high->low) bsan, snan, operr, ovfl, unfl, dz, inex2/1
+         * (which is the order of the bits in fpsr/fpcr).
+         * Iterate over the bits in order, and throw the
+         * exception to whichever bit is set first.
+         */
+        uint8_t i, throwable = (fpu->fpsr.raw & fpu->fpcr.raw) >> 8;
+        
+        assert(throwable);
+        for (i=0; 1; i++) {
+            if (throwable & 0x80)
+                break;
+            throwable <<= 1;
         }
         
-        fpu_reset_round();
+        /*
+         * Convert the exception bit position
+         * to the correct vector number, and throw
+         * a (pre-instruction) exception.
+         */
+        throw_fpu_pre_instruction_exception(_exception_bit_to_vector[i]);
+        
+        return ;
     }
     
-    // Store it
-    shoe.fp[r] = f;
-    return f;
+    /* Finalize registers, and we're done */
+    
+    if (m == 3)
+        shoe.a[r] += size;
+    else if (m == 4)
+        shoe.a[r] -= size;
 }
 
-// fpu_set_reg_cc() and fpu_set_ea() set the condition codes. (what else should they set?)
-static void fpu_set_reg_cc(long double f, uint8_t r)
+/*
+ * Note: fpu_read_ea modifies shoe.pc, and fpu_read_ea_commit
+ *        modifies shoe.a[x] for pre/post-inc/decrement
+ * Returns false if we're aborting
+ */
+static _Bool _fpu_read_ea(const uint8_t format, float128 *result)
 {
-    fpu_set_cc(fpu_set_reg(f, r));
-}
+    fpu_get_state_ptr();
+    
+    ~decompose(shoe.op, 0000 0000 00 mmmrrr);
+    
+    const uint8_t size = _format_sizes[format];
+    uint32_t addr = 0;
+    
+    /*
+     * Step 1: find the effective address, store it in addr
+     *         (or the actual data, if unavailable)
+     */
+    
+    slog("FPU: read_ea: mr=%u%u f=%c ", m, r, "lsxpwdb?"[format]);
+    
+    switch (m) {
+        case 0:
+            if (format == format_S)
+                *result = _single_to_intermediate(shoe.d[r]);
+            else if (format == format_B)
+                *result = _int8_to_intermediate(shoe.d[r] & 0xff);
+            else if (format == format_W)
+                *result = _int16_to_intermediate(shoe.d[r] & 0xffff);
+            else if (format == format_L)
+                *result = int32_to_float128(shoe.d[r]);
+            else {
+                /*
+                 * No other format can be used with a data register
+                 * (because they require >4 bytes)
+                 */
+                _throw_illegal_instruction();
+                return 0;
+            }
+            slog("raw=0x%x", chop(shoe.d[r], size));
+            goto got_data;
+            
+        case 1:
+            /* Address regisers can't be used */
+            _throw_illegal_instruction();
+            return 0;
+        
+        case 3:
+            addr = shoe.a[r];
+            assert(!( r==7 && size==1));
+            goto got_address;
+            
+        case 4:
+            addr = shoe.a[r] - size;
+            assert(!( r==7 && size==1));
+            goto got_address;
+            
+        case 7:
+            if (r == 4) {
+                addr = shoe.pc;
+                shoe.pc += size;
+                goto got_address;
+            }
+            
+            // fall through to default:
+            
+        default: {
+            ~decompose(shoe.op, 0000 0000 00 MMMMMM);
+            shoe.mr = M;
+            ea_addr();
+            if (shoe.abort)
+                return 0;
+            
+            addr = (uint32_t)shoe.dat;
+            goto got_address;
+        }
 
-static void x87_to_motorola(long double x87, uint8_t motorola[12])
-{
-    uint8_t *x87_ptr = (uint8_t*)&x87;
-    motorola[0] = x87_ptr[9];
-    motorola[1] = x87_ptr[8];
-    motorola[2] = 0;
-    motorola[3] = 0;
-    motorola[4] = x87_ptr[7];
-    motorola[5] = x87_ptr[6];
-    motorola[6] = x87_ptr[5];
-    motorola[7] = x87_ptr[4];
-    motorola[8] = x87_ptr[3];
-    motorola[9] = x87_ptr[2];
-    motorola[10] = x87_ptr[1];
-    motorola[11] = x87_ptr[0];
-}
-
-static long double motorola_to_x87(const uint8_t motorola[12])
-{
-    uint8_t x87[12];
-    
-    x87[11] = 0;
-    x87[10] = 0;
-    x87[9] = motorola[0];
-    x87[8] = motorola[1];
-    
-    x87[7] = motorola[4];
-    x87[6] = motorola[5];
-    x87[5] = motorola[6];
-    x87[4] = motorola[7];
-    x87[3] = motorola[8];
-    x87[2] = motorola[9];
-    x87[1] = motorola[10];
-    x87[0] = motorola[11];
-    return *(long double*)&x87[0];
-}
-
-void inst_fmovecr(uint16_t op, uint16_t ext)
-{
-    ~decompose(ext, 010111 rrr xxxxxxx);
-    
-    fmovecr_t *c = &fmovecr_zero;
-    
-    switch (x) {
-        case 0x00: c = &fmovecr_pi; break;
-        case 0x0b: c = &fmovecr_log10_2; break;
-        case 0x0c: c = &fmovecr_e; break;
-        case 0x0d: c = &fmovecr_log2_e; break;
-        case 0x0e: c = &fmovecr_log10_e; break;
-        case 0x0f: c = &fmovecr_zero; break;
-        case 0x30: c = &fmovecr_ln_2; break;
-        case 0x31: c = &fmovecr_ln_10; break;
-        case 0x32: c = &fmovecr_10_0; break;
-        case 0x33: c = &fmovecr_10_1; break;
-        case 0x34: c = &fmovecr_10_2; break;
-        case 0x35: c = &fmovecr_10_4; break;
-        case 0x36: c = &fmovecr_10_8; break;
-        case 0x37: c = &fmovecr_10_16; break;
-        case 0x38: c = &fmovecr_10_32; break;
-        case 0x39: c = &fmovecr_10_64; break;
-        case 0x3a: c = &fmovecr_10_128; break;
-        case 0x3b: c = &fmovecr_10_256; break;
-        case 0x3c: c = &fmovecr_10_512; break;
-        case 0x3d: c = &fmovecr_10_1024; break;
-        case 0x3e: c = &fmovecr_10_2048; break;
-        case 0x3f: c = &fmovecr_10_4096; break;
     }
     
-    // The constants in the 68881's ROM must be in the "intermediate" format, because they're rounded differently based on fpcr.rnd
-    const long double f = motorola_to_x87(c->dat[shoe.fpcr.b.mc_rnd]);
+got_address:
     
-    fpu_set_reg_cc(f, r);
+    /*
+     * Step 2: Load the data from the effective address
+     */
     
-    slog("inst_fmovecr: set fp%u=%.30Lg\n", r, shoe.fp[r]);
+    slog("raw=0x");
+    if (size <= 4) {
+        const uint32_t raw = lget(addr, size);
+        if (shoe.abort)
+            return 0;
+        printf("%x ", raw);
+        switch (format) {
+            case format_B:
+                *result = _int8_to_intermediate(raw & 0xff);
+                break;
+            case format_W:
+                *result = _int16_to_intermediate(raw & 0xffff);
+                break;
+            case format_L:
+                *result = _int32_to_intermediate(raw);
+                break;
+            case format_S:
+                *result = _single_to_intermediate(raw);
+                break;
+            default:
+                assert(0); /* never get here */
+        }
+    }
+    else { // if (size > 4) -> if format is double, extended, or packed
+        uint8_t buf[12];
+        uint32_t i;
+        
+        for (i = 0; i < size; i++) {
+            buf[i] = lget(addr + i, 1);
+            slog("%02x", buf[i]);
+            if (shoe.abort)
+                return 0;
+        }
+        
+        switch (format) {
+            case format_D:
+                *result = _double_to_intermediate(buf);
+                break;
+            case format_X:
+                *result = _extended_to_intermediate(buf);
+                break;
+            case format_Ps:
+            // case format_Pd: // not possible as a src specifier
+                // FIXME: implement packed formats
+                assert(!"Somebody tried to use a packed format!\n");
+                // _throw_illegal_instruction();
+                // return 0;
+            default:
+                assert(0); // never get here
+        }
+    }
     
-    // fpu_finalize_exceptions();
+got_data:
+    printf("\n");
+    return 1;
 }
 
-void dis_fmovecr(uint16_t op, uint16_t ext)
+#pragma mark Hacky low-precision transcendental implementations
+/*
+ * s -> sign, e -> biased exponent
+ * ma -> 48 high bits of the mantissa
+ * mb -> 64 low bits of the mantissa
+ */
+#define _assemble_float128(s, e, ma, mb) ({ \
+    const uint64_t _ma = (ma), _mb = (mb); \
+    const uint64_t _e = (e), _s = (s); \
+    float128 f = { \
+        .high = ((_s != 0) << 16) | (_e & 0x7fff), \
+        .low = _mb \
+    }; \
+    f.high = ((f.high) << 48) | _ma; \
+    f; \
+})
+
+#define HACKY_MATH_X86
+#ifdef HACKY_MATH_X86
+#define NATIVE double
+
+double _to_native(float128 f128)
 {
-    ~decompose(ext, 010111 rrr xxxxxxx);
-    
-    sprintf(dis.str, "fmovecr.x 0x%02x,fp%u", x, r);
+    float64 f64 = float128_to_float64(f128);
+    double result;
+    uint8_t *ptr = (uint8_t*)&result;
+    ptr[7] = (f64 >> 56) & 0xff;
+    ptr[6] = (f64 >> 48) & 0xff;
+    ptr[5] = (f64 >> 40) & 0xff;
+    ptr[4] = (f64 >> 32) & 0xff;
+    ptr[3] = (f64 >> 24) & 0xff;
+    ptr[2] = (f64 >> 16) & 0xff;
+    ptr[1] = (f64 >> 8) & 0xff;
+    ptr[0] = (f64 >> 0) & 0xff;
+    return result;
 }
 
-void inst_fmovem_control(uint16_t op, uint16_t ext)
+float128 _from_native(double n)
 {
-    ~decompose(op,  1111 001 000 mmmrrr);
-    ~decompose(op,  1111 001 000 MMMMMM);
+    float64 f64 = 0;
+    uint8_t *ptr = (uint8_t*)&n;
+    f64 = (f64 << 8) | ptr[7];
+    f64 = (f64 << 8) | ptr[6];
+    f64 = (f64 << 8) | ptr[5];
+    f64 = (f64 << 8) | ptr[4];
+    f64 = (f64 << 8) | ptr[3];
+    f64 = (f64 << 8) | ptr[2];
+    f64 = (f64 << 8) | ptr[1];
+    f64 = (f64 << 8) | ptr[0];
+    return float64_to_float128(f64);
+}
+
+#include <math.h>
+#define _native_cos(a) cos(a)
+#define _native_acos(a) acos(a)
+#define _native_cosh(a) cosh(a)
+#define _native_sin(a) sin(a)
+#define _native_asin(a) asin(a)
+#define _native_sinh(a) sinh(a)
+#define _native_tan(a) tan(a)
+#define _native_atan(a) atan(a)
+#define _native_tanh(a) tanh(a)
+#define _native_atanh(a) atanh(a)
+#define _native_pow(a, b) pow((a), (b))
+#define _native_exp(a) exp(a)
+#define _native_expm1(a) (exp(a) - 1.0) /* or expm1() */
+#define _native_log10(a) log10(a)
+#define _native_log2(a) (log(a) / log(2.0)) /* or log2() */
+#define _native_log(a) log(a)
+#define _native_log1p(a) log((a) + 1.0) /* or log1p() */
+
+const double _native_e = 2.71828182845904509;
+const double _native_10 = 10.0;
+const double _native_2 = 2.0;
+const double _native_1 = 1.0;
+
+#elif (defined(HACKY_MATH_PPC))
+#error "PowerPC hacky math isn't implemented yet"
+#else
+#error "You need to define HACKY_MATH_X86, or implement one for your arch"
+#endif
+
+static float128 _hack_cos (float128 x) {
+    return _from_native(_native_cos(_to_native(x)));
+}
+
+static float128 _hack_acos (float128 x) {
+    return _from_native(_native_acos(_to_native(x)));
+}
+
+static float128 _hack_cosh (float128 x) {
+    return _from_native(_native_cosh(_to_native(x)));
+}
+
+static float128 _hack_sin (float128 x) {
+    return _from_native(_native_sin(_to_native(x)));
+}
+
+static float128 _hack_asin (float128 x) {
+    return _from_native(_native_asin(_to_native(x)));
+}
+
+static float128 _hack_sinh (float128 x) {
+    return _from_native(_native_sinh(_to_native(x)));
+}
+
+static float128 _hack_tan (float128 x) {
+    return _from_native(_native_tan(_to_native(x)));
+}
+
+static float128 _hack_atan (float128 x) {
+    return _from_native(_native_atan(_to_native(x)));
+}
+
+static float128 _hack_tanh (float128 x) {
+    return _from_native(_native_tanh(_to_native(x)));
+}
+
+static float128 _hack_atanh (float128 x) {
+    return _from_native(_native_atanh(_to_native(x)));
+}
+
+static float128 _hack_etox (float128 x) {
+    return _from_native(_native_exp(_to_native(x)));
+}
+
+static float128 _hack_etoxm1 (float128 x) {
+    return _from_native(_native_expm1(_to_native(x)));
+}
+
+static float128 _hack_log10 (float128 x) {
+    return _from_native(_native_log10(_to_native(x)));
+}
+
+static float128 _hack_log2 (float128 x) {
+    return _from_native(_native_log2(_to_native(x)));
+}
+
+static float128 _hack_logn (float128 x) {
+    return _from_native(_native_log(_to_native(x)));
+}
+
+static float128 _hack_lognp1 (float128 x) {
+    return _from_native(_native_log1p(_to_native(x)));
+}
+
+static float128 _hack_tentox (float128 x) {
+    return _from_native(_native_pow(_native_10, _to_native(x)));
+}
+
+static float128 _hack_twotox (float128 x) {
+    return _from_native(_native_pow(_native_2, _to_native(x)));
+}
+
+#pragma mark FMATH! and all its helpers
+
+/* Function prototypes from SoftFloat/softfloat-specialize.h */
+char float128_is_nan(float128 a);
+char float128_is_signaling_nan (float128 a);
+
+
+static void inst_fmath_fmovecr (void)
+{
+    fpu_get_state_ptr();
+
+    /*
+     * FYI: these constants are stored in the "intermediate" 85-bit
+     *      format in the 6888x rom. This has the side effect that
+     *      they are rounded according to fpcr.mc_rnd.
+     *      We emulate the intermediate 85-bit format with float128.
+     */
+    
+    switch (fpu->fmath_op) {
+        case 0x00: // pi
+            fpu->result = _assemble_float128(0, 0x4000, 0x921fb54442d1, 0x8469898cc51701b8);
+            break;
+        case 0x0b: // log_10(2)
+            fpu->result = _assemble_float128(0, 0x3ffd, 0x34413509f79f, 0xef311f12b35816f9);
+            break;
+        case 0x0c: // e
+            fpu->result = _assemble_float128(0, 0x4000, 0x5bf0a8b14576, 0x95355fb8ac404e7a);
+            break;
+        case 0x0d: // log_2(e)
+            fpu->result = _assemble_float128(0, 0x3fff, 0x71547652b82f, 0xe1777d0ffda0d23a);
+            break;
+        case 0x0e: // log_10(e)
+            // NOTE: 68881 doesn't set inex2 for this one
+            // Also note: that's bogus. 68881 uses 3 trailing mantissa bits to do rounding,
+            // and those bits are non-zero for this number, so it must actually be stored
+            // incorrectly in the ROM.
+            // I'll emulate this by truncating the float128 mantissa.
+            
+            // fpu->result = _assemble_float128(0, 0x3ffd, 0xbcb7b1526e50, 0xe32a6ab7555f5a67);
+            fpu->result = _assemble_float128(0, 0x3ffd, 0xbcb7b1526e50, 0xe32a000000000000);
+            break;
+        case 0x0f: // 0.0
+            fpu->result = _assemble_float128(0, 0, 0, 0);
+            break;
+        case 0x30: // ln(2)
+            fpu->result = _assemble_float128(0, 0x3ffe, 0x62e42fefa39e, 0xf35793c7673007e5);
+            break;
+        case 0x31: // ln(10)
+            fpu->result = _assemble_float128(0, 0x4000, 0x26bb1bbb5551, 0x582dd4adac5705a6);
+            break;
+        case 0x32: // 1 (68kprm has typesetting issues everywhere. This one says 100, but means 10^0.)
+            fpu->result = _assemble_float128(0, 0x3fff, 0x0, 0x0);
+            break;
+        case 0x33: // 10
+            fpu->result = _assemble_float128(0, 0x4002, 0x400000000000, 0x0);
+            break;
+        case 0x34: // 10^2
+            fpu->result = _assemble_float128(0, 0x4005, 0x900000000000, 0x0);
+            break;
+        case 0x35: // 10^4
+            fpu->result = _assemble_float128(0, 0x400c, 0x388000000000, 0x0);
+            break;
+        case 0x36: // 10^8
+            fpu->result = _assemble_float128(0, 0x4019, 0x7d7840000000, 0x0);
+            break;
+        case 0x37: // 10^16
+            fpu->result = _assemble_float128(0, 0x4034, 0x1c37937e0800, 0x0);
+            break;
+        case 0x38: // 10^32
+            fpu->result = _assemble_float128(0, 0x4069, 0x3b8b5b5056e1, 0x6b3be04000000000);
+            break;
+        case 0x39: // 10^64
+            fpu->result = _assemble_float128(0, 0x40d3, 0x84f03e93ff9f, 0x4daa797ed6e38ed6);
+            break;
+        case 0x3a: // 10^128
+            fpu->result = _assemble_float128(0, 0x41a8, 0x27748f9301d3, 0x19bf8cde66d86d62);
+            break;
+        case 0x3b: // 10^256
+            fpu->result = _assemble_float128(0, 0x4351, 0x54fdd7f73bf3, 0xbd1bbb77203731fd);
+            break;
+        case 0x3c: // 10^512
+            fpu->result = _assemble_float128(0, 0x46a3, 0xc633415d4c1d, 0x238d98cab8a978a0);
+            break;
+        case 0x3d: // 10^1024
+            fpu->result = _assemble_float128(0, 0x4d48, 0x92eceb0d02ea, 0x182eca1a7a51e316);
+            break;
+        case 0x3e: // 10^2048
+            fpu->result = _assemble_float128(0, 0x5a92, 0x3d1676bb8a7a, 0xbbc94e9a519c6535);
+            break;
+        case 0x3f: // 10^4096
+            fpu->result = _assemble_float128(0, 0x7525, 0x88c0a4051441, 0x2f3592982a7f0094);
+            break;
+        default:
+            /*
+             * I wanted to include the actual values for the other ROM offsets,
+             * but they might be proprietary. Most of them are 0 anyways, and some
+             * cause FPU exceptions, even with all exceptions disabled... (?)
+             * 68040 FPSP just returns 0, so we'll do that too.
+             */
+            fpu->result = _assemble_float128(0, 0, 0, 0);
+            return ;
+    }
+}
+
+/*
+ * This is quick macro.pl macro to build a jump table
+ * for fmath instructions. It is probably slightly slower 
+ * to use a jump table rather than a big switch statement,
+ * but I think it looks cleaner.
+ */
+~newmacro(create_fmath_jump_table, 0, {
+    my $name_map = {};
+    my $op_map = {};
+    my $add = sub {
+        my $op = shift;
+        my $name = lc(shift);
+        my $mode = 'foo';
+        my $arch = 68881;
+        
+        foreach my $arg (@_) {
+            if (($arg eq 'monadic') or ($arg eq 'dyadic')) {
+                $mode = $arg;
+            }
+            elsif ($arg == 68040) {
+                $arch = $arg;
+            }
+            else {
+                croak("bad arg $arg");
+            }
+        }
+        
+        croak("didn't specify mode") if ($mode eq "foo");
+        croak("dup $op $name") if exists $op_map->{$op};
+        croak("bogus") if ($op > 127);
+        
+        $op_map->{$op} = {op => $op, name => $name, mode => $mode, arch => $arch};
+        $name_map->{$name} = $op_map->{$op};
+    };
+    
+    $add->(~b(1000000), 'fmove',    'monadic', 68040);
+    $add->(~b(1000100), 'fmove',    'monadic', 68040);
+    $add->(~b(0000000), 'fmove',    'monadic');
+    
+    $add->(~b(0000001), 'fint',     'monadic');
+    $add->(~b(0000010), 'fsinh',    'monadic');
+    $add->(~b(0000011), 'fintrz',   'monadic');
+    
+    $add->(~b(1000001), 'fsqrt',    'monadic', 68040);
+    $add->(~b(1000101), 'fsqrt',    'monadic', 68040);
+    $add->(~b(0000100), 'fsqrt',    'monadic');
+    
+    $add->(~b(0000110), 'flognp1',  'monadic');
+    $add->(~b(0001000), 'fetoxm1',  'monadic');
+    $add->(~b(0001001), 'ftanh',    'monadic');
+    $add->(~b(0001010), 'fatan',    'monadic');
+    $add->(~b(0001100), 'fasin',    'monadic');
+    $add->(~b(0001101), 'fatanh',   'monadic');
+    $add->(~b(0001110), 'fsin',     'monadic');
+    $add->(~b(0001111), 'ftan',     'monadic');
+    $add->(~b(0010000), 'fetox',    'monadic');
+    $add->(~b(0010001), 'ftwotox',  'monadic');
+    $add->(~b(0010010), 'ftentox',  'monadic');
+    $add->(~b(0010100), 'flogn',    'monadic');
+    $add->(~b(0010101), 'flog10',   'monadic');
+    $add->(~b(0010110), 'flog2',    'monadic');
+    
+    $add->(~b(1011000), 'fabs',     'monadic', 68040);
+    $add->(~b(1011100), 'fabs',     'monadic', 68040);
+    $add->(~b(0011000), 'fabs',     'monadic');
+    
+    $add->(~b(0011001), 'fcosh',    'monadic');
+    
+    $add->(~b(1011010), 'fneg',     'monadic', 68040);
+    $add->(~b(1011110), 'fneg',     'monadic', 68040);
+    $add->(~b(0011010), 'fneg',     'monadic');
+    
+    $add->(~b(0011100), 'facos',    'monadic');
+    $add->(~b(0011101), 'fcos',     'monadic');
+    $add->(~b(0011110), 'fgetexp',  'monadic');
+    $add->(~b(0011111), 'fgetman',  'monadic');
+    
+    $add->(~b(1100000), 'fdiv',     'dyadic', 68040);
+    $add->(~b(1100100), 'fdiv',     'dyadic', 68040);
+    $add->(~b(0100000), 'fdiv',     'dyadic');
+    
+    $add->(~b(0100001), 'fmod',     'dyadic');
+    
+    $add->(~b(1100010), 'fadd',     'dyadic', 68040);
+    $add->(~b(1100110), 'fadd',     'dyadic', 68040);
+    $add->(~b(0100010), 'fadd',     'dyadic');
+    
+    $add->(~b(1100011), 'fmul',     'dyadic', 68040);
+    $add->(~b(1100111), 'fmul',     'dyadic', 68040);
+    $add->(~b(0100011), 'fmul',     'dyadic');
+    
+    $add->(~b(0100100), 'fsgldiv',  'dyadic');
+    $add->(~b(0100101), 'frem',     'dyadic');
+    $add->(~b(0100110), 'fscale',   'dyadic');
+    $add->(~b(0100111), 'fsglmul',  'dyadic');
+    
+    $add->(~b(1101000), 'fsub',     'dyadic', 68040);
+    $add->(~b(1101100), 'fsub',     'dyadic', 68040);
+    $add->(~b(0101000), 'fsub',     'dyadic');
+    
+    $add->(~b(0110000), 'fsincos',  'monadic');
+    $add->(~b(0110001), 'fsincos',  'monadic');
+    $add->(~b(0110010), 'fsincos',  'monadic');
+    $add->(~b(0110011), 'fsincos',  'monadic');
+    $add->(~b(0110100), 'fsincos',  'monadic');
+    $add->(~b(0110101), 'fsincos',  'monadic');
+    $add->(~b(0110110), 'fsincos',  'monadic');
+    $add->(~b(0110111), 'fsincos',  'monadic');
+    
+    
+    $add->(~b(0111000), 'fcmp',     'dyadic');
+    $add->(~b(0111010), 'ftst',     'monadic');
+    
+    my $map_str = "fmath_impl_t *_fmath_map[128] = {\n";
+    my @inst_flags = (0) x 128;
+    
+    for (my $i=0; $i < 128; $i++) {
+        my $func_ptr = "NULL";
+        if (exists $op_map->{$i}) {
+            $func_ptr = 'inst_fmath_' . $op_map->{$i}->{name};
+            if ($op_map->{$i}->{mode} eq 'dyadic') {
+                $inst_flags[$i] |= 1;
+            }
+            if ($op_map->{$i}->{arch} == 68040) {
+                $inst_flags[$i] |= 2;
+            }
+        }
+
+        $map_str .= "\t" . $func_ptr . ",\n";
+    }
+    $map_str .= "};\n\nuint8_t _fmath_flags[128] = {\n";
+    
+    for (my $i=0; $i < 128; $i++) {
+        $map_str .= "\t" . sprintf('0x%02x', $inst_flags[$i]) . ",\n";
+    }
+    $map_str .= "};\n";
+    
+    $map_str .= "const char *_fmath_names[128] = {\n";
+    for (my $i=0; $i < 128; $i++) {
+        my $name = "f???";
+        if (exists $op_map->{$i}) {
+            $name = $op_map->{$i}->{name};
+        }
+        $map_str .= "\t\"" . $name . "\",\n";
+    }
+    $map_str .= "};\n";
+    
+    return $map_str;
+})
+
+static _Bool _float128_is_zero (float128 f)
+{
+    return ((f.high << 1) == 0) && (f.low == 0);
+}
+
+static _Bool _float128_is_neg (float128 f)
+{
+    return f.high >> 63;
+}
+
+static _Bool _float128_is_infinity (float128 f)
+{
+    const uint64_t frac_a = f.high & 0x0000ffffffffffff;
+    const uint64_t frac_b = f.low;
+    const uint16_t exp = (f.high >> 48) & 0x7fff;
+    
+    return (exp == 0x7fff) && ((frac_a | frac_b) == 0);
+}
+
+static _Bool _float128_is_nan (float128 f)
+{
+    const uint64_t frac_a = f.high & 0x0000ffffffffffff;
+    const uint64_t frac_b = f.low;
+    const uint16_t exp = (f.high >> 48) & 0x7fff;
+    
+    return (exp == 0x7fff) && ((frac_a | frac_b) != 0);
+}
+
+const float128 _nan128 = {
+    .high = 0xFFFF800000000000ULL,
+    .low = 0
+};
+
+const float128 _one128 = {
+    .high = 0x3fff000000000000ULL,
+    .low = 0
+};
+
+const float128 _zero128 = {
+    .high = 0,
+    .low = 0
+};
+
+static void inst_fmath_fabs ()
+{
+    fpu_get_state_ptr();
+    
+    /* Clear the sign bit */
+    fpu->result = fpu->source;
+    fpu->result.high <<= 1;
+    fpu->result.high >>= 1;
+}
+
+static void inst_fmath_facos ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    
+    /* Find the absolute value of source */
+    float128 tmp = fpu->source;
+    tmp.high <<= 1;
+    tmp.high >>= 1;
+    
+    /* If source is zero, result is +pi/2 */
+    if (source_zero) {
+        fpu->result = _assemble_float128(0, 0x3fff, 0x921fb54442d1, 0x8469898cc51701b8);
+        return;
+    }
+    /* If source isn't in range [-1, 1], return nan, set operr */
+    else if (!float128_le(tmp, _one128)) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    
+    fpu->result = _hack_acos(fpu->source);
+    /* Set inex2?? */
+}
+
+static void inst_fmath_fadd ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = float128_add(fpu->dest, fpu->source);
+    
+    /* 
+     * Throw operr (and return NaN) if operands are infinities
+     * with opposite signs. (I *think* softfloat is doing this
+     * corectly - the code's hard to read.)
+     */
+    if (float_exception_flags & float_flag_invalid)
+        es_operr = 1;
+    
+    /* Throw inex2 if the result is inexact */
+    if (float_exception_flags & float_flag_inexact)
+        es_inex2 = 1;
+    
+    /* Throw ovfl if the op overflowed */
+    if (float_exception_flags & float_flag_overflow)
+        es_ovfl = 1;
+    
+    /* Throw unfl if the op overflowed */
+    if (float_exception_flags & float_flag_underflow)
+        es_unfl = 1;
+}
+
+static void inst_fmath_fasin ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    
+    /* Find the absolute value of source */
+    float128 tmp = fpu->source;
+    tmp.high <<= 1;
+    tmp.high >>= 1;
+    
+    /* If source is zero, result is source */
+    if (source_zero) {
+        fpu->result = fpu->source;
+        return;
+    }
+    /* If source isn't in range [-1, 1], return nan, set operr */
+    else if (!float128_le(tmp, _one128)) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    
+    fpu->result = _hack_asin(fpu->source);
+    /* Set inex2?? */
+    /* Set unfl?? */
+}
+
+static void inst_fmath_fatan ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool source_sign = _float128_is_neg(fpu->source);
+    
+    /* If source is zero, result is source */
+    if (source_zero) {
+        fpu->result = fpu->source;
+        return;
+    }
+    /* If source is inf, result is +-pi/2 */
+    else if (source_inf) {
+        fpu->result = _assemble_float128(source_sign, 0x3fff, 0x921fb54442d1, 0x8469898cc51701b8);
+        return ;
+    }
+    
+    fpu->result = _hack_atan(fpu->source);
+    /* Set inex2?? */
+    /* Set unfl?? */
+}
+
+static void inst_fmath_fatanh ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool source_sign = _float128_is_neg(fpu->source);
+    
+    /* Take the absolute value of source */
+    float128 tmp = fpu->source;
+    tmp.high <<= 1;
+    tmp.high >>= 1;
+    
+    /* If source is 0, return source */
+    if (source_zero) {
+        fpu->result = fpu->source;
+        return;
+    }
+    /* If |source| == 1.0, set dz, return +-inf */
+    else if (float128_eq(tmp, _one128)) {
+        es_dz = 1;
+        fpu->result = _assemble_float128(source_sign, 0x7fff, 0, 0);
+        return;
+    }
+    /* If |source| > 1.0, set operr, return nan */
+    else if (!float128_le(tmp, _one128)) {
+        es_operr = 1;
+        fpu->result = _nan128;
+        return ;
+    }
+    
+    fpu->result = _hack_atanh(fpu->source);
+}
+
+static void inst_fmath_fcmp ()
+{
+    fpu_get_state_ptr();
+    
+    /* Don't write the result back to the register */
+    fpu->write_back = 0;
+    
+    fpu->result = float128_sub(fpu->dest, fpu->source);
+    
+    /*
+     * The 68881 docs say fcmp doesn't throw any exceptions
+     * based on the result, but I'm not sure I believe it.
+     
+     if (float_exception_flags & float_flag_invalid)
+        es_operr = 1;
+     
+     if (float_exception_flags & float_flag_inexact)
+        es_inex2 = 1;
+     */
+}
+
+static void inst_fmath_fcos ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    
+    /* If source is zero, result is +1.0 */
+    if (source_zero) {
+        fpu->result = _one128;
+        return;
+    }
+    /* If source is inf, result is nan, and set operr */
+    else if (source_inf) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    
+    fpu->result = _hack_cos(fpu->source);
+    /* Set inex2?? */
+}
+
+static void inst_fmath_fcosh ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    
+    /* If source is zero, result is +1.0 */
+    if (source_zero) {
+        fpu->result = _one128;
+        return;
+    }
+    /* If source is +/- inf, result is +inf */
+    else if (source_inf) {
+        fpu->result = _assemble_float128(0, 0x7fff, 0, 0);
+        return;
+    }
+    
+    fpu->result = _hack_cosh(fpu->source);
+}
+
+static void inst_fmath_fdiv ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = float128_div(fpu->dest, fpu->source);
+    
+    /* Throw operr (and return NaN) if both operands are zero */
+    if (float_exception_flags & float_flag_invalid)
+        es_operr = 1;
+    
+    /* Throw divide-by-zero if dividend is zero */
+    if (float_exception_flags & float_flag_divbyzero)
+        es_dz = 1;
+    
+    /* Throw inex2 if the result is inexact */
+    if (float_exception_flags & float_flag_inexact)
+        es_inex2 = 1;
+    
+    /* Throw ovfl if the op overflowed */
+    if (float_exception_flags & float_flag_overflow)
+        es_ovfl = 1;
+    
+    /* Throw unfl if the op overflowed */
+    if (float_exception_flags & float_flag_underflow)
+        es_unfl = 1;
+}
+
+static void inst_fmath_fetox ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool source_sign = _float128_is_neg(fpu->source);
+    
+    /* If source is zero, result is +1.0 */
+    if (source_zero) {
+        fpu->result = _one128;
+        return ;
+    }
+    /* if source is -inf, result is +0.0 */
+    else if (source_inf && source_sign) {
+        fpu->result = _zero128;
+        return ;
+    }
+    /* if source is +inf, result is +inf */
+    else if (source_inf) {
+        fpu->result = fpu->source;
+        return ;
+    }
+    
+    fpu->result = _hack_etox(fpu->source);
+}
+
+static void inst_fmath_fetoxm1 ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool source_sign = _float128_is_neg(fpu->source);
+    
+    const float128 negone = _assemble_float128(1, 0x3fff, 0, 0);
+    
+    /* If source is zero, result is source */
+    if (source_zero) {
+        fpu->result = fpu->source;
+        return ;
+    }
+    /* if source is -inf, result is +0.0 */
+    else if (source_inf && source_sign) {
+        fpu->result = negone;
+        return ;
+    }
+    /* if source is +inf, result is +inf */
+    else if (source_inf) {
+        fpu->result = fpu->source;
+        return ;
+    }
+    
+    fpu->result = _hack_etoxm1(fpu->source);
+}
+
+static void inst_fmath_fgetexp ()
+{
+    fpu_get_state_ptr();
+    
+    /* If source is INF, set operr and return NaN */
+    if (((fpu->source.high << 1) == 0xfffe000000000000ULL) && (fpu->source.low == 0)) {
+        es_operr = 1;
+        fpu->result.high = 0xffff000000000000ULL;
+        fpu->result.low = 0xc000000000000000ULL;
+        return ;
+    }
+    
+    /*
+     * If source is 0, return source.
+     * According to 68881 docs, the result needs to have 
+     * the same sign as the source (why?)
+     */
+    if (((fpu->source.high << 1) == 0) && (fpu->source.low == 0)) {
+        fpu->result = fpu->source;
+        return ;
+    }
+    
+    /*
+     * Otherwise, extract the biased exponent, convert it
+     * to a two's complement integer, and store that value
+     * as a float.
+     */
+    const uint32_t biased = (fpu->source.high << 1) >> 49;
+    fpu->result = int32_to_float128(((int32_t)biased) - 16383);
+}
+
+static void inst_fmath_fgetman ()
+{
+    fpu_get_state_ptr();
+    
+    assert(!"fmath: fgetman not implemented");
+}
+
+static void inst_fmath_fint ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = float128_round_to_int(fpu->source);
+    
+    /* Throw inex2 if the result is inexact */
+    if (float_exception_flags & float_flag_inexact)
+        es_inex2 = 1;
+}
+
+static void inst_fmath_fintrz ()
+{
+    fpu_get_state_ptr();
+    
+    /* Same as fint, but force the round-to-zero mode */
+    
+    const signed char old_round_mode = float_rounding_mode;
+    float_rounding_mode = float_round_to_zero;
+    fpu->result = float128_round_to_int(fpu->source);
+    float_rounding_mode = old_round_mode;
+    
+    /* Throw inex2 if the result is inexact */
+    if (float_exception_flags & float_flag_inexact)
+        es_inex2 = 1;
+    
+}
+
+static void inst_fmath_flog10 ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool source_sign = _float128_is_neg(fpu->source);
+    
+    /* If source is zero, set dz, result is -inf */
+    if (source_zero) {
+        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
+        es_dz = 1;
+        return;
+    }
+    /* If source is negative, set operr, result is nan */
+    else if (source_sign) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return;
+    }
+    /* If source is +inf, result is +inf. */
+    else if (source_inf) {
+        fpu->result = fpu->source;
+        return;
+    }
+    
+    fpu->result = _hack_log10(fpu->source);
+}
+
+static void inst_fmath_flog2 ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool source_sign = _float128_is_neg(fpu->source);
+    
+    /* If source is zero, set dz, result is -inf */
+    if (source_zero) {
+        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
+        es_dz = 1;
+        return;
+    }
+    /* If source is negative, set operr, result is nan */
+    else if (source_sign) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return;
+    }
+    /* If source is +inf, result is +inf. */
+    else if (source_inf) {
+        fpu->result = fpu->source;
+        return;
+    }
+    
+    fpu->result = _hack_log2(fpu->source);
+}
+
+static void inst_fmath_flognp1 ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool source_sign = _float128_is_neg(fpu->source);
+    
+    const float128 negone = _assemble_float128(1, 0x3fff, 0, 0);
+    
+    /* If source is zero, result is source */
+    if (source_zero) {
+        fpu->result = fpu->source;
+        return;
+    }
+    /* If source is -1.0, set dz, result is -inf */
+    else if (float128_eq(negone, fpu->source)) {
+        es_dz = 1;
+        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
+        return;
+    }
+    /* If source < -1.0, set operr, result is nan */
+    else if (float128_lt(fpu->source, negone)) {
+        es_operr = 1;
+        fpu->result = _nan128;
+        return;
+    }
+    /* If source is +inf, result is +inf. */
+    else if (source_inf) {
+        fpu->result = fpu->source;
+        return;
+    }
+    
+    fpu->result = _hack_lognp1(fpu->source);
+}
+
+static void inst_fmath_flogn ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool source_sign = _float128_is_neg(fpu->source);
+    
+    /* If source is zero, set dz, result is -inf */
+    if (source_zero) {
+        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
+        es_dz = 1;
+        return;
+    }
+    /* If source is negative, set operr, result is nan */
+    else if (source_sign) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return;
+    }
+    /* If source is +inf, result is +inf. */
+    else if (source_inf) {
+        fpu->result = fpu->source;
+        return;
+    }
+    
+    fpu->result = _hack_logn(fpu->source);
+}
+
+static void inst_fmath_fmove ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = fpu->source;
+}
+
+static void inst_fmath_fmul ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = float128_mul(fpu->dest, fpu->source);
+    
+    /*
+     * Throw operr (and return NaN) if one operand is infinity
+     * and the other is zero.
+     */
+    if (float_exception_flags & float_flag_invalid)
+        es_operr = 1;
+    
+    /* Throw inex2 if the result is inexact */
+    if (float_exception_flags & float_flag_inexact)
+        es_inex2 = 1;
+    
+    /* Throw ovfl if the op overflowed */
+    if (float_exception_flags & float_flag_overflow)
+        es_ovfl = 1;
+    
+    /* Throw unfl if the op overflowed */
+    if (float_exception_flags & float_flag_underflow)
+        es_unfl = 1;
+}
+
+static void inst_fmath_fneg ()
+{
+    fpu_get_state_ptr();
+    
+    /* Flip the sign bit */
+    fpu->result = fpu->source;
+    fpu->result.high ^= (1ULL << 63);
+    
+    /* 
+     * FIXME: you're supposed to throw UNFL if this is a
+     *        denormalized number, I think.
+     */
+}
+
+static void inst_fmath_frem ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool dest_zero = _float128_is_zero(fpu->dest);
+    const _Bool dest_inf = _float128_is_infinity(fpu->dest);
+    
+    /* I just assume the quotient/sign are 0 for the following cases */
+    qu_quotient = 0;
+    qu_s = 0;
+    
+    /* If source is zero, result is nan */
+    if (source_zero) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    /* If dest (but not source) is zero, result is that zero */
+    else if (dest_zero) {
+        fpu->result = fpu->dest;
+        return ;
+    }
+    /* If dest is infinity, result is nan */
+    else if (dest_inf) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    /* If source, but not dest, is infinity, result is dest */
+    else if (source_inf) {
+        fpu->result = fpu->dest;
+        return ;
+    }
+    
+    /* -- We're past the edge cases, do the actual op -- */
+    
+    const signed char old_round_mode = float_rounding_mode;
+    
+    /* frem uses round-to-nearest */
+    float_rounding_mode = float_round_nearest_even;
+    
+    float128 N = float128_div(fpu->dest, fpu->source);
+    N = float128_round_to_int(N);
+    
+    float_rounding_mode = old_round_mode;
+    
+    fpu->result = float128_sub(fpu->dest, float128_mul(fpu->source, N));
+    
+    /* FIXME: not sure how to set unfl reliably */
+    
+    _Bool sign = N.high >> 63; /* Remember the sign */
+    N.high <<= 1; /* Clear the sign */
+    N.high >>= 1;
+    uint32_t final = float128_to_int32(N); /* Get the integer of the quotient */
+    qu_quotient = final & 0x7f;
+    qu_s = sign;
+}
+
+static void inst_fmath_fmod ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool dest_zero = _float128_is_zero(fpu->dest);
+    const _Bool dest_inf = _float128_is_infinity(fpu->dest);
+    
+    /* I just assume the quotient/sign are 0 for the following cases */
+    qu_quotient = 0;
+    qu_s = 0;
+    
+    /* If source is zero, result is nan */
+    if (source_zero) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    /* If dest (but not source) is zero, result is that zero */
+    else if (dest_zero) {
+        fpu->result = fpu->dest;
+        return ;
+    }
+    /* If dest is infinity, result is nan */
+    else if (dest_inf) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    /* If source, but not dest, is infinity, result is dest */
+    else if (source_inf) {
+        fpu->result = fpu->dest;
+        return ;
+    }
+    
+    /* -- We're past the edge cases, do the actual op -- */
+    
+    const signed char old_round_mode = float_rounding_mode;
+    
+    /* fmod uses round-to-zero */
+    float_rounding_mode = float_round_to_zero;
+    
+    float128 N = float128_div(fpu->dest, fpu->source);
+    N = float128_round_to_int(N);
+    
+    float_rounding_mode = old_round_mode;
+    
+    fpu->result = float128_sub(fpu->dest, float128_mul(fpu->source, N));
+    
+    /* FIXME: not sure how to set unfl reliably */
+    
+    _Bool sign = N.high >> 63; /* Remember the sign */
+    N.high <<= 1; /* Clear the sign */
+    N.high >>= 1;
+    uint32_t final = float128_to_int32(N); /* Get the integer of the quotient */
+    qu_quotient = final & 0x7f;
+    qu_s = sign;
+}
+
+static void inst_fmath_fscale ()
+{
+    fpu_get_state_ptr();
+    
+    assert(!"fmath: fscale not implemented");
+}
+
+static void inst_fmath_fsgldiv ()
+{
+    fpu_get_state_ptr();
+    
+    float128 source = fpu->source;
+    float128 dest = fpu->dest;
+    
+    /* Dump the low 88 bits of the source/dest mantissas */
+    source.low = 0;
+    source.high &= 0xffffffffff000000;
+    dest.low = 0;
+    dest.high &= 0xffffffffff000000;
+    
+    fpu->result = float128_div(dest, source);
+    
+    /* Throw operr (and return NaN) if both operands are zero */
+    if (float_exception_flags & float_flag_invalid)
+        es_operr = 1;
+    
+    /* Throw divide-by-zero if dividend is zero */
+    if (float_exception_flags & float_flag_divbyzero)
+        es_dz = 1;
+    
+    /* Throw inex2 if the result is inexact */
+    if (float_exception_flags & float_flag_inexact)
+        es_inex2 = 1;
+    
+    /* Throw ovfl if the op overflowed */
+    if (float_exception_flags & float_flag_overflow)
+        es_ovfl = 1;
+    
+    /* Throw unfl if the op overflowed */
+    if (float_exception_flags & float_flag_underflow)
+        es_unfl = 1;
+}
+
+static void inst_fmath_fsglmul ()
+{
+    fpu_get_state_ptr();
+    
+    /*
+     * As far as I can tell, fsglmul/fsgldiv use an ALU
+     * for the mantissa that is only 24-bits wide. Everything
+     * else is done with regular internal precision.
+     */
+    
+    float128 source = fpu->source;
+    float128 dest = fpu->dest;
+    
+    /* Dump the low 88 bits of the source/dest mantissas */
+    source.low = 0;
+    source.high &= 0xffffffffff000000;
+    dest.low = 0;
+    dest.high &= 0xffffffffff000000;
+    
+    fpu->result = float128_mul(dest, source);
+    
+    /*
+     * Throw operr (and return NaN) if one operand is infinity
+     * and the other is zero.
+     */
+    if (float_exception_flags & float_flag_invalid)
+        es_operr = 1;
+    
+    /* Throw inex2 if the result is inexact */
+    if (float_exception_flags & float_flag_inexact)
+        es_inex2 = 1;
+    
+    /* Throw ovfl if the op overflowed */
+    if (float_exception_flags & float_flag_overflow)
+        es_ovfl = 1;
+    
+    /* Throw unfl if the op overflowed */
+    if (float_exception_flags & float_flag_underflow)
+        es_unfl = 1;
+}
+
+static void inst_fmath_fsin ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    
+    /* If source is zero, result is source */
+    if (source_zero) {
+        fpu->result = fpu->source;
+        return;
+    }
+    /* If source is inf, result is nan, and set operr */
+    else if (source_inf) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    
+    fpu->result = _hack_sin(fpu->source);
+    /* Set inex2?? */
+}
+
+static void inst_fmath_fsincos ()
+{
+    fpu_get_state_ptr();
+    
+    assert(!"fmath: fsincos not implemented");
+}
+
+static void inst_fmath_fsinh ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    
+    /* If source is zero or inf, return source */
+    if (source_zero || source_inf) {
+        fpu->result = fpu->source;
+        return;
+    }
+    
+    fpu->result = _hack_sinh(fpu->source);
+}
+
+static void inst_fmath_fsqrt ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = float128_sqrt(fpu->source);
+    
+    /* Throw operr (and return NaN) if the operand is < 0 */
+    if (float_exception_flags & float_flag_invalid)
+        es_operr = 1;
+    
+    /* Throw inex2 if the result is inexact */
+    if (float_exception_flags & float_flag_inexact)
+        es_inex2 = 1;
+}
+
+static void inst_fmath_fsub ()
+{
+    fpu_get_state_ptr();
+    
+    fpu->result = float128_sub(fpu->dest, fpu->source);
+    
+    /*
+     * Throw operr (and return NaN) if operands are infinities
+     * with equal signs. (I *think* softfloat is doing this
+     * corectly - the code's hard to read.)
+     *
+     * Both 68kprm and 68881 docs say that (+inf) - (-inf) = (-inf)
+     * but I presume that's a typo, and it's supposed to be (+inf)
+     */
+    if (float_exception_flags & float_flag_invalid)
+        es_operr = 1;
+    
+    /* Throw inex2 if the result is inexact */
+    if (float_exception_flags & float_flag_inexact)
+        es_inex2 = 1;
+    
+    /* Throw ovfl if the op overflowed */
+    if (float_exception_flags & float_flag_overflow)
+        es_ovfl = 1;
+    
+    /* Throw unfl if the op overflowed */
+    if (float_exception_flags & float_flag_underflow)
+        es_unfl = 1;
+}
+
+static void inst_fmath_ftan ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    
+    /* If source is zero, result is source */
+    if (source_zero) {
+        fpu->result = fpu->source;
+        return;
+    }
+    /* If source is inf, result is nan, and set operr */
+    else if (source_inf) {
+        fpu->result = _nan128;
+        es_operr = 1;
+        return ;
+    }
+    
+    fpu->result = _hack_tan(fpu->source);
+    /* Set inex2?? */
+}
+
+static void inst_fmath_ftanh ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool source_sign = _float128_is_neg(fpu->source);
+    
+    /* If source is zero, result is source */
+    if (source_zero) {
+        fpu->result = fpu->source;
+        return;
+    }
+    /* If source is +/- inf, result is +/- 1.0 */
+    else if (source_inf) {
+        fpu->result = _assemble_float128(source_sign, 0x3fff, 0, 0);
+        return;
+    }
+    
+    fpu->result = _hack_tanh(fpu->source);
+}
+
+static void inst_fmath_ftentox ()
+{
+    fpu_get_state_ptr();
+    
+    // _hack_ftentox() is broken on clang 3.5 on osx 10.10
+    // (tries to optimize pow(10.0, x) to __exp10(x), and __exp10
+    //  isn't implemented in the 10.8 SDK)
+//    const _Bool source_zero = _float128_is_zero(fpu->source);
+//    const _Bool source_inf = _float128_is_infinity(fpu->source);
+//    const _Bool source_sign = _float128_is_neg(fpu->source);
+//    
+//    /* If source is zero, result is +1.0 */
+//    if (source_zero) {
+//        fpu->result = _one128;
+//        return ;
+//    }
+//    /* if source is -inf, result is +0.0 */
+//    else if (source_inf && source_sign) {
+//        fpu->result = _zero128;
+//        return ;
+//    }
+//    /* if source is +inf, result is +inf */
+//    else if (source_inf) {
+//        fpu->result = fpu->source;
+//        return ;
+//    }
+//    
+//    fpu->result = _hack_tentox(fpu->source);
+//    
+    assert(!"fmath: ftentox not implemented");
+}
+
+static void inst_fmath_ftst ()
+{
+    fpu_get_state_ptr();
+    
+    /* Don't write the result back to the register */
+    fpu->write_back = 0;
+    
+    /* ftst just sets the cond codes according to the source */
+    fpu->result = fpu->source;
+}
+
+static void inst_fmath_ftwotox ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool source_zero = _float128_is_zero(fpu->source);
+    const _Bool source_inf = _float128_is_infinity(fpu->source);
+    const _Bool source_sign = _float128_is_neg(fpu->source);
+    
+    /* If source is zero, result is +1.0 */
+    if (source_zero) {
+        fpu->result = _one128;
+        return ;
+    }
+    /* if source is -inf, result is +0.0 */
+    else if (source_inf && source_sign) {
+        fpu->result = _zero128;
+        return ;
+    }
+    /* if source is +inf, result is +inf */
+    else if (source_inf) {
+        fpu->result = fpu->source;
+        return ;
+    }
+    
+    fpu->result = _hack_twotox(fpu->source);
+}
+
+typedef void (fmath_impl_t)(void);
+#define FMATH_TYPE_DYADIC 1
+#define FMATH_TYPE_68040 2
+~create_fmath_jump_table()
+
+
+/*
+ * Take fpu->result, and round and crop it to the
+ * preferred precision, then return the result as
+ * a floatx80. (Set all the appropriate exception bits
+ * too)
+ *
+ * ALSO!! This checks and sets underflow/overflow
+ */
+static floatx80 _fmath_round_intermediate_result ()
+{
+    fpu_get_state_ptr();
+    floatx80 final;
+    
+    float_exception_flags = 0; // (so we can know if the result is inexact)
+    _set_rounding_mode(mc_rnd); // Set the preferred rounding mode
+    
+    if (mc_prec == prec_extended) { // extended precision
+        final = float128_to_floatx80(fpu->result);
+        es_inex2 |= ((float_exception_flags & float_flag_inexact) != 0);
+        es_unfl |= ((float_exception_flags & float_flag_underflow) != 0);
+        es_ovfl |= ((float_exception_flags & float_flag_overflow) != 0);
+    }
+    else if (mc_prec == prec_double) { // double precision
+        float64 tmp = float128_to_float64(fpu->result);
+        es_inex2 |= ((float_exception_flags & float_flag_inexact) != 0);
+        es_unfl |= ((float_exception_flags & float_flag_underflow) != 0);
+        es_ovfl |= ((float_exception_flags & float_flag_overflow) != 0);
+        final = float64_to_floatx80(tmp);
+    }
+    else if (mc_prec == prec_single) { // single precision
+        float32 tmp = float128_to_float32(fpu->result);
+        es_inex2 |= ((float_exception_flags & float_flag_inexact) != 0);
+        es_unfl |= ((float_exception_flags & float_flag_underflow) != 0);
+        es_ovfl |= ((float_exception_flags & float_flag_overflow) != 0);
+        final = float32_to_floatx80(tmp);
+    }
+    else
+        assert(!"bogus precision mode???");
+    
+    return final;
+}
+
+static void _fmath_set_condition_codes (floatx80 val)
+{
+    fpu_get_state_ptr();
+    const uint64_t frac = val.low;
+    const uint32_t exp = val.high & 0x7fff;
+    const _Bool sign = val.high >> 15;
+    
+    /* Clear the whole CC register byte */
+    fpu->fpsr.raw &= 0x00ffffff;
+    
+    /* Check for zero */
+    cc_z = ((exp == 0) && (frac == 0));
+    
+    /* Check for negative */
+    cc_n = sign;
+    
+    /* Check for NaN */
+    cc_nan = ((exp == 0x7fff) && ((frac << 1) != 0));
+    
+    /* Check for infinity */
+    cc_i = ((exp == 0x7fff) && ((frac << 1) == 0));
+}
+
+static void _fmath_handle_nans ()
+{
+    fpu_get_state_ptr();
+    
+    const _Bool is_dyadic = _fmath_flags[fpu->fmath_op] & FMATH_TYPE_DYADIC;
+    const _Bool is_signaling = float128_is_signaling_nan(fpu->source) ||
+                                (is_dyadic && float128_is_signaling_nan(fpu->dest));
+    const _Bool is_source_nan = float128_is_nan(fpu->source);
+    const _Bool is_dest_nan = is_dyadic && float128_is_nan(fpu->dest);
+    
+    /*
+     * If the dest is NaN, or both are NaN, let the result be set to dest.
+     * (with signaling disabled)
+     */
+    if (is_dest_nan)
+        fpu->result = fpu->dest;
+    else {
+        assert(is_source_nan);
+        fpu->result = fpu->source;
+    }
+    
+    /* Set the snan exception status bit */
+    es_snan = is_signaling;
+    
+    /* Silence the result */
+    // Signaling -> 0
+    // Non-signaling -> 1
+    fpu->result.high |= 0x800000000000;
+}
+
+void dis_fmath (uint16_t op, uint16_t ext, char *output)
+{
+    ~decompose(op, 1111 001 000 MMMMMM);
+    ~decompose(ext, 0 a 0 sss ddd eeeeeee);
+    
+    const uint8_t src_in_ea = a;
+    const uint8_t source_specifier = s;
+    const uint8_t dest_register = d;
+    const uint8_t extension = e;
+    
+    /* If this is fmovecr */
+    if (src_in_ea && (source_specifier == 7)) {
+        const char *name = NULL;
+        switch (extension) {
+            case 0x00: name = "pi"; break;
+            case 0x0b: name = "log_10(2)"; break;
+            case 0x0c: name = "c"; break;
+            case 0x0d: name = "log_2(e)"; break;
+            case 0x0e: name = "log_10(e)"; break;
+            case 0x0f: name = "0.0"; break;
+            case 0x30: name = "ln(2)"; break;
+            case 0x31: name = "ln(10)"; break;
+            case 0x32: name = "1.0"; break;
+            case 0x33: name = "10.0"; break;
+            case 0x34: name = "10.0^2"; break;
+            case 0x35: name = "10.0^4"; break;
+            case 0x36: name = "10.0^8"; break;
+            case 0x37: name = "10.0^16"; break;
+            case 0x38: name = "10.0^32"; break;
+            case 0x39: name = "10.0^64"; break;
+            case 0x3a: name = "10.0^128"; break;
+            case 0x3b: name = "10.0^256"; break;
+            case 0x3c: name = "10.0^512"; break;
+            case 0x3d: name = "10.0^1024"; break;
+            case 0x3e: name = "10.0^2048"; break;
+            case 0x3f: name = "10.0^4096"; break;
+        }
+        if (name == NULL)
+            sprintf(output, "fmovecr.x #%u,fp%u", extension, dest_register);
+        else
+            sprintf(output, "fmovecr.x %s,fp%u", name, dest_register);
+        return;
+    }
+    
+    if (_fmath_map[e] == NULL) {
+            /* This instruction isn't defined */
+            sprintf(output, "fmath???");
+    }
+    else if (_fmath_map[e] == inst_fmath_fsincos) {
+        /* fsincos.<fmt> <ea>,FPc:FPs */
+        if (src_in_ea)
+            sprintf(output, "fsincos.%c %s,fp%u:fp%u", "lsxpwdb?"[source_specifier],
+                    decode_ea_rw(M, _format_sizes[source_specifier]), e & 7, dest_register);
+        else
+            sprintf(output, "fsincos.x fp%u,fp%u:fp%u", source_specifier, e & 7, dest_register);
+    }
+    else if (_fmath_map[e] == inst_fmath_ftst) {
+        /* ftst.<fmt> <source> */
+        if (src_in_ea)
+            sprintf(output, "ftst.%c %s", "lsxpwdb?"[source_specifier],
+                    decode_ea_rw(M, _format_sizes[source_specifier]));
+        else
+            sprintf(output, "ftst.x fp%u", dest_register);
+    }
+    else {
+        /* f<inst>.<fmt> <source>,<dest> */
+        if (src_in_ea)
+            sprintf(output, "%s.%c %s,fp%u", _fmath_names[e], "lsxpwdb?"[source_specifier],
+                    decode_ea_rw(M, _format_sizes[source_specifier]), dest_register);
+        else
+            sprintf(output, "%s.x fp%u,fp%u", _fmath_names[e],
+                    source_specifier, dest_register);
+    }
+}
+
+static long double _float128_to_long_double(float128 f128)
+{
+    long double result;
+    uint8_t *ptr = (uint8_t*)&result;
+    
+    int8_t old = float_exception_flags;
+    floatx80 f80 = float128_to_floatx80(f128);
+    float_exception_flags = old;
+    
+    ptr[9] = (f80.high >> 8) & 0xff;
+    ptr[8] = (f80.high >> 0) & 0xff;
+    ptr[7] = (f80.low >> 56) & 0xff;
+    ptr[6] = (f80.low >> 48) & 0xff;
+    ptr[5] = (f80.low >> 40) & 0xff;
+    ptr[4] = (f80.low >> 32) & 0xff;
+    ptr[3] = (f80.low >> 24) & 0xff;
+    ptr[2] = (f80.low >> 16) & 0xff;
+    ptr[1] = (f80.low >> 8) & 0xff;
+    ptr[0] = (f80.low >> 0) & 0xff;
+    
+    return result;
+}
+
+static long double _floatx80_to_long_double(floatx80 f80)
+{
+    long double result;
+    uint8_t *ptr = (uint8_t*)&result;
+    
+    ptr[9] = (f80.high >> 8) & 0xff;
+    ptr[8] = (f80.high >> 0) & 0xff;
+    ptr[7] = (f80.low >> 56) & 0xff;
+    ptr[6] = (f80.low >> 48) & 0xff;
+    ptr[5] = (f80.low >> 40) & 0xff;
+    ptr[4] = (f80.low >> 32) & 0xff;
+    ptr[3] = (f80.low >> 24) & 0xff;
+    ptr[2] = (f80.low >> 16) & 0xff;
+    ptr[1] = (f80.low >> 8) & 0xff;
+    ptr[0] = (f80.low >> 0) & 0xff;
+    
+    return result;
+}
+
+static void inst_fmath (const uint16_t ext)
+{
+    fpu_get_state_ptr();
+    
+    floatx80 rounded_result;
+    
+    ~decompose(shoe.op, 1111 001 000 MMMMMM);
+    ~decompose(ext, 0 a 0 sss ddd eeeeeee);
+    
+    const uint8_t src_in_ea = a;
+    const uint8_t source_specifier = s;
+    const uint8_t dest_register = d;
+    const uint8_t extension = e;
+    
+    slog("FPU:---\n");
+    
+    /* Throw illegal instruction for 040-only ops */
+    if (_fmath_flags[e] & FMATH_TYPE_68040) {
+        _throw_illegal_instruction();
+        return;
+    }
+    
+    /*
+     * All the documented fmath ops have an implementation in
+     * _fmath_map[]. If it's NULL, it's not documented; throw
+     * an exception.
+     * This probably matches what the 68040's behavior (I haven't
+     * checked), but the 68881 doesn't do this.
+     * 68881 throws an illegal instruction exception for all
+     * opcodes where the high (6th) bit of e is set.
+     * All other instructions seem to short circuit to the 
+     * nearest documented instruction.
+     * FIXME: consider implementing this behavior.
+     */
+    if (_fmath_map[e] == NULL) {
+        /* Unless this is fmovecr, where the extension doesn't matter */
+        if (!(src_in_ea && (source_specifier == 7))) {
+            _throw_illegal_instruction();
+            return ;
+        }
+    }
+    
+    /* We only need to load the dest reg for dyadic ops */
+    if (_fmath_flags[e] & FMATH_TYPE_DYADIC)
+        fpu->dest = floatx80_to_float128(fpu->fp[dest_register]);
+    
+    /*
+     * We'll shrink the precision and perform rounding
+     * just prior to writing back the result.
+     * Certain instructions override the precision
+     * in fpcr, so keep track of the prefered prec here.
+     */
+    enum rounding_precision_t rounding_prec = mc_prec;
+    
+    /*
+     * For all the intermediate calculations, we
+     * probably want to use nearest-rounding mode.
+     */
+    _set_rounding_mode(mode_nearest);
+    
+    /* Reset softfloat's exception flags */
+    float_exception_flags = 0;
+    
+    /* Reset fpsr's exception flags */
+    es_inex1 = 0; // this is only set for imprecisely-rounded packed inputs (not implemented)
+    es_inex2 = 0; // set if we ever lose precision (during the op or during rounding)
+    es_dz = 0;    // set if we divided by zero
+    es_unfl = 0;  // set if we underflowed (inex2 should be set too, I think)
+    es_ovfl = 0;  // set if we overflowed (inex2 should be set too, I think)
+    es_operr = 0; // set if there was an instruction specific operand error
+    es_snan = 0;  // Set if one of the inputs was a signaling NaN
+    es_bsun = 0;  // never set here
+    
+    fpu->write_back = 1; // let "do-write-back" be the default behavior
+    fpu->fmath_op = e;
+    
+    /* Handle fmovecr */
+    if (src_in_ea && (source_specifier == 7)) { // fmovecr
+        /* 
+         * 68kprm says M should be ~b(000000), but apparently
+         * any value will work for fmovecr
+         */
+        slog("FPU: fmovecr %u,fp%u\n", e, dest_register);
+        inst_fmath_fmovecr();
+        goto computation_done;
+    }
+    
+    /*
+     * Read in the source from the EA or from a register.
+     * In either case, convert the value to a float128,
+     * (that's our version of the 85-bit "intermediate" format)
+     */
+    if (src_in_ea) {
+        if (!_fpu_read_ea(source_specifier, &fpu->source))
+            return ;
+        slog("FPU: %s.%c ", _fmath_names[e], "lsxpwdb?"[source_specifier]);
+    }
+    else {
+        fpu->source = floatx80_to_float128(fpu->fp[source_specifier]);
+        slog("FPU: %s.x ", _fmath_names[e], "lsxpwdb?"[source_specifier]);
+    }
+
+    
+    {
+        long double tmp = _float128_to_long_double(fpu->source);
+        printf("%Lf,fp%u\n", tmp, dest_register);
+    }
+    
+    /*
+     * If the source is NaN, or this is a dyadic (two-operand)
+     * instruction, and the second operand (fpu->dest) is NaN,
+     * then the result is predetermined: NaN
+     */
+    if (float128_is_nan(fpu->source) ||
+             (((_fmath_flags[e] & FMATH_TYPE_DYADIC) &&
+               float128_is_nan(fpu->dest)))) {
+        _fmath_handle_nans();
+        goto computation_done;
+    }
+    
+    /* 
+     * Otherwise, call the extension-specific helper function.
+     * Guarantees: Neither source nor dest are NaN
+     *             SoftFloat's exception flags have been cleared
+     */
+    _fmath_map[e]();
+    
+    /* 
+     * At this point, the "computation"-phase (I forget what the correct
+     * 6888x term is) is over. Now we check exception bits, throw exceptions,
+     * compute condition codes, and round and store the result.
+     */
+computation_done:
+    
+    /* Convert the 128-bit result to the specified precision */
+    /*
+     * FIXME: If fpu->write_back==0, should we still go through rounding?
+     *        The condition codes will still need to be set. Should they
+     *        be set based on the intermediate result or rounded result?
+     */
+    rounded_result = _fmath_round_intermediate_result();
+    
+    
+    /* Update the accrued exception bits */
+    
+    assert(!es_bsun); // no fmath op can throw es_bsun
+    
+    ae_iop |= es_bsun | es_snan | es_operr;
+    ae_ovfl |= es_ovfl;
+    ae_unfl |= (es_unfl & es_inex2); // yes, &
+    ae_dz |= es_dz;
+    ae_inex |= es_inex1 | es_inex2 | es_ovfl;
+    
+    slog("FPU: bsun=%u snan=%u operr=%u ovfl=%u unfl=%u dz=%u inex1=%u inex2=%u\n",
+           es_bsun, es_snan, es_operr, es_ovfl, es_unfl, es_dz, es_inex1, es_inex2);
+    
+    /* Are any exceptions both set and enabled? */
+    if (fpu->fpsr.raw & fpu->fpcr.raw & 0x0000ff00) {
+        /* 
+         * Then we need to throw an exception.
+         * The exception is sent to the vector for
+         * the highest priority exception, and the priority
+         * order is (high->low) bsan, snan, operr, ovfl, unfl, dz, inex2/1
+         * (which is the order of the bits in fpsr/fpcr).
+         * Iterate over the bits in order, and throw the
+         * exception to whichever bit is set first.
+         */
+        uint8_t i, throwable = (fpu->fpsr.raw & fpu->fpcr.raw) >> 8;
+        
+        slog("FPU: throw exception! 0x%08x\n", throwable);
+        
+        assert(throwable);
+        for (i=0; 1; i++) {
+            if (throwable & 0x80)
+                break;
+            throwable <<= 1;
+        }
+        
+        /*
+         * Convert the exception bit position
+         * to the correct vector number, and throw
+         * a (pre-instruction) exception.
+         */
+        throw_fpu_pre_instruction_exception(_exception_bit_to_vector[i]);
+        
+        return ;
+    }
+    
+    /*
+     * Otherwise, no exceptions to throw!
+     * Calculate the condition codes from the result.
+     */
+    _fmath_set_condition_codes(rounded_result);
+    
+    /*
+     * We're definitely running to completion now,
+     * so commit ea-read changes
+     */
+    _fpu_read_ea_commit(source_specifier);
+    
+    /* Write back the result, and we're done! */
+    if (fpu->write_back) {
+        fpu->fp[dest_register] = rounded_result;
+        
+        long double tmp = _floatx80_to_long_double(rounded_result);
+        slog("FPU: result = %Lf\n", tmp);
+    }
+}
+
+#pragma mark Second-hop non-fmath instructions
+
+/*
+ * reg->mem fmove (fmath handles all other fmoves
+ */
+static void inst_fmove (const uint16_t ext)
+{
+    fpu_get_state_ptr();
+    
+    ~decompose(shoe.op, 1111 001 000 MMMMMM);
+    ~decompose(shoe.op, 1111 001 000 mmmrrr);
+    ~decompose(ext, 011 fff sss KKKKKKK);
+    
+    _fpu_write_ea(M, f, &fpu->fp[s], K);
+}
+
+static void inst_fmovem_control (const uint16_t ext)
+{
+    fpu_get_state_ptr();
+    
+    ~decompose(shoe.op,  1111 001 000 mmmrrr);
+    ~decompose(shoe.op,  1111 001 000 MMMMMM);
     ~decompose(ext, 10d CSI 0000000000);
     
     const uint32_t count = C + S + I;
     const uint32_t size = count * 4;
-    
-    if (count == 0) // I don't know if this is even a valid instruction
-        return ;
-    
-    if ((m == 0 || m == 1) && (count > 1)) { // data and addr reg modes are valid, but only if count==1
-        throw_illegal_instruction();
-        return ;
-    }
-    
     uint32_t addr, buf[3];
     uint32_t i;
     
+    /* I don't know if this is even a valid instruction */
+    if (count == 0)
+        return ;
+    
+    /* data and addr reg modes are valid, but only if count==1 */
+    if ((m == 0 || m == 1) && (count > 1)) {
+        _throw_illegal_instruction();
+        return ;
+    }
+    
     if (d) { // reg to memory
         i=0;
-        if (C) buf[i++] = shoe.fpcr.raw;
-        if (S) buf[i++] = shoe.fpsr.raw;
-        if (I) buf[i++] = shoe.fpiar;
-
+        if (C) buf[i++] = fpu->fpcr.raw;
+        if (S) buf[i++] = fpu->fpsr.raw;
+        if (I) buf[i++] = fpu->fpiar;
+        
         if (m == 0) {
-            shoe.d[r] = buf[0];
+            if (count == 1)
+                shoe.d[r] = buf[0];
+            else
+                _throw_illegal_instruction();
             return ;
         }
         else if (m == 1) {
-            shoe.a[r] = buf[0];
+            if ((count == 1) && I)
+                shoe.a[r] = buf[0];
+            else
+                _throw_illegal_instruction();
             return ;
         }
         else if (m == 3)
@@ -549,6 +2624,11 @@ void inst_fmovem_control(uint16_t op, uint16_t ext)
         else if (m == 4)
             addr = shoe.a[r] - size;
         else {
+            if ((m==7) && (r!=0 || r!=1)) {
+                /* Not allowed for reg->mem */
+                _throw_illegal_instruction();
+                return;
+            }
             call_ea_addr(M);
             addr = shoe.dat;
         }
@@ -560,10 +2640,20 @@ void inst_fmovem_control(uint16_t op, uint16_t ext)
         }
     }
     else { // mem to reg
-        if (m == 0) // data reg
-            buf[0] = shoe.d[r];
-        else if (m == 1) // addr reg
-            buf[0] = shoe.a[r];
+        if (m == 0) {// data reg
+            if (count == 1)
+                buf[0] = shoe.d[r];
+            else
+                _throw_illegal_instruction();
+            return;
+        }
+        else if (m == 1) {// addr reg
+            if ((count == 1) && I)
+                buf[0] = shoe.a[r];
+            else
+                _throw_illegal_instruction();
+            return;
+        }
         else {
             if (m == 3) // post-increment
                 addr = shoe.a[r];
@@ -586,22 +2676,22 @@ void inst_fmovem_control(uint16_t op, uint16_t ext)
         i = 0;
         
         if (C) {
-            uint8_t round = shoe.fpcr.b.mc_rnd;
-            shoe.fpcr.raw = buf[i++];
-            uint8_t newround = shoe.fpcr.b.mc_rnd;
+            uint8_t round = fpu->fpcr.b._mc_rnd;
+            fpu->fpcr.raw = buf[i++];
+            uint8_t newround = fpu->fpcr.b._mc_rnd;
             
             if (round != newround) {
                 slog("inst_fmovem_control: HEY: round %u -> %u\n", round, newround);
             }
         }
-        if (S) shoe.fpsr.raw = buf[i++];
-        if (I) shoe.fpiar = buf[i++];
+        if (S) fpu->fpsr.raw = buf[i++];
+        if (I) fpu->fpiar = buf[i++];
         
         // Commit immediate-EA-mode PC change
         if (M == 0x3c)
             shoe.pc += size;
     }
-        
+    
     // Commit pre/post-inc/decrement
     
     if (m == 3)
@@ -612,133 +2702,16 @@ void inst_fmovem_control(uint16_t op, uint16_t ext)
     
     
     slog("inst_fmove_control: notice: (EA = %u/%u %08x CSI = %u%u%u)\n", m, r, (uint32_t)shoe.dat, C, S, I);
+    
+    
 }
 
-void dis_fmovem_control(uint16_t op, uint16_t ext)
+static void inst_fmovem (const uint16_t ext)
 {
-    ~decompose(op,  1111 001 000 mmmrrr);
-    ~decompose(op,  1111 001 000 MMMMMM);
-    ~decompose(ext, 10d CSI 0000000000);
+    fpu_get_state_ptr();
     
-    if (d)
-        sprintf(dis.str, "fmovem.l [%u%u%u],%s\n", C, S, I, decode_ea_addr(M)); // <- XXX: decode_ea_addr() is the wrong function to use
-    else
-        sprintf(dis.str, "fmovem.l %s,[%u%u%u]\n", decode_ea_addr(M), C, S, I); // <- XXX: decode_ea_addr() is the wrong function to use
-}
-
-
-static uint8_t fpu_test_cc(uint8_t cc)
-{
-    const uint8_t z = shoe.fpsr.b.cc_z;
-    const uint8_t n = shoe.fpsr.b.cc_n;
-    const uint8_t nan = shoe.fpsr.b.cc_nan;
-    
-    switch (cc & 0x0f) {
-        case 0: // false
-            return 0;
-        case 1: // equal
-            return z;
-        case 2: // greater than
-            return !(nan || z || n);
-        case 3: // greater than or equal
-            return z || !(nan || n);
-        case 4: // less than
-            return n && !(nan || z);
-        case 5: // less than or equal
-            return z || (n && !nan);
-        case 6: // greater or less than
-            return !(nan || z);
-        case 7: // ordered
-            return !nan;
-        case 8: // unordered
-            return nan;
-        case 9: // not (greater or less than)
-            return nan || z;
-        case 10: // not (less than or equal)
-            return nan || !(n || z);
-        case 11: // not (less than)
-            return nan || (z || !n);
-        case 12: // not (greater than or equal)
-            return nan || (n && !z);
-        case 13: // not (greater than)
-            return nan || z || n;
-        case 14: // not equal
-            return !z;
-        case 15: // true
-            return 1;
-    }
-    
-    assert(0);
-    return 0;
-}
-
-void inst_fbcc(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 01 s 0bcccc); // b => raise BSUN if NaN
-    
-    uint32_t displacement;
-    if (s) {
-        const uint16_t ext2 = nextword();
-        displacement = (ext << 16) | ext2;
-    }
-    else {
-        const int16_t tmp = ext;
-        const int32_t tmp2 = tmp;
-        displacement = tmp2;
-    }
-    
-    if (b) {
-        slog("inst_fbcc: fixme: Got a CC that wants to set BSUN, not implemented\n");
-        //assert(0); // FIXME: implement BSUN, or uncomment this
-    }
-    
-    if (fpu_test_cc(c)) {
-        const uint32_t addr = shoe.orig_pc + 2 + displacement;
-        shoe.pc = addr;
-    }
-}
-
-const char *fpu_cc_names[32] = {
-    "f", "eq", "ogt", "oge", "olt", "ole", "ogl", "or",
-    "un", "ueq", "ugt", "uge", "ult", "ule", "ne", "t",
-    "sf", "seq", "gt", "ge", "lt", "le", "gl", "gle",
-    "ngle", "ngl", "nle", "nlt", "nge", "ngt", "sne", "st"
-};
-
-void dis_fbcc(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 01 s 0ccccc); // only the low 5 bits of cc are significant
-    
-    uint32_t displacement;
-    if (s) {
-        const uint16_t ext2 = dis_next_word();
-        displacement = (ext << 16) | ext2;
-    }
-    else {
-        const int16_t tmp = ext;
-        const int32_t tmp2 = tmp;
-        displacement = tmp2;
-    }
-    
-    const uint32_t addr = dis.orig_pc + 2 + displacement;
-    
-    sprintf(dis.str, "fb%s.%c *0x%08x", fpu_cc_names[c], "wl"[s], addr);
-}
-
-static void reverse_order(uint8_t *buf, const uint32_t size)
-{
-    uint32_t i;
-    for (i=0; i < (size/2); i++) {
-        const uint8_t tmp = buf[i];
-        buf[i] = buf[size-(1+i)];
-        buf[size-(1+i)] = tmp;
-    }
-}
-
-void inst_fmovem(uint16_t op, uint16_t ext)
-{
-    ~decompose(op,  1111 001 000 mmmrrr);
-    ~decompose(op,  1111 001 000 MMMMMM);
+    ~decompose(shoe.op,  1111 001 000 mmmrrr);
+    ~decompose(shoe.op,  1111 001 000 MMMMMM);
     ~decompose(ext, 11 d ps 000 LLLLLLLL); // Static register mask
     ~decompose(ext, 11 0 00 000 0yyy0000); // Register for dynamic mode
     
@@ -789,9 +2762,9 @@ void inst_fmovem(uint16_t op, uint16_t ext)
                 continue;
             
             uint8_t buf[12];
-            x87_to_motorola(shoe.fp[i], buf);
+            _floatx80_to_extended(&fpu->fp[i], buf);
             
-            slog("inst_fmovem: writing %Lf from fp%u", shoe.fp[i], i);
+            // slog("inst_fmovem: writing %Lf from fp%u", fpu->fp[i], i);
             uint32_t j;
             for (j=0; j<12; j++) {
                 slog(" %02x", buf[j]);
@@ -817,9 +2790,9 @@ void inst_fmovem(uint16_t op, uint16_t ext)
                 if (shoe.abort)
                     return ;
             }
-            shoe.fp[i] = motorola_to_x87(buf);
+            _extended_to_floatx80(buf, &fpu->fp[i]);
             
-            slog("inst_fmovem: read %Lf to fp%u\n", shoe.fp[i], i);
+            // slog("inst_fmovem: read %Lf to fp%u\n", shoe.fp[i], i);
         }
     }
     
@@ -828,712 +2801,305 @@ void inst_fmovem(uint16_t op, uint16_t ext)
         shoe.a[r] += size;
     else if (m == 4)
         shoe.a[r] -= size;
-    
-    //slog("inst_fmovem: notice: not implemented (EA = %u/%u, mask=0x%02x)\n", m, r, mask);
-    
 }
 
-void dis_fmovem(uint16_t op, uint16_t ext)
-{
-    ~decompose(op,  1111 001 000 mmmrrr);
-    ~decompose(op,  1111 001 000 MMMMMM);
-    ~decompose(ext, 11 d ps 000 LLLLLLLL); // Static register mask
-    ~decompose(ext, 11 0 00 000 0yyy0000); // Register for dynamic mode
-    
-    sprintf(dis.str, "fmovem ???");
-}
 
-enum {
-    format_L = 0,
-    format_S = 1,
-    format_X = 2,
-    format_Ps = 3,
-    format_W = 4,
-    format_D = 5,
-    format_B = 6,
-    format_Pd = 7
-} fpu_formats;
-/*
- * 0 L     long word integer
- * 1 S     single precision real
- * 2 X     extended precision real
- * 3 P{#k} packed decimal real with static k factor
- * 4 W     word integer
- * 5 D     double precision real
- * 6 B     byte integer
- * 7 P{Dn} packed decimal real with dynamic k factor
+#pragma mark First-hop decoder table inst implementations
+/* 
+ * The table generated by decoder_gen.c will refer directly
+ * to these instructions. inst_fpu_other() will handle all
+ * other FPU instructions.
  */
 
-static void fpu_read_ea_commit(uint8_t mr, uint8_t format)
+static _Bool fpu_test_cc(uint8_t cc)
 {
-    const uint8_t m = mr >> 3;
-    const uint8_t r = mr & 7;
-    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+    fpu_get_state_ptr();
+    const _Bool z = cc_z;
+    const _Bool n = cc_n;
+    const _Bool nan = cc_nan;
     
-    if (m == 3)
-        shoe.a[r] += sizes[format];
-    else if (m == 4)
-        shoe.a[r] -= sizes[format];
+    switch (cc & 0x0f) {
+        case 0: // false
+            return 0;
+        case 1: // equal
+            return z;
+        case 2: // greater than
+            return !(nan | z | n);
+        case 3: // greater than or equal
+            return z | !(nan | n);
+        case 4: // less than
+            return n & !(nan | z);
+        case 5: // less than or equal
+            return z | (n & !nan);
+        case 6: // greater or less than
+            return !(nan | z);
+        case 7: // ordered
+            return !nan;
+        case 8: // unordered
+            return nan;
+        case 9: // not (greater or less than)
+            return nan | z;
+        case 10: // not (less than or equal)
+            return nan | !(n | z);
+        case 11: // not (less than)
+            return nan | (z | !n);
+        case 12: // not (greater than or equal)
+            return nan | (n & !z);
+        case 13: // not (greater than)
+            return nan | z | n;
+        case 14: // not equal
+            return !z;
+        case 15: // true
+            return 1;
+    }
+    
+    assert(0);
+    return 0;
 }
 
-// Note: fpu_read_ea modifies shoe.pc, fpu_read_ea_commit modies shoe.a[r] for pre/post-inc/decrement
-static long double fpu_read_ea(uint8_t mr, uint8_t format)
-{
-    const uint8_t m = mr >> 3;
-    const uint8_t r = mr & 7;
-    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+void inst_fscc () {
+    fpu_get_state_ptr();
     
-    long double data, result;
+    // fscc can throw an exception
+    fpu->fpiar = shoe.orig_pc;
+    
+    const uint16_t ext = nextword();
+    
+    ~decompose(shoe.op, 1111 001 001 MMMMMM);
+    ~decompose(ext, 0000 0000 000 b cccc);
+    
+    /*
+     * inst_f*cc instructions throw a pre-instruction exception
+     * if b && cc_nan
+     */
+    if (b && _bsun_test())
+        return ;
+    
+    shoe.dat = fpu_test_cc(c) ? 0xff : 0;
+    
+    call_ea_write(M, 1);
+}
+
+void inst_fbcc () {
+    fpu_get_state_ptr();
+    
+    // fbcc can throw an exception
+    fpu->fpiar = shoe.orig_pc;
+    
+    ~decompose(shoe.op, 1111 001 01 s 0bcccc); // b => raise BSUN if NaN
+    const uint8_t sz = 2 << s;
+    
+    /*
+     * inst_f*cc instructions throw a pre-instruction exception 
+     * if b && cc_nan
+     */
+    if (b && _bsun_test())
+        return ;
+    
+    if (fpu_test_cc(c)) {
+        const uint16_t ext = nextword();
+        uint32_t displacement;
+    
+        if (s) {
+            const uint16_t ext2 = nextword();
+            displacement = (ext << 16) | ext2;
+        }
+        else
+            displacement = (int16_t)ext;
+        
+        shoe.pc = shoe.orig_pc + 2 + displacement;
+    }
+    else
+        shoe.pc += sz;
+}
+
+void inst_fsave () {
+    fpu_get_state_ptr();
+    verify_supervisor();
+    
+    // Don't modify fpiar for fsave
+    
+    ~decompose(shoe.op, 1111 001 100 MMMMMM);
+    ~decompose(shoe.op, 1111 001 100 mmmrrr);
+    
+    const uint32_t size = 0x1c; // IDLE frame
+    const uint16_t frame_header = 0xfd18;
     uint32_t addr;
     
-    // If mode==a-reg, or mode==data reg and the size is > 4 bytes, no dice
-    if ((m == 1) ||
-        ((m == 0) && (sizes[format] > 4))) {
-        throw_illegal_instruction();
-        return 0.0;
+    if (m == 4)
+        addr = shoe.a[r] - size;
+    else {
+        call_ea_addr(M);
+        addr = shoe.dat;
     }
     
-    switch (m) {
-        case 0: {
-            if (format == format_S) {
-                float tmp = shoe.d[r];
-                data = tmp;
-            }
-            else if (format == format_B) {
-                int8_t tmp = shoe.d[r];
-                data = tmp;
-            }
-            else if (format == format_W) {
-                int16_t tmp = shoe.d[r];
-                data = tmp;
-            }
-            else if (format == format_L) {
-                int32_t tmp = shoe.d[r];
-                data = tmp;
-            }
-            
-            goto got_data;
-        }
-            
-        case 3:
-            addr = shoe.a[r];
-            assert(!( r==7 && sizes[format]==1));
-            goto got_address;
-            
-        case 4:
-            addr = shoe.a[r] - sizes[format];
-            assert(!( r==7 && sizes[format]==1));
-            goto got_address;
-            
-        case 7:
-            if (r == 4) {
-                addr = shoe.pc;
-                shoe.pc += sizes[format];
-                goto got_address;
-            }
-            
-            // fall through to default:
-            
-        default: {
-            
-            shoe.mr=mr;
-            ea_addr();
-            if (shoe.abort)
-                return 0.0;
-            
-            addr = (uint32_t)shoe.dat;
-            goto got_address;
-        }
-    }
+    lset(addr, 2, frame_header);
+    if (shoe.abort)
+        return ;
     
-got_address:
+    if (m == 4)
+        shoe.a[r] = addr;
     
-    {
-        uint8_t buf[12];
-        uint8_t *ptr = &buf[sizes[format]];
-        uint32_t i;
-        
-        slog("inst_f fpu_read_ea: format=%u, data =", format);
-        for (i=0; i<sizes[format]; i++) {
-            ptr--;
-            *ptr = lget(addr+i, 1);
-            slog(" %02x", *ptr);
-            if (shoe.abort)
-                return 0.0;
-        }
-        
-        switch (format) {
-            case format_B: {
-                int8_t tmp = ptr[0];
-                data = tmp;
-                break;
-            }
-            case format_W: {
-                int16_t tmp = *(int16_t*)ptr;
-                data = tmp;
-                break;
-            }
-            case format_L: {
-                int32_t tmp = *(int32_t*)ptr;
-                data = tmp;
-                break;
-            }
-            case format_S: {
-                float tmp = *(float*)ptr;
-                data = tmp;
-                break;
-            }
-            case format_D: {
-                double tmp = *(double*)ptr;
-                data = tmp;
-                break;
-            }
-            case format_X: {
-                reverse_order(ptr, 12);
-                data = motorola_to_x87(ptr);
-                break;
-            }
-            default: {
-                assert(!"unsupported format (packed something)");
-            }
-        }
-    }
-    
-got_data:
-    
-    fpu_set_round();
-    result = data;
-    fpu_reset_round();
-    slog(" data=%Lf result=%Lf\n", data, result);
-    return result;
 }
 
-
-static void fpu_write_ea(uint8_t mr, uint8_t format, long double orig_data)
-{
-    fpu_set_round();
-    const long double data = orig_data;
-    fpu_reset_round();
+void inst_frestore () {
+    fpu_get_state_ptr();
+    verify_supervisor();
     
-    const uint8_t m = mr >> 3;
-    const uint8_t r = mr & 7;
-    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
-    uint8_t buf[12], *ptr = &buf[0];
-    uint32_t addr, i;
+    // Don't modify fpiar for frestore
     
-    // If mode==a-reg, or mode==data reg and the size is > 4 bytes, no dice
-    if ((m == 1) ||
-        ((m == 0) && (sizes[format] > 4))) {
-        throw_illegal_instruction();
+    ~decompose(shoe.op, 1111 001 101 MMMMMM);
+    ~decompose(shoe.op, 1111 001 101 mmmrrr);
+    
+    uint32_t addr, size;
+    
+    if (m == 3)
+        addr = shoe.a[r];
+    else {
+        call_ea_addr(M);
+        addr = shoe.dat;
+    }
+    
+    const uint16_t word = lget(addr, 2);
+    if (shoe.abort) return ;
+    
+    // XXX: These frame sizes are different on 68881/68882/68040
+    if ((word & 0xff00) == 0x0000)
+        size = 4; // NULL state frame
+    else if ((word & 0xff) == 0x0018)
+        size = 0x1c; // IDLE state frame
+    else if ((word & 0xff) == 0x00b4)
+        size = 0xb8; // BUSY state frame
+    else {
+        slog("Frestore encountered an unknown state frame 0x%04x\n", word);
+        assert(!"inst_frestore: bad state frame");
         return ;
     }
     
-    slog("inst_f fpu_write_ea EA=%u/%u data=%Lf format=%u\n", m, r, data, format);
-    
-    // Convert to the appropriate format
-    
-    switch (format) {
-        case format_B: {
-            int8_t tmp = data;
-            *((int8_t*)ptr) = tmp;
-            goto write_to_mem;
-        }
-        case format_W: {
-            int16_t tmp = data;
-            *((int16_t*)ptr) = tmp;
-            slog("inst_f fpu_write_ea formatted=%u (0x%04x)\n", *((int16_t*)ptr), *((uint16_t*)ptr));
-            break;
-        }
-        case format_L: {
-            int32_t tmp = data;
-            *((int32_t*)ptr) = tmp;
-            break;
-        }
-        case format_S: {
-            float tmp = data;
-            *((float*)ptr) = tmp;
-            break;
-        }
-        case format_D: {
-            double tmp = data;
-            *((double*)ptr) = tmp;
-            break;
-        }
-        case format_X: {
-            x87_to_motorola(data, ptr);
-            goto write_to_mem; // ptr is already big endian
-        }
-        default: {
-            assert(!"unsupported format (packed something)");
-        }
+    if (m==3) {
+        shoe.a[r] += size;
+        slog("frestore: changing shoe.a[%u] += %u\n", r, size);
     }
-
-swap_order:
-    reverse_order(buf, sizes[format]);
-    
-    
-write_to_mem:
-    // Lookup the EA
-
-    switch (m) {
-        case 0: {
-            if (format == format_B) {
-                int8_t tmp = data;
-                set_d(r, tmp, 1);
-            }
-            else if (format == format_W) {
-                int16_t tmp = data;
-                set_d(r, tmp, 2);
-            }
-            else if (format == format_L) {
-                int32_t tmp = data;
-                shoe.d[r] = tmp;
-            }
-            else if (format == format_S) {
-                float tmp = data;
-                *((float*)&shoe.d[r]) = tmp;
-            }
-            
-            goto done;
-        }
-        case 3: // post-increment
-            addr = shoe.a[r];
-            assert(!( r==7 && sizes[format]==1));
-            break;
-        case 4: // pre-decrement
-            addr = shoe.a[r] - sizes[format];
-            assert(!( r==7 && sizes[format]==1));
-            break;
-        default:
-            call_ea_addr(mr);
-            addr = (uint32_t)shoe.dat;
-            break;
-    }
-    
-    // Copy the formatted data into the EA
-    slog("inst_f fpu_write_ea: addr=0x%08x\n", addr);
-    for (i=0; i < sizes[format]; i++) {
-        lset(addr + i, 1, buf[i]);
-        if (shoe.abort)
-            return ;
-    }
-
-done: // set condition codes and update pre/post-inc/decrement registers
-    
-    // Set condition codes
-    shoe.fpsr.raw &= 0x00ffffff;
-    shoe.fpsr.b.cc_nan = (0 != isnan(data));
-    if (!shoe.fpsr.b.cc_nan) {
-        shoe.fpsr.b.cc_n = (0 != signbit(data));
-        if (isinf(data))
-            shoe.fpsr.b.cc_i = 1;
-        else
-            shoe.fpsr.b.cc_z = (data == 0.0);
-    }
-    
-    if (m == 3)
-        shoe.a[r] += sizes[format];
-    else if (m == 4)
-        shoe.a[r] -= sizes[format];
 }
 
-void inst_fmove(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 000 MMMMMM);
-    ~decompose(op, 1111 001 000 mmmrrr);
-    ~decompose(ext, 0 E V aaa zzz KKKKKKK);
+void inst_fdbcc () {
+    fpu_get_state_ptr();
+    ~decompose(shoe.op, 1111 001 001 001 rrr);
     
-    const uint8_t format = a;
+    // fdbcc can throw an exception
+    fpu->fpiar = shoe.orig_pc;
     
-    if (K == ~b(1000100) || K == ~b(1000000)) {
-        assert(!"inst_fmove: This is either a K-value, or somebody called fmove and specified the secret precision bits");
-    }
+    const uint16_t ext = nextword();
+    ~decompose(ext, 0000 0000 000 b cccc);
     
-    // E==0 => Don't use EA (reg->reg)
-    // E==1 => Use EA
-    // V==0 => reg->reg or mem->reg
-    // V==1 => reg->mem
+    /*
+     * inst_f*cc instructions throw a pre-instruction exception
+     * if b && cc_nan
+     */
+    if (b && _bsun_test())
+        return ;
     
-    // Load the source value into 'data'
-    
-    long double data;
-    
-    if (E && !V) { // mem -> reg
-        data = fpu_read_ea(M, format);
-        if (shoe.abort)
-            return ;
-    }
-    else if (!E) { // reg -> mem
-        data = shoe.fp[a];
-    }
-    else { // reg -> reg
-        data = shoe.fp[z];
-    }
-    
-    
-    // XXX: Check for exceptions?
-    
-    // Write the result
-    
-    if (E && V) { // reg -> mem
-        fpu_write_ea(M, format, data);
-        if (shoe.abort)
-            return ;
-    }
-    else if (!V) { // mem -> reg
-        fpu_set_reg_cc(data, z);
-        fpu_read_ea_commit(M, format);
-    }
-    else { // reg -> reg
-        fpu_set_reg_cc(data, z);
-    }
-    
-    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
-    slog("inst_fmove src=%Lf size=%u a=%u z=%u to-mem=%u useEA=%u EA = %u/%u\n", data, sizes[format], a, z, V, E, m, r);
-}
-
-void dis_fnop(uint16_t op, uint16_t ext)
-{
-    sprintf(dis.str, "fnop");
-}
-
-void inst_fnop(uint16_t op, uint16_t ext)
-{
-}
-
-void dis_fmove(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 000 MMMMMM);
-    ~decompose(op, 1111 001 000 mmmrrr);
-    ~decompose(ext, 0 E V aaa bbb KKKKKKK);
-    
-    // E==0 => reg to reg
-    // E==1 => mem to reg / reg to mem
-    // V==0 => reg->reg or mem->reg
-    // V==1 => reg->mem
-    
-    
-    sprintf(dis.str, "fmove ???");
-    
-}
-
-void dis_fmath(uint16_t op, uint16_t ext)
-{
-    sprintf(dis.str, "fmath ??");
-}
-
-static void fpu_set_fpsr_quotient(long double a, long double b, long double result)
-{
-    // Thanks for being super vague on the meaning of this register, 68881 documentation
-    
-    const long double quo = truncl((a - result) / b);
-    const uint8_t sign = signbit(quo);
-    const uint64_t quo_int = fabsl(quo);
-    
-    shoe.fpsr.b.qu_quotient = quo_int & 0x7f;
-    shoe.fpsr.b.qu_s = sign;
-}
-
-void inst_fmath(uint16_t op, uint16_t ext)
-{
-    ~decompose(op, 1111 001 000 MMMMMM);
-    ~decompose(ext, 0 a 0 sss ddd eeeeeee);
-    
-    const uint8_t src_in_ea = a;
-    const uint8_t source_specifier = s;
-    const uint8_t dest_register = d;
-    const uint8_t extension = e;
-    
-    uint8_t do_write_back_result = 1;
-    
-    long double source, dest, result;
-    
-    if (src_in_ea) {
-        source = fpu_read_ea(M, source_specifier);
-        slog("inst_fmath: source = %u/%u = %Lf", M>>3, M&7, source);
-        if ((M>>3) == 3)
-            slog(" a[%u]=0x%08x", M&7, shoe.a[M&7]);
-        
-        if (shoe.abort)
-            return ;
+    if (fpu_test_cc(c)) {
+        shoe.pc += 2;
     }
     else {
-        source = shoe.fp[source_specifier];
-        slog("inst_fmath: source = fp%u = %Lf", source_specifier, source);
-    }
-    
-    dest = shoe.fp[dest_register];
-    slog("  dest = fp%u = %Lf\n", dest_register, dest);
-    
-    switch (e) {
-        case ~b(0000001): {// fpu_inst_fint
-            const uint8_t dir = shoe.fpcr.b.mc_rnd;
-            
-            // {FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD};
-            
-            if (dir == 0)
-                result = roundl(source);
-            else if (dir == 1)
-                result = truncl(source);
-            else if (dir == 2)
-                result = floorl(source);
-            else
-                result = ceill(source);
-            
-            slog("inst_fint: source = %Lf result = %Lf round=%u\n", source, result, dir);
-            
-            break;
-        }
-        case ~b(0000010): assert(!"fpu_inst_fsinh;");
-        case ~b(0000011): // fpu_inst_fintrz
-            slog("inst_fintrz dest = %Lf source = %Lf\n", dest, source);
-            result = truncl(source);
-            break;
-            
-        case ~b(0000110): // flognp1
-            slog("inst_flognp1 dest = %Lf source = %Lf\n", dest, source);
-            assert(source > -1.0);
-            result = log1pl(source);
-            break;
-        case ~b(0001000): assert(!"fpu_inst_fetoxm1;");
-        case ~b(0001001): assert(!"fpu_inst_ftanh;");
-        case ~b(0001010): // fatan
-            slog("inst_fatan dest = %Lf source = %Lf\n", dest, source);
-            result = atanl(source);
-            break;
-            
-        case ~b(0001100): assert(!"fpu_inst_fasin;");
-        case ~b(0001101): assert(!"fpu_inst_fatanh;");
-        case ~b(0001110): // fsin
-            slog("inst_fsin dest = %Lf source = %Lf\n", dest, source);
-            result = sinl(source);
-            break;
-        case ~b(0001111): assert(!"fpu_inst_ftan;");
-        case ~b(0010000): // fetox
-            slog("inst_fetox dest = %Lf source = %Lf\n", dest, source);
-            result = expl(source);
-            break;
-        case ~b(0010001): assert(!"fpu_inst_ftwotox;");
-        case ~b(0010010): assert(!"fpu_inst_ftentox;");
-        case ~b(0010100): assert(!"fpu_inst_flogn;");
-        case ~b(0010101): assert(!"fpu_inst_flog10;");
-        case ~b(0010110): assert(!"fpu_inst_flog2;");
-        case ~b(0011001): assert(!"fpu_inst_fcosh;");
-        case ~b(0011100): assert(!"fpu_inst_facos;");
-        case ~b(0011101): // fcos
-            slog("fpu_inst_fcos dest = %Lf source = %Lf\n", dest, source);
-            result = cosl(source);
-            break;
-            
-        case ~b(0011110): {// fpu_inst_fgetexp
-            if (!((source > 0) || (source < 0)))
-                result = source; // positive or negative zero
-            else if (!isfinite(source)) {
-                assert(!"fgetexp: isinfl(source)");
-                // returns NAN and an exception bit - not implemented for the moment
-            }
-            else {
-                // Extract the debiased exponent from the 80-bit source
-                uint8_t motorola[12];
-                x87_to_motorola(source, motorola);
-                int32_t exp = (motorola[0] & 0x7f) << 8;
-                exp |= motorola[1];
-                exp -= 16383; // debias
-                result = exp;
-            }
-            break;
-        }
-        case ~b(0011111): assert(!"fpu_inst_fgetman;");
-        case ~b(0100001):
-            // don't forget to set fpu_set_fpsr_quotient();
-            assert(!"fpu_inst_fmod;");
-        
-        case ~b(0100100): assert(!"fpu_inst_fsgldiv");
-            
-        case ~b(0100101): { // fpu_inst_frem
-            assert(source != 0.0);
-            result = remainderl(dest, source);
-            fpu_set_fpsr_quotient(dest, source, result);
-            slog("inst_frem: dest = %Lf source = %Lf quot = %u result = %Lf\n", dest, source, shoe.fpsr.b.qu_quotient, result);
-            break;
-        }
-        case ~b(0100110): assert(!"fpu_inst_fscale;");
-            
-        case ~b(0111000): { // fpu_inst_fcmp
-            const long double diff = dest - source;
-            slog("inst_fcmp: dest = %Lf source = %Lf\n", dest, source);
-            fpu_set_cc(diff);
-            do_write_back_result = 0; // don't write result back to register
-            break;
-        }
-        case ~b(0111010): { // fpu_inst_ftst
-            slog("fpu_inst_ftst: dest = %Lf\n");
-            fpu_set_cc(source);
-            do_write_back_result = 0; // don't write result back to register
-            break;
-        }
-            
-        case ~b(1011100):
-        case ~b(1011000):
-            assert(!"inst_fabs: can't handle");
-        case ~b(0011000):// fpu_inst_fabs
-            result = fabsl(source);
-            slog("inst_fabs: source=%Lf result=%Lf\n", source, result);
-            break;
-            
-        case ~b(1100010):
-        case ~b(1100110):
-            assert(!"can't handle");
-        case ~b(0100010): { // fpu_inst_fadd
-            slog("inst_fadd dest = %Lf source = %Lf\n", dest, source);
-            result = dest + source;
-            break;
-        }
-            
-        case ~b(1100000):
-        case ~b(1100100):
-            assert(!"can't handle");
-        case ~b(0100000): { // fpu_inst_fdiv
-            assert(source != 0.0);
-            slog("inst_fdiv dest = %Lf source = %Lf\n", dest, source);
-            
-            result = dest / source;
-            break;
-        }
-            
-            
-        case ~b(1100011):
-        case ~b(1100111):
-            assert(!"can't handle");
-        case ~b(0100011): { // fpu_inst_fmul
-            slog("inst_fmul dest = %Lf source = %Lf\n", dest, source);
-            result = source * dest;
-            break;
-        }
-            
-        case ~b(1011010):
-        case ~b(1011110):
-            assert(!"fneg: can't handle");
-        case ~b(0011010): // fneg
-            slog("inst_fneg dest = %Lf source = %Lf\n", dest, source);
-            result = -source;
-            break;
-            
-        case ~b(1000001):
-        case ~b(1000101):
-            assert(!"can't handle");
-        case ~b(0000100): { // fpu_inst_fsqrt
-            slog("inst_fsqrt dest = %Lf source = %Lf\n", dest, source);
-            result = sqrtl(source);
-            break;
-        }
-            
-        case ~b(1101000):
-        case ~b(1101100):
-            assert(!"can't handle");
-        case ~b(0101000): { // fpu_inst_fsub
-            slog("inst_fsub dest = %Lf source = %Lf\n", dest, source);
-            result = dest - source;
-            break;
-        }
-            
-        case ~b(0110000) ... ~b(0110111):
-            assert(!"fpu_inst_fsincos;");
-        
-        default:
-            assert(!"inst_fmath: unknown instruction");
-    }
-    
-    // Finalize the read, if source was in memory
-    if (src_in_ea) {
-        fpu_read_ea_commit(M, source_specifier);
-    }
-    
-    // Only write back the result if necessary (fcmp doesn't do this)
-    if (do_write_back_result) {
-        slog("inst_fmath: result = %Lf\n", result);
-        fpu_set_reg_cc(result, dest_register);
+        const int16_t disp = nextword();
+        const uint16_t newd = get_d(r, 2) - 1;
+        set_d(r, newd, 2);
+        if (newd != 0xffff)
+            shoe.pc = shoe.orig_pc + 2 + disp;
     }
 }
 
+void inst_ftrapcc () {
+    fpu_get_state_ptr();
+    ~decompose(shoe.op, 1111 001 001 111 xyz);
+    
+    // ftrapcc can throw an exception
+    fpu->fpiar = shoe.orig_pc;
+    
+    // (xyz) == (100) -> sz=0
+    // (xyz) == (010) -> sz=2
+    // (xyz) == (011) -> sz=4
+    const uint32_t sz = y << (z+1);
+    const uint32_t next_pc = shoe.orig_pc + 2 + sz;
+    
+    const uint16_t ext = nextword();
+    ~decompose(ext, 0000 0000 000 b cccc);
+    
+    /*
+     * inst_f*cc instructions throw a pre-instruction exception
+     * if b && cc_nan
+     */
+    if (b && _bsun_test())
+        return ;
+    
+    if (fpu_test_cc(c))
+        throw_frame_two(shoe.sr, next_pc, 7, shoe.orig_pc);
+    else
+        shoe.pc = next_pc;
+}
 
+void inst_fnop() {
+    // This is technically fbcc
+    inst_fbcc();
+}
 
+void inst_fpu_other () {
+    fpu_get_state_ptr();
+    ~decompose(shoe.op, 1111 001 000 MMMMMM);
+    
+    const uint16_t ext = nextword();
+    ~decompose(ext, ccc xxx yyy eeeeeee);
+    
+    switch (c) {
+        case 0: // Reg to reg
+            fpu->fpiar = shoe.orig_pc; // fmath() can throw an exception
+            inst_fmath(ext);
+            return;
+            
+        case 1: // unused
+            _throw_illegal_instruction();
+            return;
+            
+        case 2: // Memory->reg & movec
+            fpu->fpiar = shoe.orig_pc; // fmath() can throw an exception
+            inst_fmath(ext);
+            return;
+            
+        case 3: // reg->mem
+            fpu->fpiar = shoe.orig_pc; // fmove() can throw an exception
+            inst_fmove(ext);
+            return;
+            
+        case 4: // mem -> sys ctl registers
+        case 5: // sys ctl registers -> mem
+            // fmovem_control() cannot throw an FPU exception (don't modify fpiar)
+            inst_fmovem_control(ext);
+            return;
+            
+        case 6: // movem to fp registers
+        case 7: // movem to memory
+            // fmovem() cannot throw an FPU exception (don't modify fpiar)
+            inst_fmovem(ext);
+            return;
+    }
+    
+    assert(0); // never get here
+    return;
+}
 
-// Setup the jump table for fpu instructions
-// XXX: come up with a better, unified system for decoding instructions
-void fpu_setup_jump_table()
+#pragma mark FPU-state initialization and reset
+
+void fpu_initialize()
 {
-    uint32_t i;
-    
-    
-    fpu_inst_table[fpu_inst_fnop].emu = inst_fnop;
-    fpu_inst_table[fpu_inst_fnop].dis = dis_fnop;
-    
-    fpu_inst_table[fpu_inst_fbcc].emu = inst_fbcc;
-    fpu_inst_table[fpu_inst_fbcc].dis = dis_fbcc;
-    
-    fpu_inst_table[fpu_inst_fmovecr].emu = inst_fmovecr;
-    fpu_inst_table[fpu_inst_fmovecr].dis = dis_fmovecr;
-    
-    fpu_inst_table[fpu_inst_fmove].emu = inst_fmove;
-    fpu_inst_table[fpu_inst_fmove].dis = dis_fmove;
-    
-    fpu_inst_table[fpu_inst_fmovem].emu = inst_fmovem;
-    fpu_inst_table[fpu_inst_fmovem].dis = dis_fmovem;
-    
-    fpu_inst_table[fpu_inst_fmovem_control].emu = inst_fmovem_control;
-    fpu_inst_table[fpu_inst_fmovem_control].dis = dis_fmovem_control;
-    
-    fpu_inst_table[fpu_inst_frestore].emu = inst_frestore;
-    fpu_inst_table[fpu_inst_frestore].dis = dis_frestore;
-
-    fpu_inst_table[fpu_inst_fsave].emu = inst_fsave;
-    fpu_inst_table[fpu_inst_fsave].dis = dis_fsave;
-    
-    const fpu_inst_name_t _fmath[] = {
-        fpu_inst_fsincos,
-        fpu_inst_fint,
-        fpu_inst_fsinh,
-        fpu_inst_fintrz,
-        fpu_inst_flognp1,
-        fpu_inst_fetoxm1,
-        fpu_inst_ftanh,
-        fpu_inst_fatan,
-        fpu_inst_fatanh,
-        fpu_inst_fsin,
-        fpu_inst_ftan,
-        fpu_inst_fetox,
-        fpu_inst_ftwotox,
-        fpu_inst_ftentox,
-        fpu_inst_flogn,
-        fpu_inst_flog10,
-        fpu_inst_flog2,
-        fpu_inst_fcosh,
-        fpu_inst_facos,
-        fpu_inst_fcos,
-        fpu_inst_fgetexp,
-        fpu_inst_fgetman,
-        fpu_inst_fmod,
-        fpu_inst_fsgldiv,
-        fpu_inst_fsglmul,
-        fpu_inst_frem,
-        fpu_inst_fscale,
-        fpu_inst_fcmp,
-        fpu_inst_ftst,
-        fpu_inst_fabs,
-        fpu_inst_fadd,
-        fpu_inst_fdiv,
-        fpu_inst_fmul,
-        fpu_inst_fneg,
-        fpu_inst_fsqrt,
-        fpu_inst_fsub
-    };
-    
-    for (i=0; i < sizeof(_fmath) / sizeof(fpu_inst_name_t); i++) {
-        fpu_inst_table[_fmath[i]].emu = inst_fmath;
-        fpu_inst_table[_fmath[i]].dis = dis_fmath;
-    }
+    fpu_state_t *fpu = (fpu_state_t*)p_alloc(shoe.pool, sizeof(fpu_state_t));
+    memset(fpu, sizeof(fpu_state_t), 0);
+    shoe.fpu_state = fpu;
 }
 
-
-
+void fpu_reset()
+{
+    p_free(shoe.fpu_state);
+    fpu_initialize();
+}
diff --git a/core/newfpu.c b/core/newfpu.c
deleted file mode 100644
index da7e072..0000000
--- a/core/newfpu.c
+++ /dev/null
@@ -1,3105 +0,0 @@
-/*
- * Copyright (c) 2013-2014, Peter Rutenbar <pruten@gmail.com>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include "../core/shoebill.h"
-#include "../core/SoftFloat/softfloat.h"
-
-#pragma mark Structures and macros
-
-// Mode control byte
-#define mc_rnd  (fpu->fpcr.b._mc_rnd)
-#define mc_prec (fpu->fpcr.b._mc_prec)
-
-// Exception enable byte
-#define ee_inex1 (fpu->fpcr.b._ee_inex1)
-#define ee_inex2 (fpu->fpcr.b._ee_inex2)
-#define ee_dz    (fpu->fpcr.b._ee_dz)
-#define ee_unfl  (fpu->fpcr.b._ee_unfl)
-#define ee_ovfl  (fpu->fpcr.b._ee_ovfl)
-#define ee_operr (fpu->fpcr.b._ee_operr)
-#define ee_snan  (fpu->fpcr.b._ee_snan)
-#define ee_bsun  (fpu->fpcr.b._ee_bsun)
-
-// Accrued exception byte
-#define ae_inex (fpu->fpsr.b._ae_inex)
-#define ae_dz   (fpu->fpsr.b._ae_dz)
-#define ae_unfl (fpu->fpsr.b._ae_unfl)
-#define ae_ovfl (fpu->fpsr.b._ae_ovfl)
-#define ae_iop  (fpu->fpsr.b._ae_iop)
-
-// Exception status byte
-#define es_inex1 (fpu->fpsr.b._es_inex1)
-#define es_inex2 (fpu->fpsr.b._es_inex2)
-#define es_dz    (fpu->fpsr.b._es_dz)
-#define es_unfl  (fpu->fpsr.b._es_unfl)
-#define es_ovfl  (fpu->fpsr.b._es_ovfl)
-#define es_operr (fpu->fpsr.b._es_operr)
-#define es_snan  (fpu->fpsr.b._es_snan)
-#define es_bsun  (fpu->fpsr.b._es_bsun)
-
-// Quotient byte
-#define qu_quotient (fpu->fpsr.b._qu_quotient)
-#define qu_s        (fpu->fpsr.b._qu_s) /* quotient sign */
-
-// Condition codes
-#define cc_nan (fpu->fpsr.b._cc_nan)
-#define cc_i (fpu->fpsr.b._cc_i)
-#define cc_z (fpu->fpsr.b._cc_z)
-#define cc_n (fpu->fpsr.b._cc_n)
-
-
-typedef struct {
-    uint32_t fpiar; // FPU iaddr
-    
-    union { // fpcr, fpu control register
-        struct {
-            // Mode control byte
-            uint16_t _mc_zero : 4; // zero/dummy
-            uint16_t _mc_rnd  : 2; // rounding mode
-            uint16_t _mc_prec : 2; // rounding precision
-            // Exception enable byte
-            uint16_t _ee_inex1 : 1; // inexact decimal input
-            uint16_t _ee_inex2 : 1; // inxact operation
-            uint16_t _ee_dz    : 1; // divide by zero
-            uint16_t _ee_unfl  : 1; // underflow
-            uint16_t _ee_ovfl  : 1; // overflow
-            uint16_t _ee_operr : 1; // operand error
-            uint16_t _ee_snan  : 1; // signalling not a number
-            uint16_t _ee_bsun  : 1; // branch/set on unordered
-        } b;
-        
-        uint16_t raw;
-    } fpcr;
-    
-    union { // fpsr, fpu status register
-        struct {
-            // Accrued exception byte
-            uint32_t _dummy1  : 3; // dummy/zero
-            uint32_t _ae_inex : 1; // inexact
-            uint32_t _ae_dz   : 1; // divide by zero
-            uint32_t _ae_unfl : 1; // underflow
-            uint32_t _ae_ovfl : 1; // overflow
-            uint32_t _ae_iop  : 1; // invalid operation
-            // Exception status byte
-            uint32_t _es_inex1 : 1; // inexact decimal input
-            uint32_t _es_inex2 : 1; // inxact operation
-            uint32_t _es_dz    : 1; // divide by zero
-            uint32_t _es_unfl  : 1; // underflow
-            uint32_t _es_ovfl  : 1; // overflow
-            uint32_t _es_operr : 1; // operand error
-            uint32_t _es_snan  : 1; // signalling not a number
-            uint32_t _es_bsun  : 1; // branch/set on unordered
-            // Quotient byte
-            uint32_t _qu_quotient : 7;
-            uint32_t _qu_s        : 1;
-            // Condition code byte
-            uint32_t _cc_nan  : 1; // not a number
-            uint32_t _cc_i    : 1; // infinity
-            uint32_t _cc_z    : 1; // zero
-            uint32_t _cc_n    : 1; // negative
-            uint32_t _dummy2  : 4; // dummy/zero
-        } b;
-        uint32_t raw;
-    } fpsr;
-    
-    floatx80 fp[8]; // 80 bit floating point general registers
-    
-    // State for the static fmath instruction implementations
-    float128 source, dest, result;
-    _Bool write_back;
-    uint8_t fmath_op;
-} fpu_state_t;
-
-enum rounding_precision_t {
-    prec_extended = 0,
-    prec_single = 1,
-    prec_double = 2,
-};
-
-enum rounding_mode_t {
-    mode_nearest = 0,
-    mode_zero = 1,
-    mode_neg = 2,
-    mode_pos = 3
-};
-
-/*
- * 0 L     long word integer
- * 1 S     single precision real
- * 2 X     extended precision real
- * 3 P{#k} packed decimal real with static k factor
- * 4 W     word integer
- * 5 D     double precision real
- * 6 B     byte integer
- * 7 P{Dn} packed decimal real with dynamic k factor
- */
-static const uint8_t _format_sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
-enum {
-    format_L = 0,
-    format_S = 1,
-    format_X = 2,
-    format_Ps = 3,
-    format_W = 4,
-    format_D = 5,
-    format_B = 6,
-    format_Pd = 7
-} fpu_formats;
-
-#define fpu_get_state_ptr() fpu_state_t *fpu = (fpu_state_t*)shoe.fpu_state
-#define nextword() ({const uint16_t w=lget(shoe.pc,2); if (shoe.abort) {return;}; shoe.pc+=2; w;})
-#define nextlong() ({const uint32_t L=lget(shoe.pc,4); if (shoe.abort) {return;}; shoe.pc+=4; L;})
-#define verify_supervisor() {if (!sr_s()) {throw_privilege_violation(); return;}}
-
-#pragma mark FPU exception stuff
-enum fpu_vector_t {
-    fpu_vector_ftrapcc = 7,
-    fpu_vector_fline = 11,
-    fpu_vector_coprocessor_protocol_violation = 13, // won't be using this one
-    fpu_vector_bsun = 48,
-    fpu_vector_inexact = 49,
-    fpu_vector_divide_by_zero = 50,
-    fpu_vector_underflow = 51,
-    fpu_vector_operr = 52,
-    fpu_vector_overflow = 53,
-    fpu_vector_snan = 54
-};
-
-/*
- * Map the exception bit positions (in fpsr and fpcr)
- * to their corresponding exception vector numbers.
- */
-const uint8_t _exception_bit_to_vector[8] = {
-    48, // bsun
-    54, // snan
-    52, // operr
-    53, // ovfl
-    51, // unfl
-    50, // dz
-    49, // inex2
-    49, // inex1
-};
-
-static void throw_fpu_pre_instruction_exception(enum fpu_vector_t vector)
-{
-    throw_frame_zero(shoe.orig_sr, shoe.orig_pc, vector);
-}
-/*
- * Note: I may be able to get away without implementing the
- *       mid-instruction exception.
- */
-
-/*
- * _bsun_test() is called by every inst_f*cc instruction
- * to test whether the bsun exception is enabled, throw an
- * exception if so, and otherwise just set the appropriate
- * bit in fpsr, and update the accrued exception byte.
- */
-static _Bool _bsun_test()
-{
-    fpu_get_state_ptr();
-    
-    // BSUN counts against the IOP accrued exception bit
-    ae_iop = 1;
-    
-    // Set the BSUN exception status bit
-    es_bsun = 1;
-    
-    // If the BSUN exception isn't enabled, then we can just return
-    if (!ee_bsun)
-        return 0; // 0 -> elected not to throw an exception
-    
-    throw_fpu_pre_instruction_exception(fpu_vector_bsun);
-    return 1;
-}
-
-static void _throw_illegal_instruction()
-{
-    assert(!"throw_illegal_instruction!");
-}
-
-#pragma mark Float format translators (to/from big-endian motorola format)
-
-static void _floatx80_to_int8(floatx80 *f, uint8_t *ptr)
-{
-    uint32_t tmp = floatx80_to_int32(*f);
-    ptr[0] = tmp & 0xff;
-}
-
-static void _floatx80_to_int16(floatx80 *f, uint8_t *ptr)
-{
-    uint32_t tmp = floatx80_to_int32(*f);
-    ptr[0] = (tmp >> 8) & 0xff;
-    ptr[1] = (tmp >> 0) & 0xff;
-}
-
-static void _floatx80_to_int32(floatx80 *f, uint8_t *ptr)
-{
-    uint32_t tmp = floatx80_to_int32(*f);
-    ptr[0] = (tmp >> 24) & 0xff;
-    ptr[1] = (tmp >> 16) & 0xff;
-    ptr[2] = (tmp >> 8) & 0xff;
-    ptr[3] = (tmp >> 0) & 0xff;
-}
-
-static void _floatx80_to_single(floatx80 *f, uint8_t *ptr)
-{
-    const float32 tmp = floatx80_to_float32(*f);
-    ptr[0] = (tmp >> 24) & 0xff;
-    ptr[1] = (tmp >> 16) & 0xff;
-    ptr[2] = (tmp >> 8) & 0xff;
-    ptr[3] = (tmp >> 0) & 0xff;
-}
-
-static void _floatx80_to_double(floatx80 *f, uint8_t *ptr)
-{
-    const float64 tmp = floatx80_to_float64(*f);
-    ptr[0] = (tmp >> 56) & 0xff;
-    ptr[1] = (tmp >> 48) & 0xff;
-    ptr[2] = (tmp >> 40) & 0xff;
-    ptr[3] = (tmp >> 32) & 0xff;
-    ptr[4] = (tmp >> 24) & 0xff;
-    ptr[5] = (tmp >> 16) & 0xff;
-    ptr[6] = (tmp >> 8) & 0xff;
-    ptr[7] = (tmp >> 0) & 0xff;
-}
-
-static void _floatx80_to_extended(floatx80 *f, uint8_t *ptr)
-{
-    ptr[0] = (f->high >> 8) & 0xff;
-    ptr[1] = (f->high >> 0) & 0xff;
-    ptr[2] = 0;
-    ptr[3] = 0;
-    ptr[4] = (f->low >> 56) & 0xff;
-    ptr[5] = (f->low >> 48) & 0xff;
-    ptr[6] = (f->low >> 40) & 0xff;
-    ptr[7] = (f->low >> 32) & 0xff;
-    ptr[8] = (f->low >> 24) & 0xff;
-    ptr[9] = (f->low >> 16) & 0xff;
-    ptr[10] = (f->low >> 8) & 0xff;
-    ptr[11] = (f->low >> 0) & 0xff;
-}
-
-static float128 _int8_to_intermediate(int8_t byte)
-{
-    return int32_to_float128((int32_t)byte);
-}
-
-static float128 _int16_to_intermediate(int16_t sh)
-{
-    return int32_to_float128((int32_t)sh);
-}
-
-static float128 _int32_to_intermediate(int32_t in)
-{
-    return int32_to_float128(in);
-}
-
-static float128 _single_to_intermediate(uint32_t f)
-{
-    assert(sizeof(uint32_t) == sizeof(float32));
-    return float32_to_float128((float32)f);
-}
-
-/*
- * _double_to_intermediate(d): d needs to be 68k-native order (8 bytes)
- */
-static float128 _double_to_intermediate(uint8_t *d)
-{
-    assert(sizeof(uint64_t) == sizeof(float64));
-    
-    return float64_to_float128((float64) ntohll(*(uint64_t*)d));
-}
-
-/*
- * _extended_to_intermediate(e): e needs to be 68k-native order (12 bytes)
- */
-static float128 _extended_to_intermediate(uint8_t *e)
-{
-    /*
-     * softfloat floatx80 format:
-     * uint64_t low; // the low part of the extended float (significand, low exponent bits)
-     * uint16_t high; // the high part, sign, high exponent bits
-     */
-    floatx80 x80 = {
-        .high = (e[0] << 8) | e[1],
-        .low = ntohll(*(uint64_t*)&e[4])
-    };
-    return floatx80_to_float128(x80);
-}
-
-static void _extended_to_floatx80(uint8_t *bytes, floatx80 *f)
-{
-    f->high = (bytes[0] << 8) | bytes[1];
-    f->low = ntohll(*(uint64_t*)&bytes[4]);
-}
-
-/*
- * Set softfloat's rounding mode
- * (fpcr.mc_rnd and softfloat use different values for these modes)
- */
-static void _set_rounding_mode(enum rounding_mode_t mode)
-{
-    const int8_t rounding_map[4] = {
-        float_round_nearest_even, float_round_to_zero,
-        float_round_up, float_round_down
-    };
-    
-    float_rounding_mode = rounding_map[mode];
-}
-
-#pragma mark EA routines
-
-/*
- * Read-commit merely updates the address register
- * for pre/post-inc/decrement
- */
-static void _fpu_read_ea_commit(const uint8_t format)
-{
-    ~decompose(shoe.op, 0000 0000 00 mmmrrr);
-    
-    if (m == 3) // post-increment
-        shoe.a[r] += _format_sizes[format];
-    else if (m == 4) // pre-decrement
-        shoe.a[r] -= _format_sizes[format];
-    
-    /* 
-     * Note: still unsure about what happens when
-     *       mode=pre/postincdecrement, size==1, and register==a7
-     *       (is the change +-2 bytes? or 1?)
-     */
-    if (((m == 3) || (m == 4)) && (_format_sizes[format] == 1) && (r == 7))
-        assert(!"size==1, reg==a7");
-}
-
-static void _fpu_write_ea(uint8_t mr, uint8_t format, floatx80 *f, uint8_t K)
-{
-    fpu_get_state_ptr();
-    
-    const uint8_t m = mr >> 3;
-    const uint8_t r = mr & 7;
-    const uint8_t size = _format_sizes[format];
-    uint8_t buf[12], *ptr = &buf[0];
-    uint32_t addr, i;
-    
-    if ((m == 1) ||
-        ((m == 0) && (size > 4))) {
-        /* If mode==a-reg, or mode==data reg and the size is > 4 bytes, no dice */
-        _throw_illegal_instruction();
-        return ;
-    }
-    else if ((m == 7) && (r > 1)) {
-        /* If this is otherwise an illegal addr mode... */
-        _throw_illegal_instruction();
-        return ;
-    }
-    
-    const _Bool is_nan = ((f->high << 1) == 0xfffe) && f->low;
-    
-    slog("inst_f fpu_write_ea EA=%u/%u data=%Lf format=%u\n", m, r, 666.0L, format);
-    
-    /* Initialize softfloat's exceptions bits/rounding mode */
-    
-    float_exception_flags = 0;
-    _set_rounding_mode(mc_rnd);
-    
-    /* Convert to the appropriate format */
-    
-    switch (format) {
-        case format_B: {
-            _floatx80_to_int8(f, ptr);
-            break;
-        }
-        case format_W: {
-            _floatx80_to_int16(f, ptr);
-            break;
-        }
-        case format_L: {
-            _floatx80_to_int32(f, ptr);
-            break;
-        }
-        case format_S: {
-            _floatx80_to_single(f, ptr);
-            break;
-        }
-        case format_D: {
-            _floatx80_to_double(f, ptr);
-            break;
-        }
-        case format_X: {
-            _floatx80_to_extended(f, ptr);
-            break;
-        }
-        default: {
-            assert(!"unsupported format (packed something!)");
-        }
-    }
-    
-    /* Write to memory */
-    
-    switch (m) {
-        case 0: {
-            if (format == format_B)
-                set_d(r, ptr[0], 1);
-            else if (format == format_W)
-                set_d(r, ntohs(*(uint16_t*)ptr), 2);
-            else if ((format == format_L) || (format == format_S))
-                set_d(r, ntohl(*(uint32_t*)ptr), 4);
-            else
-                assert(!"how did I get here?");
-            goto done;
-        }
-        case 1:
-            assert(!"how did I get here again!");
-            
-        case 2:
-            addr = shoe.a[r];
-            break;
-        case 3:
-            addr = shoe.a[r];
-            assert(!( r==7 && size==1));
-            break;
-        case 4: // pre-decrement
-            addr = shoe.a[r] - size;
-            assert(!( r==7 && size==1));
-            break;
-        default:
-            call_ea_addr(mr);
-            addr = (uint32_t)shoe.dat;
-            break;
-    }
-    
-    /* Copy the formatted data into *addr */
-    
-    slog("inst_f  fpu_write_ea: addr=0x08x data=0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
-         addr,
-         ptr[0], ptr[1], ptr[2], ptr[3],
-         ptr[4], ptr[5], ptr[6], ptr[7],
-         ptr[8], ptr[9], ptr[10], ptr[11]);
-    
-    for (i=0; i<size; i++) {
-        lset(addr + i, 1, buf[i]);
-        if (shoe.abort) return ;
-    }
-    
-    
-done:
-    /* 
-     * Set exception bits and update pre/post/blah registers.
-     * note: condition codes are not modified
-     */
-    
-    es_bsun = 0;
-    es_snan = 0;
-    es_operr = 0;
-    es_ovfl = 0;
-    es_unfl = 0;
-    es_dz = 0;
-    es_inex2 = 0;
-    es_inex1 = 0;
-    
-    switch (format) {
-        format_B:
-        format_W:
-        format_L:
-            /* Set snan, operr, and/or inex2 */
-            es_snan = is_nan;
-            es_operr = ((float_exception_flags & float_flag_invalid) != 0);
-            es_inex2 = ((float_exception_flags & float_flag_inexact) != 0);
-            break;
-        
-        format_S:
-        format_D:
-        format_X:
-            /* Set snan, ovfl, unfl, and/or inex2 */
-            es_snan = is_nan;
-            es_ovfl = ((float_exception_flags & float_flag_overflow) != 0);
-            es_unfl = ((float_exception_flags & float_flag_underflow) != 0);
-            es_inex2 = ((float_exception_flags & float_flag_inexact) != 0);
-            break;
-            
-        format_Pd:
-        format_Ps:
-            /* Set snan, operr, and/or inex2 */
-            assert(!"you better implement packed formats");
-            break;
-    }
-    
-    /* Update the accrued exception bits */
-    ae_iop |= es_bsun | es_snan | es_operr;
-    ae_ovfl |= es_ovfl;
-    ae_unfl |= (es_unfl & es_inex2); // yes, &
-    ae_dz |= es_dz;
-    ae_inex |= es_inex1 | es_inex2 | es_ovfl;
-    
-    /* Are any exceptions both set and enabled? */
-    if (fpu->fpsr.raw & fpu->fpcr.raw & 0x0000ff00) {
-        /*
-         * Then we need to throw an exception.
-         * The exception is sent to the vector for
-         * the highest priority exception, and the priority
-         * order is (high->low) bsan, snan, operr, ovfl, unfl, dz, inex2/1
-         * (which is the order of the bits in fpsr/fpcr).
-         * Iterate over the bits in order, and throw the
-         * exception to whichever bit is set first.
-         */
-        uint8_t i, throwable = (fpu->fpsr.raw & fpu->fpcr.raw) >> 8;
-        
-        assert(throwable);
-        for (i=0; 1; i++) {
-            if (throwable & 0x80)
-                break;
-            throwable <<= 1;
-        }
-        
-        /*
-         * Convert the exception bit position
-         * to the correct vector number, and throw
-         * a (pre-instruction) exception.
-         */
-        throw_fpu_pre_instruction_exception(_exception_bit_to_vector[i]);
-        
-        return ;
-    }
-    
-    /* Finalize registers, and we're done */
-    
-    if (m == 3)
-        shoe.a[r] += size;
-    else if (m == 4)
-        shoe.a[r] -= size;
-}
-
-/*
- * Note: fpu_read_ea modifies shoe.pc, and fpu_read_ea_commit
- *        modifies shoe.a[x] for pre/post-inc/decrement
- * Returns false if we're aborting
- */
-static _Bool _fpu_read_ea(const uint8_t format, float128 *result)
-{
-    fpu_get_state_ptr();
-    
-    ~decompose(shoe.op, 0000 0000 00 mmmrrr);
-    
-    const uint8_t size = _format_sizes[format];
-    uint32_t addr = 0;
-    
-    /*
-     * Step 1: find the effective address, store it in addr
-     *         (or the actual data, if unavailable)
-     */
-    
-    slog("FPU: read_ea: mr=%u%u f=%c ", m, r, "lsxpwdb?"[format]);
-    
-    switch (m) {
-        case 0:
-            if (format == format_S)
-                *result = _single_to_intermediate(shoe.d[r]);
-            else if (format == format_B)
-                *result = _int8_to_intermediate(shoe.d[r] & 0xff);
-            else if (format == format_W)
-                *result = _int16_to_intermediate(shoe.d[r] & 0xffff);
-            else if (format == format_L)
-                *result = int32_to_float128(shoe.d[r]);
-            else {
-                /*
-                 * No other format can be used with a data register
-                 * (because they require >4 bytes)
-                 */
-                _throw_illegal_instruction();
-                return 0;
-            }
-            slog("raw=0x%x", chop(shoe.d[r], size));
-            goto got_data;
-            
-        case 1:
-            /* Address regisers can't be used */
-            _throw_illegal_instruction();
-            return 0;
-        
-        case 3:
-            addr = shoe.a[r];
-            assert(!( r==7 && size==1));
-            goto got_address;
-            
-        case 4:
-            addr = shoe.a[r] - size;
-            assert(!( r==7 && size==1));
-            goto got_address;
-            
-        case 7:
-            if (r == 4) {
-                addr = shoe.pc;
-                shoe.pc += size;
-                goto got_address;
-            }
-            
-            // fall through to default:
-            
-        default: {
-            ~decompose(shoe.op, 0000 0000 00 MMMMMM);
-            shoe.mr = M;
-            ea_addr();
-            if (shoe.abort)
-                return 0;
-            
-            addr = (uint32_t)shoe.dat;
-            goto got_address;
-        }
-
-    }
-    
-got_address:
-    
-    /*
-     * Step 2: Load the data from the effective address
-     */
-    
-    slog("raw=0x");
-    if (size <= 4) {
-        const uint32_t raw = lget(addr, size);
-        if (shoe.abort)
-            return 0;
-        printf("%x ", raw);
-        switch (format) {
-            case format_B:
-                *result = _int8_to_intermediate(raw & 0xff);
-                break;
-            case format_W:
-                *result = _int16_to_intermediate(raw & 0xffff);
-                break;
-            case format_L:
-                *result = _int32_to_intermediate(raw);
-                break;
-            case format_S:
-                *result = _single_to_intermediate(raw);
-                break;
-            default:
-                assert(0); /* never get here */
-        }
-    }
-    else { // if (size > 4) -> if format is double, extended, or packed
-        uint8_t buf[12];
-        uint32_t i;
-        
-        for (i = 0; i < size; i++) {
-            buf[i] = lget(addr + i, 1);
-            slog("%02x", buf[i]);
-            if (shoe.abort)
-                return 0;
-        }
-        
-        switch (format) {
-            case format_D:
-                *result = _double_to_intermediate(buf);
-                break;
-            case format_X:
-                *result = _extended_to_intermediate(buf);
-                break;
-            case format_Ps:
-            // case format_Pd: // not possible as a src specifier
-                // FIXME: implement packed formats
-                assert(!"Somebody tried to use a packed format!\n");
-                // _throw_illegal_instruction();
-                // return 0;
-            default:
-                assert(0); // never get here
-        }
-    }
-    
-got_data:
-    printf("\n");
-    return 1;
-}
-
-#pragma mark Hacky low-precision transcendental implementations
-/*
- * s -> sign, e -> biased exponent
- * ma -> 48 high bits of the mantissa
- * mb -> 64 low bits of the mantissa
- */
-#define _assemble_float128(s, e, ma, mb) ({ \
-    const uint64_t _ma = (ma), _mb = (mb); \
-    const uint64_t _e = (e), _s = (s); \
-    float128 f = { \
-        .high = ((_s != 0) << 16) | (_e & 0x7fff), \
-        .low = _mb \
-    }; \
-    f.high = ((f.high) << 48) | _ma; \
-    f; \
-})
-
-#define HACKY_MATH_X86
-#ifdef HACKY_MATH_X86
-#define NATIVE double
-
-double _to_native(float128 f128)
-{
-    float64 f64 = float128_to_float64(f128);
-    double result;
-    uint8_t *ptr = (uint8_t*)&result;
-    ptr[7] = (f64 >> 56) & 0xff;
-    ptr[6] = (f64 >> 48) & 0xff;
-    ptr[5] = (f64 >> 40) & 0xff;
-    ptr[4] = (f64 >> 32) & 0xff;
-    ptr[3] = (f64 >> 24) & 0xff;
-    ptr[2] = (f64 >> 16) & 0xff;
-    ptr[1] = (f64 >> 8) & 0xff;
-    ptr[0] = (f64 >> 0) & 0xff;
-    return result;
-}
-
-float128 _from_native(double n)
-{
-    float64 f64 = 0;
-    uint8_t *ptr = (uint8_t*)&n;
-    f64 = (f64 << 8) | ptr[7];
-    f64 = (f64 << 8) | ptr[6];
-    f64 = (f64 << 8) | ptr[5];
-    f64 = (f64 << 8) | ptr[4];
-    f64 = (f64 << 8) | ptr[3];
-    f64 = (f64 << 8) | ptr[2];
-    f64 = (f64 << 8) | ptr[1];
-    f64 = (f64 << 8) | ptr[0];
-    return float64_to_float128(f64);
-}
-
-#include <math.h>
-#define _native_cos(a) cos(a)
-#define _native_acos(a) acos(a)
-#define _native_cosh(a) cosh(a)
-#define _native_sin(a) sin(a)
-#define _native_asin(a) asin(a)
-#define _native_sinh(a) sinh(a)
-#define _native_tan(a) tan(a)
-#define _native_atan(a) atan(a)
-#define _native_tanh(a) tanh(a)
-#define _native_atanh(a) atanh(a)
-#define _native_pow(a, b) pow((a), (b))
-#define _native_exp(a) exp(a)
-#define _native_expm1(a) (exp(a) - 1.0) /* or expm1() */
-#define _native_log10(a) log10(a)
-#define _native_log2(a) (log(a) / log(2.0)) /* or log2() */
-#define _native_log(a) log(a)
-#define _native_log1p(a) log((a) + 1.0) /* or log1p() */
-
-const double _native_e = 2.71828182845904509;
-const double _native_10 = 10.0;
-const double _native_2 = 2.0;
-const double _native_1 = 1.0;
-
-#elif (defined(HACKY_MATH_PPC))
-#error "PowerPC hacky math isn't implemented yet"
-#else
-#error "You need to define HACKY_MATH_X86, or implement one for your arch"
-#endif
-
-static float128 _hack_cos (float128 x) {
-    return _from_native(_native_cos(_to_native(x)));
-}
-
-static float128 _hack_acos (float128 x) {
-    return _from_native(_native_acos(_to_native(x)));
-}
-
-static float128 _hack_cosh (float128 x) {
-    return _from_native(_native_cosh(_to_native(x)));
-}
-
-static float128 _hack_sin (float128 x) {
-    return _from_native(_native_sin(_to_native(x)));
-}
-
-static float128 _hack_asin (float128 x) {
-    return _from_native(_native_asin(_to_native(x)));
-}
-
-static float128 _hack_sinh (float128 x) {
-    return _from_native(_native_sinh(_to_native(x)));
-}
-
-static float128 _hack_tan (float128 x) {
-    return _from_native(_native_tan(_to_native(x)));
-}
-
-static float128 _hack_atan (float128 x) {
-    return _from_native(_native_atan(_to_native(x)));
-}
-
-static float128 _hack_tanh (float128 x) {
-    return _from_native(_native_tanh(_to_native(x)));
-}
-
-static float128 _hack_atanh (float128 x) {
-    return _from_native(_native_atanh(_to_native(x)));
-}
-
-static float128 _hack_etox (float128 x) {
-    return _from_native(_native_exp(_to_native(x)));
-}
-
-static float128 _hack_etoxm1 (float128 x) {
-    return _from_native(_native_expm1(_to_native(x)));
-}
-
-static float128 _hack_log10 (float128 x) {
-    return _from_native(_native_log10(_to_native(x)));
-}
-
-static float128 _hack_log2 (float128 x) {
-    return _from_native(_native_log2(_to_native(x)));
-}
-
-static float128 _hack_logn (float128 x) {
-    return _from_native(_native_log(_to_native(x)));
-}
-
-static float128 _hack_lognp1 (float128 x) {
-    return _from_native(_native_log1p(_to_native(x)));
-}
-
-static float128 _hack_tentox (float128 x) {
-    return _from_native(_native_pow(_native_10, _to_native(x)));
-}
-
-static float128 _hack_twotox (float128 x) {
-    return _from_native(_native_pow(_native_2, _to_native(x)));
-}
-
-#pragma mark FMATH! and all its helpers
-
-/* Function prototypes from SoftFloat/softfloat-specialize.h */
-char float128_is_nan(float128 a);
-char float128_is_signaling_nan (float128 a);
-
-
-static void inst_fmath_fmovecr (void)
-{
-    fpu_get_state_ptr();
-
-    /*
-     * FYI: these constants are stored in the "intermediate" 85-bit
-     *      format in the 6888x rom. This has the side effect that
-     *      they are rounded according to fpcr.mc_rnd.
-     *      We emulate the intermediate 85-bit format with float128.
-     */
-    
-    switch (fpu->fmath_op) {
-        case 0x00: // pi
-            fpu->result = _assemble_float128(0, 0x4000, 0x921fb54442d1, 0x8469898cc51701b8);
-            break;
-        case 0x0b: // log_10(2)
-            fpu->result = _assemble_float128(0, 0x3ffd, 0x34413509f79f, 0xef311f12b35816f9);
-            break;
-        case 0x0c: // e
-            fpu->result = _assemble_float128(0, 0x4000, 0x5bf0a8b14576, 0x95355fb8ac404e7a);
-            break;
-        case 0x0d: // log_2(e)
-            fpu->result = _assemble_float128(0, 0x3fff, 0x71547652b82f, 0xe1777d0ffda0d23a);
-            break;
-        case 0x0e: // log_10(e)
-            // NOTE: 68881 doesn't set inex2 for this one
-            // Also note: that's bogus. 68881 uses 3 trailing mantissa bits to do rounding,
-            // and those bits are non-zero for this number, so it must actually be stored
-            // incorrectly in the ROM.
-            // I'll emulate this by truncating the float128 mantissa.
-            
-            // fpu->result = _assemble_float128(0, 0x3ffd, 0xbcb7b1526e50, 0xe32a6ab7555f5a67);
-            fpu->result = _assemble_float128(0, 0x3ffd, 0xbcb7b1526e50, 0xe32a000000000000);
-            break;
-        case 0x0f: // 0.0
-            fpu->result = _assemble_float128(0, 0, 0, 0);
-            break;
-        case 0x30: // ln(2)
-            fpu->result = _assemble_float128(0, 0x3ffe, 0x62e42fefa39e, 0xf35793c7673007e5);
-            break;
-        case 0x31: // ln(10)
-            fpu->result = _assemble_float128(0, 0x4000, 0x26bb1bbb5551, 0x582dd4adac5705a6);
-            break;
-        case 0x32: // 1 (68kprm has typesetting issues everywhere. This one says 100, but means 10^0.)
-            fpu->result = _assemble_float128(0, 0x3fff, 0x0, 0x0);
-            break;
-        case 0x33: // 10
-            fpu->result = _assemble_float128(0, 0x4002, 0x400000000000, 0x0);
-            break;
-        case 0x34: // 10^2
-            fpu->result = _assemble_float128(0, 0x4005, 0x900000000000, 0x0);
-            break;
-        case 0x35: // 10^4
-            fpu->result = _assemble_float128(0, 0x400c, 0x388000000000, 0x0);
-            break;
-        case 0x36: // 10^8
-            fpu->result = _assemble_float128(0, 0x4019, 0x7d7840000000, 0x0);
-            break;
-        case 0x37: // 10^16
-            fpu->result = _assemble_float128(0, 0x4034, 0x1c37937e0800, 0x0);
-            break;
-        case 0x38: // 10^32
-            fpu->result = _assemble_float128(0, 0x4069, 0x3b8b5b5056e1, 0x6b3be04000000000);
-            break;
-        case 0x39: // 10^64
-            fpu->result = _assemble_float128(0, 0x40d3, 0x84f03e93ff9f, 0x4daa797ed6e38ed6);
-            break;
-        case 0x3a: // 10^128
-            fpu->result = _assemble_float128(0, 0x41a8, 0x27748f9301d3, 0x19bf8cde66d86d62);
-            break;
-        case 0x3b: // 10^256
-            fpu->result = _assemble_float128(0, 0x4351, 0x54fdd7f73bf3, 0xbd1bbb77203731fd);
-            break;
-        case 0x3c: // 10^512
-            fpu->result = _assemble_float128(0, 0x46a3, 0xc633415d4c1d, 0x238d98cab8a978a0);
-            break;
-        case 0x3d: // 10^1024
-            fpu->result = _assemble_float128(0, 0x4d48, 0x92eceb0d02ea, 0x182eca1a7a51e316);
-            break;
-        case 0x3e: // 10^2048
-            fpu->result = _assemble_float128(0, 0x5a92, 0x3d1676bb8a7a, 0xbbc94e9a519c6535);
-            break;
-        case 0x3f: // 10^4096
-            fpu->result = _assemble_float128(0, 0x7525, 0x88c0a4051441, 0x2f3592982a7f0094);
-            break;
-        default:
-            /*
-             * I wanted to include the actual values for the other ROM offsets,
-             * but they might be proprietary. Most of them are 0 anyways, and some
-             * cause FPU exceptions, even with all exceptions disabled... (?)
-             * 68040 FPSP just returns 0, so we'll do that too.
-             */
-            fpu->result = _assemble_float128(0, 0, 0, 0);
-            return ;
-    }
-}
-
-/*
- * This is quick macro.pl macro to build a jump table
- * for fmath instructions. It is probably slightly slower 
- * to use a jump table rather than a big switch statement,
- * but I think it looks cleaner.
- */
-~newmacro(create_fmath_jump_table, 0, {
-    my $name_map = {};
-    my $op_map = {};
-    my $add = sub {
-        my $op = shift;
-        my $name = lc(shift);
-        my $mode = 'foo';
-        my $arch = 68881;
-        
-        foreach my $arg (@_) {
-            if (($arg eq 'monadic') or ($arg eq 'dyadic')) {
-                $mode = $arg;
-            }
-            elsif ($arg == 68040) {
-                $arch = $arg;
-            }
-            else {
-                croak("bad arg $arg");
-            }
-        }
-        
-        croak("didn't specify mode") if ($mode eq "foo");
-        croak("dup $op $name") if exists $op_map->{$op};
-        croak("bogus") if ($op > 127);
-        
-        $op_map->{$op} = {op => $op, name => $name, mode => $mode, arch => $arch};
-        $name_map->{$name} = $op_map->{$op};
-    };
-    
-    $add->(~b(1000000), 'fmove',    'monadic', 68040);
-    $add->(~b(1000100), 'fmove',    'monadic', 68040);
-    $add->(~b(0000000), 'fmove',    'monadic');
-    
-    $add->(~b(0000001), 'fint',     'monadic');
-    $add->(~b(0000010), 'fsinh',    'monadic');
-    $add->(~b(0000011), 'fintrz',   'monadic');
-    
-    $add->(~b(1000001), 'fsqrt',    'monadic', 68040);
-    $add->(~b(1000101), 'fsqrt',    'monadic', 68040);
-    $add->(~b(0000100), 'fsqrt',    'monadic');
-    
-    $add->(~b(0000110), 'flognp1',  'monadic');
-    $add->(~b(0001000), 'fetoxm1',  'monadic');
-    $add->(~b(0001001), 'ftanh',    'monadic');
-    $add->(~b(0001010), 'fatan',    'monadic');
-    $add->(~b(0001100), 'fasin',    'monadic');
-    $add->(~b(0001101), 'fatanh',   'monadic');
-    $add->(~b(0001110), 'fsin',     'monadic');
-    $add->(~b(0001111), 'ftan',     'monadic');
-    $add->(~b(0010000), 'fetox',    'monadic');
-    $add->(~b(0010001), 'ftwotox',  'monadic');
-    $add->(~b(0010010), 'ftentox',  'monadic');
-    $add->(~b(0010100), 'flogn',    'monadic');
-    $add->(~b(0010101), 'flog10',   'monadic');
-    $add->(~b(0010110), 'flog2',    'monadic');
-    
-    $add->(~b(1011000), 'fabs',     'monadic', 68040);
-    $add->(~b(1011100), 'fabs',     'monadic', 68040);
-    $add->(~b(0011000), 'fabs',     'monadic');
-    
-    $add->(~b(0011001), 'fcosh',    'monadic');
-    
-    $add->(~b(1011010), 'fneg',     'monadic', 68040);
-    $add->(~b(1011110), 'fneg',     'monadic', 68040);
-    $add->(~b(0011010), 'fneg',     'monadic');
-    
-    $add->(~b(0011100), 'facos',    'monadic');
-    $add->(~b(0011101), 'fcos',     'monadic');
-    $add->(~b(0011110), 'fgetexp',  'monadic');
-    $add->(~b(0011111), 'fgetman',  'monadic');
-    
-    $add->(~b(1100000), 'fdiv',     'dyadic', 68040);
-    $add->(~b(1100100), 'fdiv',     'dyadic', 68040);
-    $add->(~b(0100000), 'fdiv',     'dyadic');
-    
-    $add->(~b(0100001), 'fmod',     'dyadic');
-    
-    $add->(~b(1100010), 'fadd',     'dyadic', 68040);
-    $add->(~b(1100110), 'fadd',     'dyadic', 68040);
-    $add->(~b(0100010), 'fadd',     'dyadic');
-    
-    $add->(~b(1100011), 'fmul',     'dyadic', 68040);
-    $add->(~b(1100111), 'fmul',     'dyadic', 68040);
-    $add->(~b(0100011), 'fmul',     'dyadic');
-    
-    $add->(~b(0100100), 'fsgldiv',  'dyadic');
-    $add->(~b(0100101), 'frem',     'dyadic');
-    $add->(~b(0100110), 'fscale',   'dyadic');
-    $add->(~b(0100111), 'fsglmul',  'dyadic');
-    
-    $add->(~b(1101000), 'fsub',     'dyadic', 68040);
-    $add->(~b(1101100), 'fsub',     'dyadic', 68040);
-    $add->(~b(0101000), 'fsub',     'dyadic');
-    
-    $add->(~b(0110000), 'fsincos',  'monadic');
-    $add->(~b(0110001), 'fsincos',  'monadic');
-    $add->(~b(0110010), 'fsincos',  'monadic');
-    $add->(~b(0110011), 'fsincos',  'monadic');
-    $add->(~b(0110100), 'fsincos',  'monadic');
-    $add->(~b(0110101), 'fsincos',  'monadic');
-    $add->(~b(0110110), 'fsincos',  'monadic');
-    $add->(~b(0110111), 'fsincos',  'monadic');
-    
-    
-    $add->(~b(0111000), 'fcmp',     'dyadic');
-    $add->(~b(0111010), 'ftst',     'monadic');
-    
-    my $map_str = "fmath_impl_t *_fmath_map[128] = {\n";
-    my @inst_flags = (0) x 128;
-    
-    for (my $i=0; $i < 128; $i++) {
-        my $func_ptr = "NULL";
-        if (exists $op_map->{$i}) {
-            $func_ptr = 'inst_fmath_' . $op_map->{$i}->{name};
-            if ($op_map->{$i}->{mode} eq 'dyadic') {
-                $inst_flags[$i] |= 1;
-            }
-            if ($op_map->{$i}->{arch} == 68040) {
-                $inst_flags[$i] |= 2;
-            }
-        }
-
-        $map_str .= "\t" . $func_ptr . ",\n";
-    }
-    $map_str .= "};\n\nuint8_t _fmath_flags[128] = {\n";
-    
-    for (my $i=0; $i < 128; $i++) {
-        $map_str .= "\t" . sprintf('0x%02x', $inst_flags[$i]) . ",\n";
-    }
-    $map_str .= "};\n";
-    
-    $map_str .= "const char *_fmath_names[128] = {\n";
-    for (my $i=0; $i < 128; $i++) {
-        my $name = "f???";
-        if (exists $op_map->{$i}) {
-            $name = $op_map->{$i}->{name};
-        }
-        $map_str .= "\t\"" . $name . "\",\n";
-    }
-    $map_str .= "};\n";
-    
-    return $map_str;
-})
-
-static _Bool _float128_is_zero (float128 f)
-{
-    return ((f.high << 1) == 0) && (f.low == 0);
-}
-
-static _Bool _float128_is_neg (float128 f)
-{
-    return f.high >> 63;
-}
-
-static _Bool _float128_is_infinity (float128 f)
-{
-    const uint64_t frac_a = f.high & 0x0000ffffffffffff;
-    const uint64_t frac_b = f.low;
-    const uint16_t exp = (f.high >> 48) & 0x7fff;
-    
-    return (exp == 0x7fff) && ((frac_a | frac_b) == 0);
-}
-
-static _Bool _float128_is_nan (float128 f)
-{
-    const uint64_t frac_a = f.high & 0x0000ffffffffffff;
-    const uint64_t frac_b = f.low;
-    const uint16_t exp = (f.high >> 48) & 0x7fff;
-    
-    return (exp == 0x7fff) && ((frac_a | frac_b) != 0);
-}
-
-const float128 _nan128 = {
-    .high = 0xFFFF800000000000ULL,
-    .low = 0
-};
-
-const float128 _one128 = {
-    .high = 0x3fff000000000000ULL,
-    .low = 0
-};
-
-const float128 _zero128 = {
-    .high = 0,
-    .low = 0
-};
-
-static void inst_fmath_fabs ()
-{
-    fpu_get_state_ptr();
-    
-    /* Clear the sign bit */
-    fpu->result = fpu->source;
-    fpu->result.high <<= 1;
-    fpu->result.high >>= 1;
-}
-
-static void inst_fmath_facos ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    
-    /* Find the absolute value of source */
-    float128 tmp = fpu->source;
-    tmp.high <<= 1;
-    tmp.high >>= 1;
-    
-    /* If source is zero, result is +pi/2 */
-    if (source_zero) {
-        fpu->result = _assemble_float128(0, 0x3fff, 0x921fb54442d1, 0x8469898cc51701b8);
-        return;
-    }
-    /* If source isn't in range [-1, 1], return nan, set operr */
-    else if (!float128_le(tmp, _one128)) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return ;
-    }
-    
-    fpu->result = _hack_acos(fpu->source);
-    /* Set inex2?? */
-}
-
-static void inst_fmath_fadd ()
-{
-    fpu_get_state_ptr();
-    
-    fpu->result = float128_add(fpu->dest, fpu->source);
-    
-    /* 
-     * Throw operr (and return NaN) if operands are infinities
-     * with opposite signs. (I *think* softfloat is doing this
-     * corectly - the code's hard to read.)
-     */
-    if (float_exception_flags & float_flag_invalid)
-        es_operr = 1;
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-    
-    /* Throw ovfl if the op overflowed */
-    if (float_exception_flags & float_flag_overflow)
-        es_ovfl = 1;
-    
-    /* Throw unfl if the op overflowed */
-    if (float_exception_flags & float_flag_underflow)
-        es_unfl = 1;
-}
-
-static void inst_fmath_fasin ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    
-    /* Find the absolute value of source */
-    float128 tmp = fpu->source;
-    tmp.high <<= 1;
-    tmp.high >>= 1;
-    
-    /* If source is zero, result is source */
-    if (source_zero) {
-        fpu->result = fpu->source;
-        return;
-    }
-    /* If source isn't in range [-1, 1], return nan, set operr */
-    else if (!float128_le(tmp, _one128)) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return ;
-    }
-    
-    fpu->result = _hack_asin(fpu->source);
-    /* Set inex2?? */
-    /* Set unfl?? */
-}
-
-static void inst_fmath_fatan ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    const _Bool source_sign = _float128_is_neg(fpu->source);
-    
-    /* If source is zero, result is source */
-    if (source_zero) {
-        fpu->result = fpu->source;
-        return;
-    }
-    /* If source is inf, result is +-pi/2 */
-    else if (source_inf) {
-        fpu->result = _assemble_float128(source_sign, 0x3fff, 0x921fb54442d1, 0x8469898cc51701b8);
-        return ;
-    }
-    
-    fpu->result = _hack_atan(fpu->source);
-    /* Set inex2?? */
-    /* Set unfl?? */
-}
-
-static void inst_fmath_fatanh ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    const _Bool source_sign = _float128_is_neg(fpu->source);
-    
-    /* Take the absolute value of source */
-    float128 tmp = fpu->source;
-    tmp.high <<= 1;
-    tmp.high >>= 1;
-    
-    /* If source is 0, return source */
-    if (source_zero) {
-        fpu->result = fpu->source;
-        return;
-    }
-    /* If |source| == 1.0, set dz, return +-inf */
-    else if (float128_eq(tmp, _one128)) {
-        es_dz = 1;
-        fpu->result = _assemble_float128(source_sign, 0x7fff, 0, 0);
-        return;
-    }
-    /* If |source| > 1.0, set operr, return nan */
-    else if (!float128_le(tmp, _one128)) {
-        es_operr = 1;
-        fpu->result = _nan128;
-        return ;
-    }
-    
-    fpu->result = _hack_atanh(fpu->source);
-}
-
-static void inst_fmath_fcmp ()
-{
-    fpu_get_state_ptr();
-    
-    /* Don't write the result back to the register */
-    fpu->write_back = 0;
-    
-    fpu->result = float128_sub(fpu->dest, fpu->source);
-    
-    /*
-     * The 68881 docs say fcmp doesn't throw any exceptions
-     * based on the result, but I'm not sure I believe it.
-     
-     if (float_exception_flags & float_flag_invalid)
-        es_operr = 1;
-     
-     if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-     */
-}
-
-static void inst_fmath_fcos ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    
-    /* If source is zero, result is +1.0 */
-    if (source_zero) {
-        fpu->result = _one128;
-        return;
-    }
-    /* If source is inf, result is nan, and set operr */
-    else if (source_inf) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return ;
-    }
-    
-    fpu->result = _hack_cos(fpu->source);
-    /* Set inex2?? */
-}
-
-static void inst_fmath_fcosh ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    
-    /* If source is zero, result is +1.0 */
-    if (source_zero) {
-        fpu->result = _one128;
-        return;
-    }
-    /* If source is +/- inf, result is +inf */
-    else if (source_inf) {
-        fpu->result = _assemble_float128(0, 0x7fff, 0, 0);
-        return;
-    }
-    
-    fpu->result = _hack_cosh(fpu->source);
-}
-
-static void inst_fmath_fdiv ()
-{
-    fpu_get_state_ptr();
-    
-    fpu->result = float128_div(fpu->dest, fpu->source);
-    
-    /* Throw operr (and return NaN) if both operands are zero */
-    if (float_exception_flags & float_flag_invalid)
-        es_operr = 1;
-    
-    /* Throw divide-by-zero if dividend is zero */
-    if (float_exception_flags & float_flag_divbyzero)
-        es_dz = 1;
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-    
-    /* Throw ovfl if the op overflowed */
-    if (float_exception_flags & float_flag_overflow)
-        es_ovfl = 1;
-    
-    /* Throw unfl if the op overflowed */
-    if (float_exception_flags & float_flag_underflow)
-        es_unfl = 1;
-}
-
-static void inst_fmath_fetox ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    const _Bool source_sign = _float128_is_neg(fpu->source);
-    
-    /* If source is zero, result is +1.0 */
-    if (source_zero) {
-        fpu->result = _one128;
-        return ;
-    }
-    /* if source is -inf, result is +0.0 */
-    else if (source_inf && source_sign) {
-        fpu->result = _zero128;
-        return ;
-    }
-    /* if source is +inf, result is +inf */
-    else if (source_inf) {
-        fpu->result = fpu->source;
-        return ;
-    }
-    
-    fpu->result = _hack_etox(fpu->source);
-}
-
-static void inst_fmath_fetoxm1 ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    const _Bool source_sign = _float128_is_neg(fpu->source);
-    
-    const float128 negone = _assemble_float128(1, 0x3fff, 0, 0);
-    
-    /* If source is zero, result is source */
-    if (source_zero) {
-        fpu->result = fpu->source;
-        return ;
-    }
-    /* if source is -inf, result is +0.0 */
-    else if (source_inf && source_sign) {
-        fpu->result = negone;
-        return ;
-    }
-    /* if source is +inf, result is +inf */
-    else if (source_inf) {
-        fpu->result = fpu->source;
-        return ;
-    }
-    
-    fpu->result = _hack_etoxm1(fpu->source);
-}
-
-static void inst_fmath_fgetexp ()
-{
-    fpu_get_state_ptr();
-    
-    /* If source is INF, set operr and return NaN */
-    if (((fpu->source.high << 1) == 0xfffe000000000000ULL) && (fpu->source.low == 0)) {
-        es_operr = 1;
-        fpu->result.high = 0xffff000000000000ULL;
-        fpu->result.low = 0xc000000000000000ULL;
-        return ;
-    }
-    
-    /*
-     * If source is 0, return source.
-     * According to 68881 docs, the result needs to have 
-     * the same sign as the source (why?)
-     */
-    if (((fpu->source.high << 1) == 0) && (fpu->source.low == 0)) {
-        fpu->result = fpu->source;
-        return ;
-    }
-    
-    /*
-     * Otherwise, extract the biased exponent, convert it
-     * to a two's complement integer, and store that value
-     * as a float.
-     */
-    const uint32_t biased = (fpu->source.high << 1) >> 49;
-    fpu->result = int32_to_float128(((int32_t)biased) - 16383);
-}
-
-static void inst_fmath_fgetman ()
-{
-    fpu_get_state_ptr();
-    
-    assert(!"fmath: fgetman not implemented");
-}
-
-static void inst_fmath_fint ()
-{
-    fpu_get_state_ptr();
-    
-    fpu->result = float128_round_to_int(fpu->source);
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-}
-
-static void inst_fmath_fintrz ()
-{
-    fpu_get_state_ptr();
-    
-    /* Same as fint, but force the round-to-zero mode */
-    
-    const signed char old_round_mode = float_rounding_mode;
-    float_rounding_mode = float_round_to_zero;
-    fpu->result = float128_round_to_int(fpu->source);
-    float_rounding_mode = old_round_mode;
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-    
-}
-
-static void inst_fmath_flog10 ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    const _Bool source_sign = _float128_is_neg(fpu->source);
-    
-    /* If source is zero, set dz, result is -inf */
-    if (source_zero) {
-        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
-        es_dz = 1;
-        return;
-    }
-    /* If source is negative, set operr, result is nan */
-    else if (source_sign) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return;
-    }
-    /* If source is +inf, result is +inf. */
-    else if (source_inf) {
-        fpu->result = fpu->source;
-        return;
-    }
-    
-    fpu->result = _hack_log10(fpu->source);
-}
-
-static void inst_fmath_flog2 ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    const _Bool source_sign = _float128_is_neg(fpu->source);
-    
-    /* If source is zero, set dz, result is -inf */
-    if (source_zero) {
-        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
-        es_dz = 1;
-        return;
-    }
-    /* If source is negative, set operr, result is nan */
-    else if (source_sign) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return;
-    }
-    /* If source is +inf, result is +inf. */
-    else if (source_inf) {
-        fpu->result = fpu->source;
-        return;
-    }
-    
-    fpu->result = _hack_log2(fpu->source);
-}
-
-static void inst_fmath_flognp1 ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    const _Bool source_sign = _float128_is_neg(fpu->source);
-    
-    const float128 negone = _assemble_float128(1, 0x3fff, 0, 0);
-    
-    /* If source is zero, result is source */
-    if (source_zero) {
-        fpu->result = fpu->source;
-        return;
-    }
-    /* If source is -1.0, set dz, result is -inf */
-    else if (float128_eq(negone, fpu->source)) {
-        es_dz = 1;
-        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
-        return;
-    }
-    /* If source < -1.0, set operr, result is nan */
-    else if (float128_lt(fpu->source, negone)) {
-        es_operr = 1;
-        fpu->result = _nan128;
-        return;
-    }
-    /* If source is +inf, result is +inf. */
-    else if (source_inf) {
-        fpu->result = fpu->source;
-        return;
-    }
-    
-    fpu->result = _hack_lognp1(fpu->source);
-}
-
-static void inst_fmath_flogn ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    const _Bool source_sign = _float128_is_neg(fpu->source);
-    
-    /* If source is zero, set dz, result is -inf */
-    if (source_zero) {
-        fpu->result = _assemble_float128(1, 0x7fff, 0, 0);
-        es_dz = 1;
-        return;
-    }
-    /* If source is negative, set operr, result is nan */
-    else if (source_sign) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return;
-    }
-    /* If source is +inf, result is +inf. */
-    else if (source_inf) {
-        fpu->result = fpu->source;
-        return;
-    }
-    
-    fpu->result = _hack_logn(fpu->source);
-}
-
-static void inst_fmath_fmove ()
-{
-    fpu_get_state_ptr();
-    
-    fpu->result = fpu->source;
-}
-
-static void inst_fmath_fmul ()
-{
-    fpu_get_state_ptr();
-    
-    fpu->result = float128_mul(fpu->dest, fpu->source);
-    
-    /*
-     * Throw operr (and return NaN) if one operand is infinity
-     * and the other is zero.
-     */
-    if (float_exception_flags & float_flag_invalid)
-        es_operr = 1;
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-    
-    /* Throw ovfl if the op overflowed */
-    if (float_exception_flags & float_flag_overflow)
-        es_ovfl = 1;
-    
-    /* Throw unfl if the op overflowed */
-    if (float_exception_flags & float_flag_underflow)
-        es_unfl = 1;
-}
-
-static void inst_fmath_fneg ()
-{
-    fpu_get_state_ptr();
-    
-    /* Flip the sign bit */
-    fpu->result = fpu->source;
-    fpu->result.high ^= (1ULL << 63);
-    
-    /* 
-     * FIXME: you're supposed to throw UNFL if this is a
-     *        denormalized number, I think.
-     */
-}
-
-static void inst_fmath_frem ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    const _Bool dest_zero = _float128_is_zero(fpu->dest);
-    const _Bool dest_inf = _float128_is_infinity(fpu->dest);
-    
-    /* I just assume the quotient/sign are 0 for the following cases */
-    qu_quotient = 0;
-    qu_s = 0;
-    
-    /* If source is zero, result is nan */
-    if (source_zero) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return ;
-    }
-    /* If dest (but not source) is zero, result is that zero */
-    else if (dest_zero) {
-        fpu->result = fpu->dest;
-        return ;
-    }
-    /* If dest is infinity, result is nan */
-    else if (dest_inf) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return ;
-    }
-    /* If source, but not dest, is infinity, result is dest */
-    else if (source_inf) {
-        fpu->result = fpu->dest;
-        return ;
-    }
-    
-    /* -- We're past the edge cases, do the actual op -- */
-    
-    const signed char old_round_mode = float_rounding_mode;
-    
-    /* frem uses round-to-nearest */
-    float_rounding_mode = float_round_nearest_even;
-    
-    float128 N = float128_div(fpu->dest, fpu->source);
-    N = float128_round_to_int(N);
-    
-    float_rounding_mode = old_round_mode;
-    
-    fpu->result = float128_sub(fpu->dest, float128_mul(fpu->source, N));
-    
-    /* FIXME: not sure how to set unfl reliably */
-    
-    _Bool sign = N.high >> 63; /* Remember the sign */
-    N.high <<= 1; /* Clear the sign */
-    N.high >>= 1;
-    uint32_t final = float128_to_int32(N); /* Get the integer of the quotient */
-    qu_quotient = final & 0x7f;
-    qu_s = sign;
-}
-
-static void inst_fmath_fmod ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    const _Bool dest_zero = _float128_is_zero(fpu->dest);
-    const _Bool dest_inf = _float128_is_infinity(fpu->dest);
-    
-    /* I just assume the quotient/sign are 0 for the following cases */
-    qu_quotient = 0;
-    qu_s = 0;
-    
-    /* If source is zero, result is nan */
-    if (source_zero) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return ;
-    }
-    /* If dest (but not source) is zero, result is that zero */
-    else if (dest_zero) {
-        fpu->result = fpu->dest;
-        return ;
-    }
-    /* If dest is infinity, result is nan */
-    else if (dest_inf) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return ;
-    }
-    /* If source, but not dest, is infinity, result is dest */
-    else if (source_inf) {
-        fpu->result = fpu->dest;
-        return ;
-    }
-    
-    /* -- We're past the edge cases, do the actual op -- */
-    
-    const signed char old_round_mode = float_rounding_mode;
-    
-    /* fmod uses round-to-zero */
-    float_rounding_mode = float_round_to_zero;
-    
-    float128 N = float128_div(fpu->dest, fpu->source);
-    N = float128_round_to_int(N);
-    
-    float_rounding_mode = old_round_mode;
-    
-    fpu->result = float128_sub(fpu->dest, float128_mul(fpu->source, N));
-    
-    /* FIXME: not sure how to set unfl reliably */
-    
-    _Bool sign = N.high >> 63; /* Remember the sign */
-    N.high <<= 1; /* Clear the sign */
-    N.high >>= 1;
-    uint32_t final = float128_to_int32(N); /* Get the integer of the quotient */
-    qu_quotient = final & 0x7f;
-    qu_s = sign;
-}
-
-static void inst_fmath_fscale ()
-{
-    fpu_get_state_ptr();
-    
-    assert(!"fmath: fscale not implemented");
-}
-
-static void inst_fmath_fsgldiv ()
-{
-    fpu_get_state_ptr();
-    
-    float128 source = fpu->source;
-    float128 dest = fpu->dest;
-    
-    /* Dump the low 88 bits of the source/dest mantissas */
-    source.low = 0;
-    source.high &= 0xffffffffff000000;
-    dest.low = 0;
-    dest.high &= 0xffffffffff000000;
-    
-    fpu->result = float128_div(dest, source);
-    
-    /* Throw operr (and return NaN) if both operands are zero */
-    if (float_exception_flags & float_flag_invalid)
-        es_operr = 1;
-    
-    /* Throw divide-by-zero if dividend is zero */
-    if (float_exception_flags & float_flag_divbyzero)
-        es_dz = 1;
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-    
-    /* Throw ovfl if the op overflowed */
-    if (float_exception_flags & float_flag_overflow)
-        es_ovfl = 1;
-    
-    /* Throw unfl if the op overflowed */
-    if (float_exception_flags & float_flag_underflow)
-        es_unfl = 1;
-}
-
-static void inst_fmath_fsglmul ()
-{
-    fpu_get_state_ptr();
-    
-    /*
-     * As far as I can tell, fsglmul/fsgldiv use an ALU
-     * for the mantissa that is only 24-bits wide. Everything
-     * else is done with regular internal precision.
-     */
-    
-    float128 source = fpu->source;
-    float128 dest = fpu->dest;
-    
-    /* Dump the low 88 bits of the source/dest mantissas */
-    source.low = 0;
-    source.high &= 0xffffffffff000000;
-    dest.low = 0;
-    dest.high &= 0xffffffffff000000;
-    
-    fpu->result = float128_mul(dest, source);
-    
-    /*
-     * Throw operr (and return NaN) if one operand is infinity
-     * and the other is zero.
-     */
-    if (float_exception_flags & float_flag_invalid)
-        es_operr = 1;
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-    
-    /* Throw ovfl if the op overflowed */
-    if (float_exception_flags & float_flag_overflow)
-        es_ovfl = 1;
-    
-    /* Throw unfl if the op overflowed */
-    if (float_exception_flags & float_flag_underflow)
-        es_unfl = 1;
-}
-
-static void inst_fmath_fsin ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    
-    /* If source is zero, result is source */
-    if (source_zero) {
-        fpu->result = fpu->source;
-        return;
-    }
-    /* If source is inf, result is nan, and set operr */
-    else if (source_inf) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return ;
-    }
-    
-    fpu->result = _hack_sin(fpu->source);
-    /* Set inex2?? */
-}
-
-static void inst_fmath_fsincos ()
-{
-    fpu_get_state_ptr();
-    
-    assert(!"fmath: fsincos not implemented");
-}
-
-static void inst_fmath_fsinh ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    
-    /* If source is zero or inf, return source */
-    if (source_zero || source_inf) {
-        fpu->result = fpu->source;
-        return;
-    }
-    
-    fpu->result = _hack_sinh(fpu->source);
-}
-
-static void inst_fmath_fsqrt ()
-{
-    fpu_get_state_ptr();
-    
-    fpu->result = float128_sqrt(fpu->source);
-    
-    /* Throw operr (and return NaN) if the operand is < 0 */
-    if (float_exception_flags & float_flag_invalid)
-        es_operr = 1;
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-}
-
-static void inst_fmath_fsub ()
-{
-    fpu_get_state_ptr();
-    
-    fpu->result = float128_sub(fpu->dest, fpu->source);
-    
-    /*
-     * Throw operr (and return NaN) if operands are infinities
-     * with equal signs. (I *think* softfloat is doing this
-     * corectly - the code's hard to read.)
-     *
-     * Both 68kprm and 68881 docs say that (+inf) - (-inf) = (-inf)
-     * but I presume that's a typo, and it's supposed to be (+inf)
-     */
-    if (float_exception_flags & float_flag_invalid)
-        es_operr = 1;
-    
-    /* Throw inex2 if the result is inexact */
-    if (float_exception_flags & float_flag_inexact)
-        es_inex2 = 1;
-    
-    /* Throw ovfl if the op overflowed */
-    if (float_exception_flags & float_flag_overflow)
-        es_ovfl = 1;
-    
-    /* Throw unfl if the op overflowed */
-    if (float_exception_flags & float_flag_underflow)
-        es_unfl = 1;
-}
-
-static void inst_fmath_ftan ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    
-    /* If source is zero, result is source */
-    if (source_zero) {
-        fpu->result = fpu->source;
-        return;
-    }
-    /* If source is inf, result is nan, and set operr */
-    else if (source_inf) {
-        fpu->result = _nan128;
-        es_operr = 1;
-        return ;
-    }
-    
-    fpu->result = _hack_tan(fpu->source);
-    /* Set inex2?? */
-}
-
-static void inst_fmath_ftanh ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    const _Bool source_sign = _float128_is_neg(fpu->source);
-    
-    /* If source is zero, result is source */
-    if (source_zero) {
-        fpu->result = fpu->source;
-        return;
-    }
-    /* If source is +/- inf, result is +/- 1.0 */
-    else if (source_inf) {
-        fpu->result = _assemble_float128(source_sign, 0x3fff, 0, 0);
-        return;
-    }
-    
-    fpu->result = _hack_tanh(fpu->source);
-}
-
-static void inst_fmath_ftentox ()
-{
-    fpu_get_state_ptr();
-    
-    // _hack_ftentox() is broken on clang 3.5 on osx 10.10
-    // (tries to optimize pow(10.0, x) to __exp10(x), and __exp10
-    //  isn't implemented in the 10.8 SDK)
-//    const _Bool source_zero = _float128_is_zero(fpu->source);
-//    const _Bool source_inf = _float128_is_infinity(fpu->source);
-//    const _Bool source_sign = _float128_is_neg(fpu->source);
-//    
-//    /* If source is zero, result is +1.0 */
-//    if (source_zero) {
-//        fpu->result = _one128;
-//        return ;
-//    }
-//    /* if source is -inf, result is +0.0 */
-//    else if (source_inf && source_sign) {
-//        fpu->result = _zero128;
-//        return ;
-//    }
-//    /* if source is +inf, result is +inf */
-//    else if (source_inf) {
-//        fpu->result = fpu->source;
-//        return ;
-//    }
-//    
-//    fpu->result = _hack_tentox(fpu->source);
-//    
-    assert(!"fmath: ftentox not implemented");
-}
-
-static void inst_fmath_ftst ()
-{
-    fpu_get_state_ptr();
-    
-    /* Don't write the result back to the register */
-    fpu->write_back = 0;
-    
-    /* ftst just sets the cond codes according to the source */
-    fpu->result = fpu->source;
-}
-
-static void inst_fmath_ftwotox ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool source_zero = _float128_is_zero(fpu->source);
-    const _Bool source_inf = _float128_is_infinity(fpu->source);
-    const _Bool source_sign = _float128_is_neg(fpu->source);
-    
-    /* If source is zero, result is +1.0 */
-    if (source_zero) {
-        fpu->result = _one128;
-        return ;
-    }
-    /* if source is -inf, result is +0.0 */
-    else if (source_inf && source_sign) {
-        fpu->result = _zero128;
-        return ;
-    }
-    /* if source is +inf, result is +inf */
-    else if (source_inf) {
-        fpu->result = fpu->source;
-        return ;
-    }
-    
-    fpu->result = _hack_twotox(fpu->source);
-}
-
-typedef void (fmath_impl_t)(void);
-#define FMATH_TYPE_DYADIC 1
-#define FMATH_TYPE_68040 2
-~create_fmath_jump_table()
-
-
-/*
- * Take fpu->result, and round and crop it to the
- * preferred precision, then return the result as
- * a floatx80. (Set all the appropriate exception bits
- * too)
- *
- * ALSO!! This checks and sets underflow/overflow
- */
-static floatx80 _fmath_round_intermediate_result ()
-{
-    fpu_get_state_ptr();
-    floatx80 final;
-    
-    float_exception_flags = 0; // (so we can know if the result is inexact)
-    _set_rounding_mode(mc_rnd); // Set the preferred rounding mode
-    
-    if (mc_prec == prec_extended) { // extended precision
-        final = float128_to_floatx80(fpu->result);
-        es_inex2 |= ((float_exception_flags & float_flag_inexact) != 0);
-        es_unfl |= ((float_exception_flags & float_flag_underflow) != 0);
-        es_ovfl |= ((float_exception_flags & float_flag_overflow) != 0);
-    }
-    else if (mc_prec == prec_double) { // double precision
-        float64 tmp = float128_to_float64(fpu->result);
-        es_inex2 |= ((float_exception_flags & float_flag_inexact) != 0);
-        es_unfl |= ((float_exception_flags & float_flag_underflow) != 0);
-        es_ovfl |= ((float_exception_flags & float_flag_overflow) != 0);
-        final = float64_to_floatx80(tmp);
-    }
-    else if (mc_prec == prec_single) { // single precision
-        float32 tmp = float128_to_float32(fpu->result);
-        es_inex2 |= ((float_exception_flags & float_flag_inexact) != 0);
-        es_unfl |= ((float_exception_flags & float_flag_underflow) != 0);
-        es_ovfl |= ((float_exception_flags & float_flag_overflow) != 0);
-        final = float32_to_floatx80(tmp);
-    }
-    else
-        assert(!"bogus precision mode???");
-    
-    return final;
-}
-
-static void _fmath_set_condition_codes (floatx80 val)
-{
-    fpu_get_state_ptr();
-    const uint64_t frac = val.low;
-    const uint32_t exp = val.high & 0x7fff;
-    const _Bool sign = val.high >> 15;
-    
-    /* Clear the whole CC register byte */
-    fpu->fpsr.raw &= 0x00ffffff;
-    
-    /* Check for zero */
-    cc_z = ((exp == 0) && (frac == 0));
-    
-    /* Check for negative */
-    cc_n = sign;
-    
-    /* Check for NaN */
-    cc_nan = ((exp == 0x7fff) && ((frac << 1) != 0));
-    
-    /* Check for infinity */
-    cc_i = ((exp == 0x7fff) && ((frac << 1) == 0));
-}
-
-static void _fmath_handle_nans ()
-{
-    fpu_get_state_ptr();
-    
-    const _Bool is_dyadic = _fmath_flags[fpu->fmath_op] & FMATH_TYPE_DYADIC;
-    const _Bool is_signaling = float128_is_signaling_nan(fpu->source) ||
-                                (is_dyadic && float128_is_signaling_nan(fpu->dest));
-    const _Bool is_source_nan = float128_is_nan(fpu->source);
-    const _Bool is_dest_nan = is_dyadic && float128_is_nan(fpu->dest);
-    
-    /*
-     * If the dest is NaN, or both are NaN, let the result be set to dest.
-     * (with signaling disabled)
-     */
-    if (is_dest_nan)
-        fpu->result = fpu->dest;
-    else {
-        assert(is_source_nan);
-        fpu->result = fpu->source;
-    }
-    
-    /* Set the snan exception status bit */
-    es_snan = is_signaling;
-    
-    /* Silence the result */
-    // Signaling -> 0
-    // Non-signaling -> 1
-    fpu->result.high |= 0x800000000000;
-}
-
-void dis_fmath (uint16_t op, uint16_t ext, char *output)
-{
-    ~decompose(op, 1111 001 000 MMMMMM);
-    ~decompose(ext, 0 a 0 sss ddd eeeeeee);
-    
-    const uint8_t src_in_ea = a;
-    const uint8_t source_specifier = s;
-    const uint8_t dest_register = d;
-    const uint8_t extension = e;
-    
-    /* If this is fmovecr */
-    if (src_in_ea && (source_specifier == 7)) {
-        const char *name = NULL;
-        switch (extension) {
-            case 0x00: name = "pi"; break;
-            case 0x0b: name = "log_10(2)"; break;
-            case 0x0c: name = "c"; break;
-            case 0x0d: name = "log_2(e)"; break;
-            case 0x0e: name = "log_10(e)"; break;
-            case 0x0f: name = "0.0"; break;
-            case 0x30: name = "ln(2)"; break;
-            case 0x31: name = "ln(10)"; break;
-            case 0x32: name = "1.0"; break;
-            case 0x33: name = "10.0"; break;
-            case 0x34: name = "10.0^2"; break;
-            case 0x35: name = "10.0^4"; break;
-            case 0x36: name = "10.0^8"; break;
-            case 0x37: name = "10.0^16"; break;
-            case 0x38: name = "10.0^32"; break;
-            case 0x39: name = "10.0^64"; break;
-            case 0x3a: name = "10.0^128"; break;
-            case 0x3b: name = "10.0^256"; break;
-            case 0x3c: name = "10.0^512"; break;
-            case 0x3d: name = "10.0^1024"; break;
-            case 0x3e: name = "10.0^2048"; break;
-            case 0x3f: name = "10.0^4096"; break;
-        }
-        if (name == NULL)
-            sprintf(output, "fmovecr.x #%u,fp%u", extension, dest_register);
-        else
-            sprintf(output, "fmovecr.x %s,fp%u", name, dest_register);
-        return;
-    }
-    
-    if (_fmath_map[e] == NULL) {
-            /* This instruction isn't defined */
-            sprintf(output, "fmath???");
-    }
-    else if (_fmath_map[e] == inst_fmath_fsincos) {
-        /* fsincos.<fmt> <ea>,FPc:FPs */
-        if (src_in_ea)
-            sprintf(output, "fsincos.%c %s,fp%u:fp%u", "lsxpwdb?"[source_specifier],
-                    decode_ea_rw(M, _format_sizes[source_specifier]), e & 7, dest_register);
-        else
-            sprintf(output, "fsincos.x fp%u,fp%u:fp%u", source_specifier, e & 7, dest_register);
-    }
-    else if (_fmath_map[e] == inst_fmath_ftst) {
-        /* ftst.<fmt> <source> */
-        if (src_in_ea)
-            sprintf(output, "ftst.%c %s", "lsxpwdb?"[source_specifier],
-                    decode_ea_rw(M, _format_sizes[source_specifier]));
-        else
-            sprintf(output, "ftst.x fp%u", dest_register);
-    }
-    else {
-        /* f<inst>.<fmt> <source>,<dest> */
-        if (src_in_ea)
-            sprintf(output, "%s.%c %s,fp%u", _fmath_names[e], "lsxpwdb?"[source_specifier],
-                    decode_ea_rw(M, _format_sizes[source_specifier]), dest_register);
-        else
-            sprintf(output, "%s.x fp%u,fp%u", _fmath_names[e],
-                    source_specifier, dest_register);
-    }
-}
-
-static long double _float128_to_long_double(float128 f128)
-{
-    long double result;
-    uint8_t *ptr = (uint8_t*)&result;
-    
-    int8_t old = float_exception_flags;
-    floatx80 f80 = float128_to_floatx80(f128);
-    float_exception_flags = old;
-    
-    ptr[9] = (f80.high >> 8) & 0xff;
-    ptr[8] = (f80.high >> 0) & 0xff;
-    ptr[7] = (f80.low >> 56) & 0xff;
-    ptr[6] = (f80.low >> 48) & 0xff;
-    ptr[5] = (f80.low >> 40) & 0xff;
-    ptr[4] = (f80.low >> 32) & 0xff;
-    ptr[3] = (f80.low >> 24) & 0xff;
-    ptr[2] = (f80.low >> 16) & 0xff;
-    ptr[1] = (f80.low >> 8) & 0xff;
-    ptr[0] = (f80.low >> 0) & 0xff;
-    
-    return result;
-}
-
-static long double _floatx80_to_long_double(floatx80 f80)
-{
-    long double result;
-    uint8_t *ptr = (uint8_t*)&result;
-    
-    ptr[9] = (f80.high >> 8) & 0xff;
-    ptr[8] = (f80.high >> 0) & 0xff;
-    ptr[7] = (f80.low >> 56) & 0xff;
-    ptr[6] = (f80.low >> 48) & 0xff;
-    ptr[5] = (f80.low >> 40) & 0xff;
-    ptr[4] = (f80.low >> 32) & 0xff;
-    ptr[3] = (f80.low >> 24) & 0xff;
-    ptr[2] = (f80.low >> 16) & 0xff;
-    ptr[1] = (f80.low >> 8) & 0xff;
-    ptr[0] = (f80.low >> 0) & 0xff;
-    
-    return result;
-}
-
-static void inst_fmath (const uint16_t ext)
-{
-    fpu_get_state_ptr();
-    
-    floatx80 rounded_result;
-    
-    ~decompose(shoe.op, 1111 001 000 MMMMMM);
-    ~decompose(ext, 0 a 0 sss ddd eeeeeee);
-    
-    const uint8_t src_in_ea = a;
-    const uint8_t source_specifier = s;
-    const uint8_t dest_register = d;
-    const uint8_t extension = e;
-    
-    slog("FPU:---\n");
-    
-    /* Throw illegal instruction for 040-only ops */
-    if (_fmath_flags[e] & FMATH_TYPE_68040) {
-        _throw_illegal_instruction();
-        return;
-    }
-    
-    /*
-     * All the documented fmath ops have an implementation in
-     * _fmath_map[]. If it's NULL, it's not documented; throw
-     * an exception.
-     * This probably matches what the 68040's behavior (I haven't
-     * checked), but the 68881 doesn't do this.
-     * 68881 throws an illegal instruction exception for all
-     * opcodes where the high (6th) bit of e is set.
-     * All other instructions seem to short circuit to the 
-     * nearest documented instruction.
-     * FIXME: consider implementing this behavior.
-     */
-    if (_fmath_map[e] == NULL) {
-        /* Unless this is fmovecr, where the extension doesn't matter */
-        if (!(src_in_ea && (source_specifier == 7))) {
-            _throw_illegal_instruction();
-            return ;
-        }
-    }
-    
-    /* We only need to load the dest reg for dyadic ops */
-    if (_fmath_flags[e] & FMATH_TYPE_DYADIC)
-        fpu->dest = floatx80_to_float128(fpu->fp[dest_register]);
-    
-    /*
-     * We'll shrink the precision and perform rounding
-     * just prior to writing back the result.
-     * Certain instructions override the precision
-     * in fpcr, so keep track of the prefered prec here.
-     */
-    enum rounding_precision_t rounding_prec = mc_prec;
-    
-    /*
-     * For all the intermediate calculations, we
-     * probably want to use nearest-rounding mode.
-     */
-    _set_rounding_mode(mode_nearest);
-    
-    /* Reset softfloat's exception flags */
-    float_exception_flags = 0;
-    
-    /* Reset fpsr's exception flags */
-    es_inex1 = 0; // this is only set for imprecisely-rounded packed inputs (not implemented)
-    es_inex2 = 0; // set if we ever lose precision (during the op or during rounding)
-    es_dz = 0;    // set if we divided by zero
-    es_unfl = 0;  // set if we underflowed (inex2 should be set too, I think)
-    es_ovfl = 0;  // set if we overflowed (inex2 should be set too, I think)
-    es_operr = 0; // set if there was an instruction specific operand error
-    es_snan = 0;  // Set if one of the inputs was a signaling NaN
-    es_bsun = 0;  // never set here
-    
-    fpu->write_back = 1; // let "do-write-back" be the default behavior
-    fpu->fmath_op = e;
-    
-    /* Handle fmovecr */
-    if (src_in_ea && (source_specifier == 7)) { // fmovecr
-        /* 
-         * 68kprm says M should be ~b(000000), but apparently
-         * any value will work for fmovecr
-         */
-        slog("FPU: fmovecr %u,fp%u\n", e, dest_register);
-        inst_fmath_fmovecr();
-        goto computation_done;
-    }
-    
-    /*
-     * Read in the source from the EA or from a register.
-     * In either case, convert the value to a float128,
-     * (that's our version of the 85-bit "intermediate" format)
-     */
-    if (src_in_ea) {
-        if (!_fpu_read_ea(source_specifier, &fpu->source))
-            return ;
-        slog("FPU: %s.%c ", _fmath_names[e], "lsxpwdb?"[source_specifier]);
-    }
-    else {
-        fpu->source = floatx80_to_float128(fpu->fp[source_specifier]);
-        slog("FPU: %s.x ", _fmath_names[e], "lsxpwdb?"[source_specifier]);
-    }
-
-    
-    {
-        long double tmp = _float128_to_long_double(fpu->source);
-        printf("%Lf,fp%u\n", tmp, dest_register);
-    }
-    
-    /*
-     * If the source is NaN, or this is a dyadic (two-operand)
-     * instruction, and the second operand (fpu->dest) is NaN,
-     * then the result is predetermined: NaN
-     */
-    if (float128_is_nan(fpu->source) ||
-             (((_fmath_flags[e] & FMATH_TYPE_DYADIC) &&
-               float128_is_nan(fpu->dest)))) {
-        _fmath_handle_nans();
-        goto computation_done;
-    }
-    
-    /* 
-     * Otherwise, call the extension-specific helper function.
-     * Guarantees: Neither source nor dest are NaN
-     *             SoftFloat's exception flags have been cleared
-     */
-    _fmath_map[e]();
-    
-    /* 
-     * At this point, the "computation"-phase (I forget what the correct
-     * 6888x term is) is over. Now we check exception bits, throw exceptions,
-     * compute condition codes, and round and store the result.
-     */
-computation_done:
-    
-    /* Convert the 128-bit result to the specified precision */
-    /*
-     * FIXME: If fpu->write_back==0, should we still go through rounding?
-     *        The condition codes will still need to be set. Should they
-     *        be set based on the intermediate result or rounded result?
-     */
-    rounded_result = _fmath_round_intermediate_result();
-    
-    
-    /* Update the accrued exception bits */
-    
-    assert(!es_bsun); // no fmath op can throw es_bsun
-    
-    ae_iop |= es_bsun | es_snan | es_operr;
-    ae_ovfl |= es_ovfl;
-    ae_unfl |= (es_unfl & es_inex2); // yes, &
-    ae_dz |= es_dz;
-    ae_inex |= es_inex1 | es_inex2 | es_ovfl;
-    
-    slog("FPU: bsun=%u snan=%u operr=%u ovfl=%u unfl=%u dz=%u inex1=%u inex2=%u\n",
-           es_bsun, es_snan, es_operr, es_ovfl, es_unfl, es_dz, es_inex1, es_inex2);
-    
-    /* Are any exceptions both set and enabled? */
-    if (fpu->fpsr.raw & fpu->fpcr.raw & 0x0000ff00) {
-        /* 
-         * Then we need to throw an exception.
-         * The exception is sent to the vector for
-         * the highest priority exception, and the priority
-         * order is (high->low) bsan, snan, operr, ovfl, unfl, dz, inex2/1
-         * (which is the order of the bits in fpsr/fpcr).
-         * Iterate over the bits in order, and throw the
-         * exception to whichever bit is set first.
-         */
-        uint8_t i, throwable = (fpu->fpsr.raw & fpu->fpcr.raw) >> 8;
-        
-        slog("FPU: throw exception! 0x%08x\n", throwable);
-        
-        assert(throwable);
-        for (i=0; 1; i++) {
-            if (throwable & 0x80)
-                break;
-            throwable <<= 1;
-        }
-        
-        /*
-         * Convert the exception bit position
-         * to the correct vector number, and throw
-         * a (pre-instruction) exception.
-         */
-        throw_fpu_pre_instruction_exception(_exception_bit_to_vector[i]);
-        
-        return ;
-    }
-    
-    /*
-     * Otherwise, no exceptions to throw!
-     * Calculate the condition codes from the result.
-     */
-    _fmath_set_condition_codes(rounded_result);
-    
-    /*
-     * We're definitely running to completion now,
-     * so commit ea-read changes
-     */
-    _fpu_read_ea_commit(source_specifier);
-    
-    /* Write back the result, and we're done! */
-    if (fpu->write_back) {
-        fpu->fp[dest_register] = rounded_result;
-        
-        long double tmp = _floatx80_to_long_double(rounded_result);
-        slog("FPU: result = %Lf\n", tmp);
-    }
-}
-
-#pragma mark Second-hop non-fmath instructions
-
-/*
- * reg->mem fmove (fmath handles all other fmoves
- */
-static void inst_fmove (const uint16_t ext)
-{
-    fpu_get_state_ptr();
-    
-    ~decompose(shoe.op, 1111 001 000 MMMMMM);
-    ~decompose(shoe.op, 1111 001 000 mmmrrr);
-    ~decompose(ext, 011 fff sss KKKKKKK);
-    
-    _fpu_write_ea(M, f, &fpu->fp[s], K);
-}
-
-static void inst_fmovem_control (const uint16_t ext)
-{
-    fpu_get_state_ptr();
-    
-    ~decompose(shoe.op,  1111 001 000 mmmrrr);
-    ~decompose(shoe.op,  1111 001 000 MMMMMM);
-    ~decompose(ext, 10d CSI 0000000000);
-    
-    const uint32_t count = C + S + I;
-    const uint32_t size = count * 4;
-    uint32_t addr, buf[3];
-    uint32_t i;
-    
-    /* I don't know if this is even a valid instruction */
-    if (count == 0)
-        return ;
-    
-    /* data and addr reg modes are valid, but only if count==1 */
-    if ((m == 0 || m == 1) && (count > 1)) {
-        _throw_illegal_instruction();
-        return ;
-    }
-    
-    if (d) { // reg to memory
-        i=0;
-        if (C) buf[i++] = fpu->fpcr.raw;
-        if (S) buf[i++] = fpu->fpsr.raw;
-        if (I) buf[i++] = fpu->fpiar;
-        
-        if (m == 0) {
-            if (count == 1)
-                shoe.d[r] = buf[0];
-            else
-                _throw_illegal_instruction();
-            return ;
-        }
-        else if (m == 1) {
-            if ((count == 1) && I)
-                shoe.a[r] = buf[0];
-            else
-                _throw_illegal_instruction();
-            return ;
-        }
-        else if (m == 3)
-            addr = shoe.a[r];
-        else if (m == 4)
-            addr = shoe.a[r] - size;
-        else {
-            if ((m==7) && (r!=0 || r!=1)) {
-                /* Not allowed for reg->mem */
-                _throw_illegal_instruction();
-                return;
-            }
-            call_ea_addr(M);
-            addr = shoe.dat;
-        }
-        
-        for (i=0; i<count; i++) {
-            lset(addr + (i*4), 4, buf[i]);
-            if (shoe.abort)
-                return ;
-        }
-    }
-    else { // mem to reg
-        if (m == 0) {// data reg
-            if (count == 1)
-                buf[0] = shoe.d[r];
-            else
-                _throw_illegal_instruction();
-            return;
-        }
-        else if (m == 1) {// addr reg
-            if ((count == 1) && I)
-                buf[0] = shoe.a[r];
-            else
-                _throw_illegal_instruction();
-            return;
-        }
-        else {
-            if (m == 3) // post-increment
-                addr = shoe.a[r];
-            else if (m == 4) // pre-decrement
-                addr = shoe.a[r] - size;
-            else if (M == 0x3c) // immediate
-                addr = shoe.pc;
-            else {
-                call_ea_addr(M); // call_ea_addr() should work for all other modes
-                addr = shoe.dat;
-            }
-            
-            for (i=0; i<count; i++) {
-                buf[i] = lget(addr + (i*4), 4);
-                if (shoe.abort)
-                    return ;
-            }
-        }
-        
-        i = 0;
-        
-        if (C) {
-            uint8_t round = fpu->fpcr.b._mc_rnd;
-            fpu->fpcr.raw = buf[i++];
-            uint8_t newround = fpu->fpcr.b._mc_rnd;
-            
-            if (round != newround) {
-                slog("inst_fmovem_control: HEY: round %u -> %u\n", round, newround);
-            }
-        }
-        if (S) fpu->fpsr.raw = buf[i++];
-        if (I) fpu->fpiar = buf[i++];
-        
-        // Commit immediate-EA-mode PC change
-        if (M == 0x3c)
-            shoe.pc += size;
-    }
-    
-    // Commit pre/post-inc/decrement
-    
-    if (m == 3)
-        shoe.a[r] += size;
-    if (m == 4)
-        shoe.a[r] -= size;
-    
-    
-    
-    slog("inst_fmove_control: notice: (EA = %u/%u %08x CSI = %u%u%u)\n", m, r, (uint32_t)shoe.dat, C, S, I);
-    
-    
-}
-
-static void inst_fmovem (const uint16_t ext)
-{
-    fpu_get_state_ptr();
-    
-    ~decompose(shoe.op,  1111 001 000 mmmrrr);
-    ~decompose(shoe.op,  1111 001 000 MMMMMM);
-    ~decompose(ext, 11 d ps 000 LLLLLLLL); // Static register mask
-    ~decompose(ext, 11 0 00 000 0yyy0000); // Register for dynamic mode
-    
-    const uint8_t pre_mask = s ? shoe.d[y] : L; // pre-adjusted mask
-    
-    // Count the number of bits in the mask
-    uint32_t count, maskcpy = pre_mask;
-    for (count=0; maskcpy; maskcpy >>= 1)
-        count += (maskcpy & 1);
-    
-    const uint32_t size = count * 12;
-    
-    // for predecrement mode, the mask is reversed
-    uint8_t mask = 0;
-    if (m == 4) {
-        uint32_t i;
-        for (i=0; i < 8; i++) {
-            const uint8_t bit = (pre_mask << i) & 0x80;
-            mask = (mask >> 1) | bit;
-        }
-    }
-    else
-        mask = pre_mask;
-    
-    uint32_t i, addr;
-    
-    // Find the EA
-    if (m == 3) {
-        addr = shoe.a[r];
-        assert(p); // assert post-increment mask
-    }
-    else if (m == 4) {
-        addr = shoe.a[r] - size;
-        assert(!p); // assert pre-decrement mask
-    }
-    else {
-        call_ea_addr(M);
-        addr = shoe.dat;
-        assert(p); // assert post-increment mask
-    }
-    
-    slog("inst_fmovem: pre=%08x mask=%08x EA=%u/%u addr=0x%08x size=%u %s\n", pre_mask, mask, m, r, addr, size, d?"to mem":"from mem");
-    
-    if (d) {
-        // Write those registers
-        for (i=0; i<8; i++) {
-            if (!(mask & (0x80 >> i)))
-                continue;
-            
-            uint8_t buf[12];
-            _floatx80_to_extended(&fpu->fp[i], buf);
-            
-            // slog("inst_fmovem: writing %Lf from fp%u", fpu->fp[i], i);
-            uint32_t j;
-            for (j=0; j<12; j++) {
-                slog(" %02x", buf[j]);
-                lset(addr, 1, buf[j]);
-                addr++;
-                if (shoe.abort)
-                    return ;
-            }
-            slog("\n");
-        }
-    }
-    else {
-        // Read those registers
-        for (i=0; i<8; i++) {
-            if (!(mask & (0x80 >> i)))
-                continue;
-            
-            uint8_t buf[12];
-            uint32_t j;
-            for (j=0; j<12; j++) {
-                buf[j] = lget(addr, 1);
-                addr++;
-                if (shoe.abort)
-                    return ;
-            }
-            _extended_to_floatx80(buf, &fpu->fp[i]);
-            
-            // slog("inst_fmovem: read %Lf to fp%u\n", shoe.fp[i], i);
-        }
-    }
-    
-    // Commit the write for pre/post-inc/decrement
-    if (m == 3)
-        shoe.a[r] += size;
-    else if (m == 4)
-        shoe.a[r] -= size;
-}
-
-
-#pragma mark First-hop decoder table inst implementations
-/* 
- * The table generated by decoder_gen.c will refer directly
- * to these instructions. inst_fpu_other() will handle all
- * other FPU instructions.
- */
-
-static _Bool fpu_test_cc(uint8_t cc)
-{
-    fpu_get_state_ptr();
-    const _Bool z = cc_z;
-    const _Bool n = cc_n;
-    const _Bool nan = cc_nan;
-    
-    switch (cc & 0x0f) {
-        case 0: // false
-            return 0;
-        case 1: // equal
-            return z;
-        case 2: // greater than
-            return !(nan | z | n);
-        case 3: // greater than or equal
-            return z | !(nan | n);
-        case 4: // less than
-            return n & !(nan | z);
-        case 5: // less than or equal
-            return z | (n & !nan);
-        case 6: // greater or less than
-            return !(nan | z);
-        case 7: // ordered
-            return !nan;
-        case 8: // unordered
-            return nan;
-        case 9: // not (greater or less than)
-            return nan | z;
-        case 10: // not (less than or equal)
-            return nan | !(n | z);
-        case 11: // not (less than)
-            return nan | (z | !n);
-        case 12: // not (greater than or equal)
-            return nan | (n & !z);
-        case 13: // not (greater than)
-            return nan | z | n;
-        case 14: // not equal
-            return !z;
-        case 15: // true
-            return 1;
-    }
-    
-    assert(0);
-    return 0;
-}
-
-void inst_fscc () {
-    fpu_get_state_ptr();
-    
-    // fscc can throw an exception
-    fpu->fpiar = shoe.orig_pc;
-    
-    const uint16_t ext = nextword();
-    
-    ~decompose(shoe.op, 1111 001 001 MMMMMM);
-    ~decompose(ext, 0000 0000 000 b cccc);
-    
-    /*
-     * inst_f*cc instructions throw a pre-instruction exception
-     * if b && cc_nan
-     */
-    if (b && _bsun_test())
-        return ;
-    
-    shoe.dat = fpu_test_cc(c) ? 0xff : 0;
-    
-    call_ea_write(M, 1);
-}
-
-void inst_fbcc () {
-    fpu_get_state_ptr();
-    
-    // fbcc can throw an exception
-    fpu->fpiar = shoe.orig_pc;
-    
-    ~decompose(shoe.op, 1111 001 01 s 0bcccc); // b => raise BSUN if NaN
-    const uint8_t sz = 2 << s;
-    
-    /*
-     * inst_f*cc instructions throw a pre-instruction exception 
-     * if b && cc_nan
-     */
-    if (b && _bsun_test())
-        return ;
-    
-    if (fpu_test_cc(c)) {
-        const uint16_t ext = nextword();
-        uint32_t displacement;
-    
-        if (s) {
-            const uint16_t ext2 = nextword();
-            displacement = (ext << 16) | ext2;
-        }
-        else
-            displacement = (int16_t)ext;
-        
-        shoe.pc = shoe.orig_pc + 2 + displacement;
-    }
-    else
-        shoe.pc += sz;
-}
-
-void inst_fsave () {
-    fpu_get_state_ptr();
-    verify_supervisor();
-    
-    // Don't modify fpiar for fsave
-    
-    ~decompose(shoe.op, 1111 001 100 MMMMMM);
-    ~decompose(shoe.op, 1111 001 100 mmmrrr);
-    
-    const uint32_t size = 0x1c; // IDLE frame
-    const uint16_t frame_header = 0xfd18;
-    uint32_t addr;
-    
-    if (m == 4)
-        addr = shoe.a[r] - size;
-    else {
-        call_ea_addr(M);
-        addr = shoe.dat;
-    }
-    
-    lset(addr, 2, frame_header);
-    if (shoe.abort)
-        return ;
-    
-    if (m == 4)
-        shoe.a[r] = addr;
-    
-}
-
-void inst_frestore () {
-    fpu_get_state_ptr();
-    verify_supervisor();
-    
-    // Don't modify fpiar for frestore
-    
-    ~decompose(shoe.op, 1111 001 101 MMMMMM);
-    ~decompose(shoe.op, 1111 001 101 mmmrrr);
-    
-    uint32_t addr, size;
-    
-    if (m == 3)
-        addr = shoe.a[r];
-    else {
-        call_ea_addr(M);
-        addr = shoe.dat;
-    }
-    
-    const uint16_t word = lget(addr, 2);
-    if (shoe.abort) return ;
-    
-    // XXX: These frame sizes are different on 68881/68882/68040
-    if ((word & 0xff00) == 0x0000)
-        size = 4; // NULL state frame
-    else if ((word & 0xff) == 0x0018)
-        size = 0x1c; // IDLE state frame
-    else if ((word & 0xff) == 0x00b4)
-        size = 0xb8; // BUSY state frame
-    else {
-        slog("Frestore encountered an unknown state frame 0x%04x\n", word);
-        assert(!"inst_frestore: bad state frame");
-        return ;
-    }
-    
-    if (m==3) {
-        shoe.a[r] += size;
-        slog("frestore: changing shoe.a[%u] += %u\n", r, size);
-    }
-}
-
-void inst_fdbcc () {
-    fpu_get_state_ptr();
-    ~decompose(shoe.op, 1111 001 001 001 rrr);
-    
-    // fdbcc can throw an exception
-    fpu->fpiar = shoe.orig_pc;
-    
-    const uint16_t ext = nextword();
-    ~decompose(ext, 0000 0000 000 b cccc);
-    
-    /*
-     * inst_f*cc instructions throw a pre-instruction exception
-     * if b && cc_nan
-     */
-    if (b && _bsun_test())
-        return ;
-    
-    if (fpu_test_cc(c)) {
-        shoe.pc += 2;
-    }
-    else {
-        const int16_t disp = nextword();
-        const uint16_t newd = get_d(r, 2) - 1;
-        set_d(r, newd, 2);
-        if (newd != 0xffff)
-            shoe.pc = shoe.orig_pc + 2 + disp;
-    }
-}
-
-void inst_ftrapcc () {
-    fpu_get_state_ptr();
-    ~decompose(shoe.op, 1111 001 001 111 xyz);
-    
-    // ftrapcc can throw an exception
-    fpu->fpiar = shoe.orig_pc;
-    
-    // (xyz) == (100) -> sz=0
-    // (xyz) == (010) -> sz=2
-    // (xyz) == (011) -> sz=4
-    const uint32_t sz = y << (z+1);
-    const uint32_t next_pc = shoe.orig_pc + 2 + sz;
-    
-    const uint16_t ext = nextword();
-    ~decompose(ext, 0000 0000 000 b cccc);
-    
-    /*
-     * inst_f*cc instructions throw a pre-instruction exception
-     * if b && cc_nan
-     */
-    if (b && _bsun_test())
-        return ;
-    
-    if (fpu_test_cc(c))
-        throw_frame_two(shoe.sr, next_pc, 7, shoe.orig_pc);
-    else
-        shoe.pc = next_pc;
-}
-
-void inst_fnop() {
-    // This is technically fbcc
-    inst_fbcc();
-}
-
-void inst_fpu_other () {
-    fpu_get_state_ptr();
-    ~decompose(shoe.op, 1111 001 000 MMMMMM);
-    
-    const uint16_t ext = nextword();
-    ~decompose(ext, ccc xxx yyy eeeeeee);
-    
-    switch (c) {
-        case 0: // Reg to reg
-            fpu->fpiar = shoe.orig_pc; // fmath() can throw an exception
-            inst_fmath(ext);
-            return;
-            
-        case 1: // unused
-            _throw_illegal_instruction();
-            return;
-            
-        case 2: // Memory->reg & movec
-            fpu->fpiar = shoe.orig_pc; // fmath() can throw an exception
-            inst_fmath(ext);
-            return;
-            
-        case 3: // reg->mem
-            fpu->fpiar = shoe.orig_pc; // fmove() can throw an exception
-            inst_fmove(ext);
-            return;
-            
-        case 4: // mem -> sys ctl registers
-        case 5: // sys ctl registers -> mem
-            // fmovem_control() cannot throw an FPU exception (don't modify fpiar)
-            inst_fmovem_control(ext);
-            return;
-            
-        case 6: // movem to fp registers
-        case 7: // movem to memory
-            // fmovem() cannot throw an FPU exception (don't modify fpiar)
-            inst_fmovem(ext);
-            return;
-    }
-    
-    assert(0); // never get here
-    return;
-}
-
-#pragma mark FPU-state initialization and reset
-
-void fpu_initialize()
-{
-    fpu_state_t *fpu = (fpu_state_t*)p_alloc(shoe.pool, sizeof(fpu_state_t));
-    memset(fpu, sizeof(fpu_state_t), 0);
-    shoe.fpu_state = fpu;
-}
-
-void fpu_reset()
-{
-    p_free(shoe.fpu_state);
-    fpu_initialize();
-}
diff --git a/core/oldfpu.c b/core/oldfpu.c
new file mode 100644
index 0000000..03b4e73
--- /dev/null
+++ b/core/oldfpu.c
@@ -0,0 +1,1539 @@
+/*
+ * Copyright (c) 2013, Peter Rutenbar <pruten@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <stdio.h>
+#include <assert.h>
+#include <fenv.h>
+#include <float.h>
+#include <math.h>
+#include "../core/shoebill.h"
+
+extern struct dis_t dis;
+extern uint16_t dis_op;
+
+#define FPU_JUMP_EMU 0
+#define FPU_JUMP_DIS 1
+typedef void (fpu_func_t)(uint16_t, uint16_t);
+
+typedef struct {
+    fpu_func_t *emu, *dis;
+    const char *name;
+} fpu_inst_t;
+
+~newmacro(create_fpu_jump_table, 0, {
+    my $names = [
+        'unknown', 'fabs', 'facos', 'fadd', 'fasin', 'fatan', 'fatanh', 'fbcc', 'fcmp', 'fcos', 'fcosh',
+        'fdbcc', 'fdiv', 'fetox', 'fetoxm1', 'fgetexp', 'fgetman', 'fint', 'fintrz', 'flog10', 'flog2',
+        'flogn', 'flognp1', 'fmod', 'fmove', 'fmovecr', 'fmovem', 'fmovem_control', 'fmul', 'fneg', 'fnop',
+        'frem', 'frestore', 'fsave', 'fscale', 'fscc', 'fsgldiv', 'fsglmul', 'fsin', 'fsincos', 'fsinh',
+        'fsqrt', 'fsub', 'ftan', 'ftanh', 'ftentox', 'ftrapcc', 'ftst', 'ftwotox'
+    ];
+    my $fpu_enum = "typedef enum {\n";
+    foreach my $n (@$names) {
+        $fpu_enum .= "\tfpu_inst_$n,\n";
+    }
+    $fpu_enum .= "\nfpu_inst_max} fpu_inst_name_t;";
+    
+    my $fpu_table = "fpu_inst_t fpu_inst_table[fpu_inst_max] = {\n";
+    foreach my $n (@$names) {
+        $fpu_table .= "\t{NULL, NULL, \"" . $n . "\"},\n";
+    }
+    $fpu_table = substr($fpu_table, 0, -2);
+    $fpu_table .= "\n};";
+    
+    my $out = "$fpu_enum \n $fpu_table \n";
+    return $out;
+})
+
+~create_fpu_jump_table()
+
+static fpu_inst_name_t fpu_decode_op(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 ttt MMMMMM);
+    
+    if (t) {
+        switch (t) {
+            case 1:
+                if ((M>>3) == 1)
+                    return fpu_inst_fdbcc;
+                else if ((M>>3) == 7)
+                    return fpu_inst_ftrapcc;
+                return fpu_inst_fscc;
+            case 2:
+                if (M==0 && ext == 0)
+                    return fpu_inst_fnop; // same as fbf.w
+                // fall through
+            case 3:
+                return fpu_inst_fbcc;
+            case 4:
+                return fpu_inst_fsave;
+            case 5:
+                return fpu_inst_frestore;
+        }
+        return fpu_inst_unknown;
+    }
+    
+    ~decompose(ext, ccc xxx yyy eeeeeee)
+    
+    switch (c) {
+        case 0: // Reg to reg
+            break;
+        case 1: // unused
+            return fpu_inst_unknown;
+        case 2: // Memory->reg & movec
+            break;
+            
+        case 3: // reg->mem
+            return fpu_inst_fmove;
+            
+        case 4: // mem -> sys ctl registers
+        case 5: // sys ctl registers -> mem
+            return fpu_inst_fmovem_control;
+            
+        case 6: // movem to fp registers
+        case 7: // movem to memory
+            return fpu_inst_fmovem;
+    }
+    
+    // Here c == 0b000 or 010
+    
+    if (M == 0 && ~bmatch(ext, 010 111 xxx xxxxxxx))
+        return fpu_inst_fmovecr;
+    
+    if ((e>>3) == ~b(0110))
+        return fpu_inst_fsincos;
+    
+    switch (e) {
+            
+        case ~b(0000000):
+        case ~b(1000000):
+        case ~b(1000100):
+            return fpu_inst_fmove;
+            
+        case ~b(0000001): return fpu_inst_fint;
+        case ~b(0000010): return fpu_inst_fsinh;
+        case ~b(0000011): return fpu_inst_fintrz;
+        case ~b(0000110): return fpu_inst_flognp1;
+        case ~b(0001000): return fpu_inst_fetoxm1;
+        case ~b(0001001): return fpu_inst_ftanh;
+        case ~b(0001010): return fpu_inst_fatan;
+        case ~b(0001100): return fpu_inst_fasin;
+        case ~b(0001101): return fpu_inst_fatanh;
+        case ~b(0001110): return fpu_inst_fsin;
+        case ~b(0001111): return fpu_inst_ftan;
+        case ~b(0010000): return fpu_inst_fetox;
+        case ~b(0010001): return fpu_inst_ftwotox;
+        case ~b(0010010): return fpu_inst_ftentox;
+        case ~b(0010100): return fpu_inst_flogn;
+        case ~b(0010101): return fpu_inst_flog10;
+        case ~b(0010110): return fpu_inst_flog2;
+        case ~b(0011001): return fpu_inst_fcosh;
+        case ~b(0011100): return fpu_inst_facos;
+        case ~b(0011101): return fpu_inst_fcos;
+        case ~b(0011110): return fpu_inst_fgetexp;
+        case ~b(0011111): return fpu_inst_fgetman;
+        case ~b(0100001): return fpu_inst_fmod;
+        case ~b(0100100): return fpu_inst_fsgldiv;
+        case ~b(0100111): return fpu_inst_fsglmul;
+        case ~b(0100101): return fpu_inst_frem;
+        case ~b(0100110): return fpu_inst_fscale;
+        case ~b(0111000): return fpu_inst_fcmp;
+        case ~b(0111010): return fpu_inst_ftst;
+            
+        case ~b(0011000):
+        case ~b(1011000):
+        case ~b(1011100):
+            return fpu_inst_fabs;
+            
+        case ~b(0100010):
+        case ~b(1100010):
+        case ~b(1100110):
+            return fpu_inst_fadd;
+            
+        case ~b(0100000):
+        case ~b(1100000):
+        case ~b(1100100):
+            return fpu_inst_fdiv;
+            
+            
+        case ~b(0100011):
+        case ~b(1100011):
+        case ~b(1100111):
+            return fpu_inst_fmul;
+            
+        case ~b(0011010):
+        case ~b(1011010):
+        case ~b(1011110):
+            return fpu_inst_fneg;
+            
+        case ~b(0000100):
+        case ~b(1000001):
+        case ~b(1000101):
+            return fpu_inst_fsqrt;
+            
+        case ~b(0101000):
+        case ~b(1101000):
+        case ~b(1101100):
+            return fpu_inst_fsub;
+    }
+    
+    return fpu_inst_unknown;
+    
+}
+
+#define nextword() ({const uint16_t w=lget(shoe.pc,2); if (shoe.abort) {return;}; shoe.pc+=2; w;})
+#define verify_supervisor() {if (!sr_s()) {throw_privilege_violation(); return;}}
+
+void dis_fpu_decode ()
+{
+    ~decompose(dis_op, 1111 001 xxx 000000);
+    
+    fpu_inst_name_t name;
+    uint16_t ext = 0;
+    
+    if (x == 4)
+        name = fpu_inst_fsave;
+    else if (x == 5)
+        name = fpu_inst_frestore;
+    else {
+        ext = dis_next_word();
+        name = fpu_decode_op(dis_op, ext);
+    }
+    
+    if (fpu_inst_table[name].dis) {
+        (*fpu_inst_table[name].dis)(dis_op, ext);
+        return ;
+    }
+    
+    sprintf(dis.str, "%s ???", fpu_inst_table[name].name);
+}
+
+void inst_fpu_decode ()
+{
+    ~decompose(shoe.op, 1111 001 xxx 000000);
+    
+    fpu_inst_name_t name;
+    uint16_t ext = 0;
+    
+    if (x == 4)
+        name = fpu_inst_fsave;
+    else if (x == 5)
+        name = fpu_inst_frestore;
+    else {
+        ext = nextword();
+        name = fpu_decode_op(shoe.op, ext);
+        // "For FPCP instructions that generate FPU exceptions,
+        //  FPIAR is loaded with the address of an instruction before it's executed,
+        //  unless all arithmetic exceptions are disabled."
+        // My take: set fpiar for all instructions except fsave, frestore, and fmovem_control
+        if (name != fpu_inst_fmovem_control)
+            shoe.fpiar = shoe.orig_pc;
+    }
+    
+    if (fpu_inst_table[name].emu) {
+        (*fpu_inst_table[name].emu)(shoe.op, ext);
+        return ;
+    }
+    
+    slog("inst_fpu_decode: unhandled instruction: %s op=0x%04x ext = 0x%04x pc=0x%08x\n", fpu_inst_table[name].name, shoe.op, ext, shoe.orig_pc);
+    assert(!"unknown fpu inst");
+    //dbg_state.running = 0;
+    
+}
+
+
+void dis_fsave(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 100 MMMMMM);
+    ~decompose(op, 1111 001 100 mmmrrr);
+    
+    if (m == 4)
+        sprintf(dis.str, "fsave -(a%u)", r);
+    else
+        sprintf(dis.str, "fsave %s", decode_ea_addr(M));
+}
+
+void inst_fsave(uint16_t op, uint16_t ext)
+{
+    verify_supervisor();
+    
+    ~decompose(op, 1111 001 100 MMMMMM);
+    ~decompose(op, 1111 001 100 mmmrrr);
+    
+    const uint32_t size = 0x1c; // IDLE frame
+    const uint16_t frame_header = 0xfd18;
+    uint32_t addr;
+    
+    if (m == 4)
+        addr = shoe.a[r] - size;
+    else {
+        call_ea_addr(M);
+        addr = shoe.dat;
+    }
+    
+    lset(addr, 2, frame_header);
+    if (shoe.abort)
+        return ;
+    
+    if (m == 4)
+        shoe.a[r] = addr;
+}
+
+void dis_frestore(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 101 MMMMMM);
+    ~decompose(op, 1111 001 101 mmmrrr);
+    
+    if (m == 3)
+        sprintf(dis.str, "frestore (a%u)+", r);
+    else
+        sprintf(dis.str, "frestore %s", decode_ea_addr(M));
+}
+
+void inst_frestore(uint16_t op, uint16_t ext)
+{
+    verify_supervisor();
+    
+    ~decompose(op, 1111 001 101 MMMMMM);
+    ~decompose(op, 1111 001 101 mmmrrr);
+    
+    uint32_t addr, size;
+    
+    if (m == 3)
+        addr = shoe.a[r];
+    else {
+        call_ea_addr(M);
+        addr = shoe.dat;
+    }
+    
+    const uint16_t word = lget(addr, 2);
+    if (shoe.abort) return ;
+    
+    // XXX: These frame sizes are different on 68881/68882/68040
+    if ((word & 0xff00) == 0x0000)
+        size = 4; // NULL state frame
+    else if ((word & 0xff) == 0x0018)
+        size = 0x1c; // IDLE state frame
+    else if ((word & 0xff) == 0x00b4)
+        size = 0xb8; // BUSY state frame
+    else {
+        slog("Frestore encountered an unknown state frame 0x%04x\n", word);
+        assert("inst_frestore: bad state frame");
+        return ;
+    }
+    
+    if (m==3) {
+        shoe.a[r] += size;
+        slog("frestore: changing shoe.a[%u] += %u\n", r, size);
+    }
+}
+
+typedef struct {
+    uint8_t inexact;
+    uint8_t dat[4][12];
+} fmovecr_t;
+
+fmovecr_t fmovecr_pi = {1, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x35, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x34, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x34, 0x40, 0x00, 0x00, 0x00, 0xc9, 0x0f, 0xda, 0xa2, 0x21, 0x68, 0xc2, 0x35, };
+fmovecr_t fmovecr_log10_2 = {1, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x98, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x98, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x98, 0x3f, 0xfd, 0x00, 0x00, 0x9a, 0x20, 0x9a, 0x84, 0xfb, 0xcf, 0xf7, 0x99, };
+fmovecr_t fmovecr_e = {1, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9a, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9a, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9a, 0x40, 0x00, 0x00, 0x00, 0xad, 0xf8, 0x54, 0x58, 0xa2, 0xbb, 0x4a, 0x9b, };
+fmovecr_t fmovecr_log2_e = {1, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbc, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbb, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbb, 0x3f, 0xff, 0x00, 0x00, 0xb8, 0xaa, 0x3b, 0x29, 0x5c, 0x17, 0xf0, 0xbc, };
+fmovecr_t fmovecr_log10_e = {0, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, 0x3f, 0xfd, 0x00, 0x00, 0xde, 0x5b, 0xd8, 0xa9, 0x37, 0x28, 0x71, 0x95, };
+fmovecr_t fmovecr_zero = {0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_ln_2 = {1, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xac, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xab, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xab, 0x3f, 0xfe, 0x00, 0x00, 0xb1, 0x72, 0x17, 0xf7, 0xd1, 0xcf, 0x79, 0xac, };
+fmovecr_t fmovecr_ln_10 = {1, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x17, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x16, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x16, 0x40, 0x00, 0x00, 0x00, 0x93, 0x5d, 0x8d, 0xdd, 0xaa, 0xa8, 0xac, 0x17, };
+fmovecr_t fmovecr_10_0 = {0, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3f, 0xff, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_10_1 = {0, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_10_2 = {0, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_10_4 = {0, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x0c, 0x00, 0x00, 0x9c, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_10_8 = {0, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x19, 0x00, 0x00, 0xbe, 0xbc, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_10_16 = {0, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, 0x40, 0x34, 0x00, 0x00, 0x8e, 0x1b, 0xc9, 0xbf, 0x04, 0x00, 0x00, 0x00, };
+fmovecr_t fmovecr_10_32 = {1, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9e, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9d, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9d, 0x40, 0x69, 0x00, 0x00, 0x9d, 0xc5, 0xad, 0xa8, 0x2b, 0x70, 0xb5, 0x9e, };
+fmovecr_t fmovecr_10_64 = {1, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd5, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd5, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd5, 0x40, 0xd3, 0x00, 0x00, 0xc2, 0x78, 0x1f, 0x49, 0xff, 0xcf, 0xa6, 0xd6, };
+fmovecr_t fmovecr_10_128 = {1, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xe0, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xdf, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xdf, 0x41, 0xa8, 0x00, 0x00, 0x93, 0xba, 0x47, 0xc9, 0x80, 0xe9, 0x8c, 0xe0, };
+fmovecr_t fmovecr_10_256 = {1, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8e, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8d, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8d, 0x43, 0x51, 0x00, 0x00, 0xaa, 0x7e, 0xeb, 0xfb, 0x9d, 0xf9, 0xde, 0x8e, };
+fmovecr_t fmovecr_10_512 = {1, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc7, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc6, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc6, 0x46, 0xa3, 0x00, 0x00, 0xe3, 0x19, 0xa0, 0xae, 0xa6, 0x0e, 0x91, 0xc7, };
+fmovecr_t fmovecr_10_1024 = {1, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x17, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x17, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x17, 0x4d, 0x48, 0x00, 0x00, 0xc9, 0x76, 0x75, 0x86, 0x81, 0x75, 0x0c, 0x18, };
+fmovecr_t fmovecr_10_2048 = {1, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe5, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe5, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe5, 0x5a, 0x92, 0x00, 0x00, 0x9e, 0x8b, 0x3b, 0x5d, 0xc5, 0x3d, 0x5d, 0xe6, };
+fmovecr_t fmovecr_10_4096 = {1, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9b, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9a, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9a, 0x75, 0x25, 0x00, 0x00, 0xc4, 0x60, 0x52, 0x02, 0x8a, 0x20, 0x97, 0x9b, };
+
+const int _fpu_round_map[4] = {FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD};
+#define fpu_set_round() assert(0 == fesetround(_fpu_round_map[shoe.fpcr.b.mc_rnd]))
+#define fpu_reset_round() assert(0 == fesetround(FE_TONEAREST))
+
+static void fpu_set_cc(long double f)
+{
+    // Set condition codes
+    shoe.fpsr.raw &= 0x00ffffff;
+    shoe.fpsr.b.cc_nan = (0 != isnan(f));
+    if (!shoe.fpsr.b.cc_nan) {
+        shoe.fpsr.b.cc_n = (0 != signbit(f));
+        if (isinf(f))
+            shoe.fpsr.b.cc_i = 1;
+        else
+            shoe.fpsr.b.cc_z = (f == 0.0);
+    }
+}
+
+static long double fpu_set_reg(long double f, uint8_t r)
+{
+    // Round the number according to the mode control byte
+    {
+        fpu_set_round();
+        
+        if (shoe.fpcr.b.mc_prec == 1) {
+            const float tmp = (float)f;
+            f = tmp;
+        } else if (shoe.fpcr.b.mc_prec == 2) {
+            const double tmp = (double)f;
+            f = tmp;
+        }
+        
+        fpu_reset_round();
+    }
+    
+    // Store it
+    shoe.fp[r] = f;
+    return f;
+}
+
+// fpu_set_reg_cc() and fpu_set_ea() set the condition codes. (what else should they set?)
+static void fpu_set_reg_cc(long double f, uint8_t r)
+{
+    fpu_set_cc(fpu_set_reg(f, r));
+}
+
+static void x87_to_motorola(long double x87, uint8_t motorola[12])
+{
+    uint8_t *x87_ptr = (uint8_t*)&x87;
+    motorola[0] = x87_ptr[9];
+    motorola[1] = x87_ptr[8];
+    motorola[2] = 0;
+    motorola[3] = 0;
+    motorola[4] = x87_ptr[7];
+    motorola[5] = x87_ptr[6];
+    motorola[6] = x87_ptr[5];
+    motorola[7] = x87_ptr[4];
+    motorola[8] = x87_ptr[3];
+    motorola[9] = x87_ptr[2];
+    motorola[10] = x87_ptr[1];
+    motorola[11] = x87_ptr[0];
+}
+
+static long double motorola_to_x87(const uint8_t motorola[12])
+{
+    uint8_t x87[12];
+    
+    x87[11] = 0;
+    x87[10] = 0;
+    x87[9] = motorola[0];
+    x87[8] = motorola[1];
+    
+    x87[7] = motorola[4];
+    x87[6] = motorola[5];
+    x87[5] = motorola[6];
+    x87[4] = motorola[7];
+    x87[3] = motorola[8];
+    x87[2] = motorola[9];
+    x87[1] = motorola[10];
+    x87[0] = motorola[11];
+    return *(long double*)&x87[0];
+}
+
+void inst_fmovecr(uint16_t op, uint16_t ext)
+{
+    ~decompose(ext, 010111 rrr xxxxxxx);
+    
+    fmovecr_t *c = &fmovecr_zero;
+    
+    switch (x) {
+        case 0x00: c = &fmovecr_pi; break;
+        case 0x0b: c = &fmovecr_log10_2; break;
+        case 0x0c: c = &fmovecr_e; break;
+        case 0x0d: c = &fmovecr_log2_e; break;
+        case 0x0e: c = &fmovecr_log10_e; break;
+        case 0x0f: c = &fmovecr_zero; break;
+        case 0x30: c = &fmovecr_ln_2; break;
+        case 0x31: c = &fmovecr_ln_10; break;
+        case 0x32: c = &fmovecr_10_0; break;
+        case 0x33: c = &fmovecr_10_1; break;
+        case 0x34: c = &fmovecr_10_2; break;
+        case 0x35: c = &fmovecr_10_4; break;
+        case 0x36: c = &fmovecr_10_8; break;
+        case 0x37: c = &fmovecr_10_16; break;
+        case 0x38: c = &fmovecr_10_32; break;
+        case 0x39: c = &fmovecr_10_64; break;
+        case 0x3a: c = &fmovecr_10_128; break;
+        case 0x3b: c = &fmovecr_10_256; break;
+        case 0x3c: c = &fmovecr_10_512; break;
+        case 0x3d: c = &fmovecr_10_1024; break;
+        case 0x3e: c = &fmovecr_10_2048; break;
+        case 0x3f: c = &fmovecr_10_4096; break;
+    }
+    
+    // The constants in the 68881's ROM must be in the "intermediate" format, because they're rounded differently based on fpcr.rnd
+    const long double f = motorola_to_x87(c->dat[shoe.fpcr.b.mc_rnd]);
+    
+    fpu_set_reg_cc(f, r);
+    
+    slog("inst_fmovecr: set fp%u=%.30Lg\n", r, shoe.fp[r]);
+    
+    // fpu_finalize_exceptions();
+}
+
+void dis_fmovecr(uint16_t op, uint16_t ext)
+{
+    ~decompose(ext, 010111 rrr xxxxxxx);
+    
+    sprintf(dis.str, "fmovecr.x 0x%02x,fp%u", x, r);
+}
+
+void inst_fmovem_control(uint16_t op, uint16_t ext)
+{
+    ~decompose(op,  1111 001 000 mmmrrr);
+    ~decompose(op,  1111 001 000 MMMMMM);
+    ~decompose(ext, 10d CSI 0000000000);
+    
+    const uint32_t count = C + S + I;
+    const uint32_t size = count * 4;
+    
+    if (count == 0) // I don't know if this is even a valid instruction
+        return ;
+    
+    if ((m == 0 || m == 1) && (count > 1)) { // data and addr reg modes are valid, but only if count==1
+        throw_illegal_instruction();
+        return ;
+    }
+    
+    uint32_t addr, buf[3];
+    uint32_t i;
+    
+    if (d) { // reg to memory
+        i=0;
+        if (C) buf[i++] = shoe.fpcr.raw;
+        if (S) buf[i++] = shoe.fpsr.raw;
+        if (I) buf[i++] = shoe.fpiar;
+
+        if (m == 0) {
+            shoe.d[r] = buf[0];
+            return ;
+        }
+        else if (m == 1) {
+            shoe.a[r] = buf[0];
+            return ;
+        }
+        else if (m == 3)
+            addr = shoe.a[r];
+        else if (m == 4)
+            addr = shoe.a[r] - size;
+        else {
+            call_ea_addr(M);
+            addr = shoe.dat;
+        }
+        
+        for (i=0; i<count; i++) {
+            lset(addr + (i*4), 4, buf[i]);
+            if (shoe.abort)
+                return ;
+        }
+    }
+    else { // mem to reg
+        if (m == 0) // data reg
+            buf[0] = shoe.d[r];
+        else if (m == 1) // addr reg
+            buf[0] = shoe.a[r];
+        else {
+            if (m == 3) // post-increment
+                addr = shoe.a[r];
+            else if (m == 4) // pre-decrement
+                addr = shoe.a[r] - size;
+            else if (M == 0x3c) // immediate
+                addr = shoe.pc;
+            else {
+                call_ea_addr(M); // call_ea_addr() should work for all other modes
+                addr = shoe.dat;
+            }
+            
+            for (i=0; i<count; i++) {
+                buf[i] = lget(addr + (i*4), 4);
+                if (shoe.abort)
+                    return ;
+            }
+        }
+        
+        i = 0;
+        
+        if (C) {
+            uint8_t round = shoe.fpcr.b.mc_rnd;
+            shoe.fpcr.raw = buf[i++];
+            uint8_t newround = shoe.fpcr.b.mc_rnd;
+            
+            if (round != newround) {
+                slog("inst_fmovem_control: HEY: round %u -> %u\n", round, newround);
+            }
+        }
+        if (S) shoe.fpsr.raw = buf[i++];
+        if (I) shoe.fpiar = buf[i++];
+        
+        // Commit immediate-EA-mode PC change
+        if (M == 0x3c)
+            shoe.pc += size;
+    }
+        
+    // Commit pre/post-inc/decrement
+    
+    if (m == 3)
+        shoe.a[r] += size;
+    if (m == 4)
+        shoe.a[r] -= size;
+    
+    
+    
+    slog("inst_fmove_control: notice: (EA = %u/%u %08x CSI = %u%u%u)\n", m, r, (uint32_t)shoe.dat, C, S, I);
+}
+
+void dis_fmovem_control(uint16_t op, uint16_t ext)
+{
+    ~decompose(op,  1111 001 000 mmmrrr);
+    ~decompose(op,  1111 001 000 MMMMMM);
+    ~decompose(ext, 10d CSI 0000000000);
+    
+    if (d)
+        sprintf(dis.str, "fmovem.l [%u%u%u],%s\n", C, S, I, decode_ea_addr(M)); // <- XXX: decode_ea_addr() is the wrong function to use
+    else
+        sprintf(dis.str, "fmovem.l %s,[%u%u%u]\n", decode_ea_addr(M), C, S, I); // <- XXX: decode_ea_addr() is the wrong function to use
+}
+
+
+static uint8_t fpu_test_cc(uint8_t cc)
+{
+    const uint8_t z = shoe.fpsr.b.cc_z;
+    const uint8_t n = shoe.fpsr.b.cc_n;
+    const uint8_t nan = shoe.fpsr.b.cc_nan;
+    
+    switch (cc & 0x0f) {
+        case 0: // false
+            return 0;
+        case 1: // equal
+            return z;
+        case 2: // greater than
+            return !(nan || z || n);
+        case 3: // greater than or equal
+            return z || !(nan || n);
+        case 4: // less than
+            return n && !(nan || z);
+        case 5: // less than or equal
+            return z || (n && !nan);
+        case 6: // greater or less than
+            return !(nan || z);
+        case 7: // ordered
+            return !nan;
+        case 8: // unordered
+            return nan;
+        case 9: // not (greater or less than)
+            return nan || z;
+        case 10: // not (less than or equal)
+            return nan || !(n || z);
+        case 11: // not (less than)
+            return nan || (z || !n);
+        case 12: // not (greater than or equal)
+            return nan || (n && !z);
+        case 13: // not (greater than)
+            return nan || z || n;
+        case 14: // not equal
+            return !z;
+        case 15: // true
+            return 1;
+    }
+    
+    assert(0);
+    return 0;
+}
+
+void inst_fbcc(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 01 s 0bcccc); // b => raise BSUN if NaN
+    
+    uint32_t displacement;
+    if (s) {
+        const uint16_t ext2 = nextword();
+        displacement = (ext << 16) | ext2;
+    }
+    else {
+        const int16_t tmp = ext;
+        const int32_t tmp2 = tmp;
+        displacement = tmp2;
+    }
+    
+    if (b) {
+        slog("inst_fbcc: fixme: Got a CC that wants to set BSUN, not implemented\n");
+        //assert(0); // FIXME: implement BSUN, or uncomment this
+    }
+    
+    if (fpu_test_cc(c)) {
+        const uint32_t addr = shoe.orig_pc + 2 + displacement;
+        shoe.pc = addr;
+    }
+}
+
+const char *fpu_cc_names[32] = {
+    "f", "eq", "ogt", "oge", "olt", "ole", "ogl", "or",
+    "un", "ueq", "ugt", "uge", "ult", "ule", "ne", "t",
+    "sf", "seq", "gt", "ge", "lt", "le", "gl", "gle",
+    "ngle", "ngl", "nle", "nlt", "nge", "ngt", "sne", "st"
+};
+
+void dis_fbcc(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 01 s 0ccccc); // only the low 5 bits of cc are significant
+    
+    uint32_t displacement;
+    if (s) {
+        const uint16_t ext2 = dis_next_word();
+        displacement = (ext << 16) | ext2;
+    }
+    else {
+        const int16_t tmp = ext;
+        const int32_t tmp2 = tmp;
+        displacement = tmp2;
+    }
+    
+    const uint32_t addr = dis.orig_pc + 2 + displacement;
+    
+    sprintf(dis.str, "fb%s.%c *0x%08x", fpu_cc_names[c], "wl"[s], addr);
+}
+
+static void reverse_order(uint8_t *buf, const uint32_t size)
+{
+    uint32_t i;
+    for (i=0; i < (size/2); i++) {
+        const uint8_t tmp = buf[i];
+        buf[i] = buf[size-(1+i)];
+        buf[size-(1+i)] = tmp;
+    }
+}
+
+void inst_fmovem(uint16_t op, uint16_t ext)
+{
+    ~decompose(op,  1111 001 000 mmmrrr);
+    ~decompose(op,  1111 001 000 MMMMMM);
+    ~decompose(ext, 11 d ps 000 LLLLLLLL); // Static register mask
+    ~decompose(ext, 11 0 00 000 0yyy0000); // Register for dynamic mode
+    
+    const uint8_t pre_mask = s ? shoe.d[y] : L; // pre-adjusted mask
+    
+    // Count the number of bits in the mask
+    uint32_t count, maskcpy = pre_mask;
+    for (count=0; maskcpy; maskcpy >>= 1)
+        count += (maskcpy & 1);
+    
+    const uint32_t size = count * 12;
+    
+    // for predecrement mode, the mask is reversed
+    uint8_t mask = 0;
+    if (m == 4) {
+        uint32_t i;
+        for (i=0; i < 8; i++) {
+            const uint8_t bit = (pre_mask << i) & 0x80;
+            mask = (mask >> 1) | bit;
+        }
+    }
+    else
+        mask = pre_mask;
+    
+    uint32_t i, addr;
+    
+    // Find the EA
+    if (m == 3) {
+        addr = shoe.a[r];
+        assert(p); // assert post-increment mask
+    }
+    else if (m == 4) {
+        addr = shoe.a[r] - size;
+        assert(!p); // assert pre-decrement mask
+    }
+    else {
+        call_ea_addr(M);
+        addr = shoe.dat;
+        assert(p); // assert post-increment mask
+    }
+    
+    slog("inst_fmovem: pre=%08x mask=%08x EA=%u/%u addr=0x%08x size=%u %s\n", pre_mask, mask, m, r, addr, size, d?"to mem":"from mem");
+    
+    if (d) {
+        // Write those registers
+        for (i=0; i<8; i++) {
+            if (!(mask & (0x80 >> i)))
+                continue;
+            
+            uint8_t buf[12];
+            x87_to_motorola(shoe.fp[i], buf);
+            
+            slog("inst_fmovem: writing %Lf from fp%u", shoe.fp[i], i);
+            uint32_t j;
+            for (j=0; j<12; j++) {
+                slog(" %02x", buf[j]);
+                lset(addr, 1, buf[j]);
+                addr++;
+                if (shoe.abort)
+                    return ;
+            }
+            slog("\n");
+        }
+    }
+    else {
+        // Read those registers
+        for (i=0; i<8; i++) {
+            if (!(mask & (0x80 >> i)))
+                continue;
+            
+            uint8_t buf[12];
+            uint32_t j;
+            for (j=0; j<12; j++) {
+                buf[j] = lget(addr, 1);
+                addr++;
+                if (shoe.abort)
+                    return ;
+            }
+            shoe.fp[i] = motorola_to_x87(buf);
+            
+            slog("inst_fmovem: read %Lf to fp%u\n", shoe.fp[i], i);
+        }
+    }
+    
+    // Commit the write for pre/post-inc/decrement
+    if (m == 3)
+        shoe.a[r] += size;
+    else if (m == 4)
+        shoe.a[r] -= size;
+    
+    //slog("inst_fmovem: notice: not implemented (EA = %u/%u, mask=0x%02x)\n", m, r, mask);
+    
+}
+
+void dis_fmovem(uint16_t op, uint16_t ext)
+{
+    ~decompose(op,  1111 001 000 mmmrrr);
+    ~decompose(op,  1111 001 000 MMMMMM);
+    ~decompose(ext, 11 d ps 000 LLLLLLLL); // Static register mask
+    ~decompose(ext, 11 0 00 000 0yyy0000); // Register for dynamic mode
+    
+    sprintf(dis.str, "fmovem ???");
+}
+
+enum {
+    format_L = 0,
+    format_S = 1,
+    format_X = 2,
+    format_Ps = 3,
+    format_W = 4,
+    format_D = 5,
+    format_B = 6,
+    format_Pd = 7
+} fpu_formats;
+/*
+ * 0 L     long word integer
+ * 1 S     single precision real
+ * 2 X     extended precision real
+ * 3 P{#k} packed decimal real with static k factor
+ * 4 W     word integer
+ * 5 D     double precision real
+ * 6 B     byte integer
+ * 7 P{Dn} packed decimal real with dynamic k factor
+ */
+
+static void fpu_read_ea_commit(uint8_t mr, uint8_t format)
+{
+    const uint8_t m = mr >> 3;
+    const uint8_t r = mr & 7;
+    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+    
+    if (m == 3)
+        shoe.a[r] += sizes[format];
+    else if (m == 4)
+        shoe.a[r] -= sizes[format];
+}
+
+// Note: fpu_read_ea modifies shoe.pc, fpu_read_ea_commit modies shoe.a[r] for pre/post-inc/decrement
+static long double fpu_read_ea(uint8_t mr, uint8_t format)
+{
+    const uint8_t m = mr >> 3;
+    const uint8_t r = mr & 7;
+    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+    
+    long double data, result;
+    uint32_t addr;
+    
+    // If mode==a-reg, or mode==data reg and the size is > 4 bytes, no dice
+    if ((m == 1) ||
+        ((m == 0) && (sizes[format] > 4))) {
+        throw_illegal_instruction();
+        return 0.0;
+    }
+    
+    switch (m) {
+        case 0: {
+            if (format == format_S) {
+                float tmp = shoe.d[r];
+                data = tmp;
+            }
+            else if (format == format_B) {
+                int8_t tmp = shoe.d[r];
+                data = tmp;
+            }
+            else if (format == format_W) {
+                int16_t tmp = shoe.d[r];
+                data = tmp;
+            }
+            else if (format == format_L) {
+                int32_t tmp = shoe.d[r];
+                data = tmp;
+            }
+            
+            goto got_data;
+        }
+            
+        case 3:
+            addr = shoe.a[r];
+            assert(!( r==7 && sizes[format]==1));
+            goto got_address;
+            
+        case 4:
+            addr = shoe.a[r] - sizes[format];
+            assert(!( r==7 && sizes[format]==1));
+            goto got_address;
+            
+        case 7:
+            if (r == 4) {
+                addr = shoe.pc;
+                shoe.pc += sizes[format];
+                goto got_address;
+            }
+            
+            // fall through to default:
+            
+        default: {
+            
+            shoe.mr=mr;
+            ea_addr();
+            if (shoe.abort)
+                return 0.0;
+            
+            addr = (uint32_t)shoe.dat;
+            goto got_address;
+        }
+    }
+    
+got_address:
+    
+    {
+        uint8_t buf[12];
+        uint8_t *ptr = &buf[sizes[format]];
+        uint32_t i;
+        
+        slog("inst_f fpu_read_ea: format=%u, data =", format);
+        for (i=0; i<sizes[format]; i++) {
+            ptr--;
+            *ptr = lget(addr+i, 1);
+            slog(" %02x", *ptr);
+            if (shoe.abort)
+                return 0.0;
+        }
+        
+        switch (format) {
+            case format_B: {
+                int8_t tmp = ptr[0];
+                data = tmp;
+                break;
+            }
+            case format_W: {
+                int16_t tmp = *(int16_t*)ptr;
+                data = tmp;
+                break;
+            }
+            case format_L: {
+                int32_t tmp = *(int32_t*)ptr;
+                data = tmp;
+                break;
+            }
+            case format_S: {
+                float tmp = *(float*)ptr;
+                data = tmp;
+                break;
+            }
+            case format_D: {
+                double tmp = *(double*)ptr;
+                data = tmp;
+                break;
+            }
+            case format_X: {
+                reverse_order(ptr, 12);
+                data = motorola_to_x87(ptr);
+                break;
+            }
+            default: {
+                assert(!"unsupported format (packed something)");
+            }
+        }
+    }
+    
+got_data:
+    
+    fpu_set_round();
+    result = data;
+    fpu_reset_round();
+    slog(" data=%Lf result=%Lf\n", data, result);
+    return result;
+}
+
+
+static void fpu_write_ea(uint8_t mr, uint8_t format, long double orig_data)
+{
+    fpu_set_round();
+    const long double data = orig_data;
+    fpu_reset_round();
+    
+    const uint8_t m = mr >> 3;
+    const uint8_t r = mr & 7;
+    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+    uint8_t buf[12], *ptr = &buf[0];
+    uint32_t addr, i;
+    
+    // If mode==a-reg, or mode==data reg and the size is > 4 bytes, no dice
+    if ((m == 1) ||
+        ((m == 0) && (sizes[format] > 4))) {
+        throw_illegal_instruction();
+        return ;
+    }
+    
+    slog("inst_f fpu_write_ea EA=%u/%u data=%Lf format=%u\n", m, r, data, format);
+    
+    // Convert to the appropriate format
+    
+    switch (format) {
+        case format_B: {
+            int8_t tmp = data;
+            *((int8_t*)ptr) = tmp;
+            goto write_to_mem;
+        }
+        case format_W: {
+            int16_t tmp = data;
+            *((int16_t*)ptr) = tmp;
+            slog("inst_f fpu_write_ea formatted=%u (0x%04x)\n", *((int16_t*)ptr), *((uint16_t*)ptr));
+            break;
+        }
+        case format_L: {
+            int32_t tmp = data;
+            *((int32_t*)ptr) = tmp;
+            break;
+        }
+        case format_S: {
+            float tmp = data;
+            *((float*)ptr) = tmp;
+            break;
+        }
+        case format_D: {
+            double tmp = data;
+            *((double*)ptr) = tmp;
+            break;
+        }
+        case format_X: {
+            x87_to_motorola(data, ptr);
+            goto write_to_mem; // ptr is already big endian
+        }
+        default: {
+            assert(!"unsupported format (packed something)");
+        }
+    }
+
+swap_order:
+    reverse_order(buf, sizes[format]);
+    
+    
+write_to_mem:
+    // Lookup the EA
+
+    switch (m) {
+        case 0: {
+            if (format == format_B) {
+                int8_t tmp = data;
+                set_d(r, tmp, 1);
+            }
+            else if (format == format_W) {
+                int16_t tmp = data;
+                set_d(r, tmp, 2);
+            }
+            else if (format == format_L) {
+                int32_t tmp = data;
+                shoe.d[r] = tmp;
+            }
+            else if (format == format_S) {
+                float tmp = data;
+                *((float*)&shoe.d[r]) = tmp;
+            }
+            
+            goto done;
+        }
+        case 3: // post-increment
+            addr = shoe.a[r];
+            assert(!( r==7 && sizes[format]==1));
+            break;
+        case 4: // pre-decrement
+            addr = shoe.a[r] - sizes[format];
+            assert(!( r==7 && sizes[format]==1));
+            break;
+        default:
+            call_ea_addr(mr);
+            addr = (uint32_t)shoe.dat;
+            break;
+    }
+    
+    // Copy the formatted data into the EA
+    slog("inst_f fpu_write_ea: addr=0x%08x\n", addr);
+    for (i=0; i < sizes[format]; i++) {
+        lset(addr + i, 1, buf[i]);
+        if (shoe.abort)
+            return ;
+    }
+
+done: // set condition codes and update pre/post-inc/decrement registers
+    
+    // Set condition codes
+    shoe.fpsr.raw &= 0x00ffffff;
+    shoe.fpsr.b.cc_nan = (0 != isnan(data));
+    if (!shoe.fpsr.b.cc_nan) {
+        shoe.fpsr.b.cc_n = (0 != signbit(data));
+        if (isinf(data))
+            shoe.fpsr.b.cc_i = 1;
+        else
+            shoe.fpsr.b.cc_z = (data == 0.0);
+    }
+    
+    if (m == 3)
+        shoe.a[r] += sizes[format];
+    else if (m == 4)
+        shoe.a[r] -= sizes[format];
+}
+
+void inst_fmove(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 000 MMMMMM);
+    ~decompose(op, 1111 001 000 mmmrrr);
+    ~decompose(ext, 0 E V aaa zzz KKKKKKK);
+    
+    const uint8_t format = a;
+    
+    if (K == ~b(1000100) || K == ~b(1000000)) {
+        assert(!"inst_fmove: This is either a K-value, or somebody called fmove and specified the secret precision bits");
+    }
+    
+    // E==0 => Don't use EA (reg->reg)
+    // E==1 => Use EA
+    // V==0 => reg->reg or mem->reg
+    // V==1 => reg->mem
+    
+    // Load the source value into 'data'
+    
+    long double data;
+    
+    if (E && !V) { // mem -> reg
+        data = fpu_read_ea(M, format);
+        if (shoe.abort)
+            return ;
+    }
+    else if (!E) { // reg -> mem
+        data = shoe.fp[a];
+    }
+    else { // reg -> reg
+        data = shoe.fp[z];
+    }
+    
+    
+    // XXX: Check for exceptions?
+    
+    // Write the result
+    
+    if (E && V) { // reg -> mem
+        fpu_write_ea(M, format, data);
+        if (shoe.abort)
+            return ;
+    }
+    else if (!V) { // mem -> reg
+        fpu_set_reg_cc(data, z);
+        fpu_read_ea_commit(M, format);
+    }
+    else { // reg -> reg
+        fpu_set_reg_cc(data, z);
+    }
+    
+    const uint8_t sizes[8] = {4, 4, 12, 12, 2, 8, 1, 12};
+    slog("inst_fmove src=%Lf size=%u a=%u z=%u to-mem=%u useEA=%u EA = %u/%u\n", data, sizes[format], a, z, V, E, m, r);
+}
+
+void dis_fnop(uint16_t op, uint16_t ext)
+{
+    sprintf(dis.str, "fnop");
+}
+
+void inst_fnop(uint16_t op, uint16_t ext)
+{
+}
+
+void dis_fmove(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 000 MMMMMM);
+    ~decompose(op, 1111 001 000 mmmrrr);
+    ~decompose(ext, 0 E V aaa bbb KKKKKKK);
+    
+    // E==0 => reg to reg
+    // E==1 => mem to reg / reg to mem
+    // V==0 => reg->reg or mem->reg
+    // V==1 => reg->mem
+    
+    
+    sprintf(dis.str, "fmove ???");
+    
+}
+
+void dis_fmath(uint16_t op, uint16_t ext)
+{
+    sprintf(dis.str, "fmath ??");
+}
+
+static void fpu_set_fpsr_quotient(long double a, long double b, long double result)
+{
+    // Thanks for being super vague on the meaning of this register, 68881 documentation
+    
+    const long double quo = truncl((a - result) / b);
+    const uint8_t sign = signbit(quo);
+    const uint64_t quo_int = fabsl(quo);
+    
+    shoe.fpsr.b.qu_quotient = quo_int & 0x7f;
+    shoe.fpsr.b.qu_s = sign;
+}
+
+void inst_fmath(uint16_t op, uint16_t ext)
+{
+    ~decompose(op, 1111 001 000 MMMMMM);
+    ~decompose(ext, 0 a 0 sss ddd eeeeeee);
+    
+    const uint8_t src_in_ea = a;
+    const uint8_t source_specifier = s;
+    const uint8_t dest_register = d;
+    const uint8_t extension = e;
+    
+    uint8_t do_write_back_result = 1;
+    
+    long double source, dest, result;
+    
+    if (src_in_ea) {
+        source = fpu_read_ea(M, source_specifier);
+        slog("inst_fmath: source = %u/%u = %Lf", M>>3, M&7, source);
+        if ((M>>3) == 3)
+            slog(" a[%u]=0x%08x", M&7, shoe.a[M&7]);
+        
+        if (shoe.abort)
+            return ;
+    }
+    else {
+        source = shoe.fp[source_specifier];
+        slog("inst_fmath: source = fp%u = %Lf", source_specifier, source);
+    }
+    
+    dest = shoe.fp[dest_register];
+    slog("  dest = fp%u = %Lf\n", dest_register, dest);
+    
+    switch (e) {
+        case ~b(0000001): {// fpu_inst_fint
+            const uint8_t dir = shoe.fpcr.b.mc_rnd;
+            
+            // {FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD};
+            
+            if (dir == 0)
+                result = roundl(source);
+            else if (dir == 1)
+                result = truncl(source);
+            else if (dir == 2)
+                result = floorl(source);
+            else
+                result = ceill(source);
+            
+            slog("inst_fint: source = %Lf result = %Lf round=%u\n", source, result, dir);
+            
+            break;
+        }
+        case ~b(0000010): assert(!"fpu_inst_fsinh;");
+        case ~b(0000011): // fpu_inst_fintrz
+            slog("inst_fintrz dest = %Lf source = %Lf\n", dest, source);
+            result = truncl(source);
+            break;
+            
+        case ~b(0000110): // flognp1
+            slog("inst_flognp1 dest = %Lf source = %Lf\n", dest, source);
+            assert(source > -1.0);
+            result = log1pl(source);
+            break;
+        case ~b(0001000): assert(!"fpu_inst_fetoxm1;");
+        case ~b(0001001): assert(!"fpu_inst_ftanh;");
+        case ~b(0001010): // fatan
+            slog("inst_fatan dest = %Lf source = %Lf\n", dest, source);
+            result = atanl(source);
+            break;
+            
+        case ~b(0001100): assert(!"fpu_inst_fasin;");
+        case ~b(0001101): assert(!"fpu_inst_fatanh;");
+        case ~b(0001110): // fsin
+            slog("inst_fsin dest = %Lf source = %Lf\n", dest, source);
+            result = sinl(source);
+            break;
+        case ~b(0001111): assert(!"fpu_inst_ftan;");
+        case ~b(0010000): // fetox
+            slog("inst_fetox dest = %Lf source = %Lf\n", dest, source);
+            result = expl(source);
+            break;
+        case ~b(0010001): assert(!"fpu_inst_ftwotox;");
+        case ~b(0010010): assert(!"fpu_inst_ftentox;");
+        case ~b(0010100): assert(!"fpu_inst_flogn;");
+        case ~b(0010101): assert(!"fpu_inst_flog10;");
+        case ~b(0010110): assert(!"fpu_inst_flog2;");
+        case ~b(0011001): assert(!"fpu_inst_fcosh;");
+        case ~b(0011100): assert(!"fpu_inst_facos;");
+        case ~b(0011101): // fcos
+            slog("fpu_inst_fcos dest = %Lf source = %Lf\n", dest, source);
+            result = cosl(source);
+            break;
+            
+        case ~b(0011110): {// fpu_inst_fgetexp
+            if (!((source > 0) || (source < 0)))
+                result = source; // positive or negative zero
+            else if (!isfinite(source)) {
+                assert(!"fgetexp: isinfl(source)");
+                // returns NAN and an exception bit - not implemented for the moment
+            }
+            else {
+                // Extract the debiased exponent from the 80-bit source
+                uint8_t motorola[12];
+                x87_to_motorola(source, motorola);
+                int32_t exp = (motorola[0] & 0x7f) << 8;
+                exp |= motorola[1];
+                exp -= 16383; // debias
+                result = exp;
+            }
+            break;
+        }
+        case ~b(0011111): assert(!"fpu_inst_fgetman;");
+        case ~b(0100001):
+            // don't forget to set fpu_set_fpsr_quotient();
+            assert(!"fpu_inst_fmod;");
+        
+        case ~b(0100100): assert(!"fpu_inst_fsgldiv");
+            
+        case ~b(0100101): { // fpu_inst_frem
+            assert(source != 0.0);
+            result = remainderl(dest, source);
+            fpu_set_fpsr_quotient(dest, source, result);
+            slog("inst_frem: dest = %Lf source = %Lf quot = %u result = %Lf\n", dest, source, shoe.fpsr.b.qu_quotient, result);
+            break;
+        }
+        case ~b(0100110): assert(!"fpu_inst_fscale;");
+            
+        case ~b(0111000): { // fpu_inst_fcmp
+            const long double diff = dest - source;
+            slog("inst_fcmp: dest = %Lf source = %Lf\n", dest, source);
+            fpu_set_cc(diff);
+            do_write_back_result = 0; // don't write result back to register
+            break;
+        }
+        case ~b(0111010): { // fpu_inst_ftst
+            slog("fpu_inst_ftst: dest = %Lf\n");
+            fpu_set_cc(source);
+            do_write_back_result = 0; // don't write result back to register
+            break;
+        }
+            
+        case ~b(1011100):
+        case ~b(1011000):
+            assert(!"inst_fabs: can't handle");
+        case ~b(0011000):// fpu_inst_fabs
+            result = fabsl(source);
+            slog("inst_fabs: source=%Lf result=%Lf\n", source, result);
+            break;
+            
+        case ~b(1100010):
+        case ~b(1100110):
+            assert(!"can't handle");
+        case ~b(0100010): { // fpu_inst_fadd
+            slog("inst_fadd dest = %Lf source = %Lf\n", dest, source);
+            result = dest + source;
+            break;
+        }
+            
+        case ~b(1100000):
+        case ~b(1100100):
+            assert(!"can't handle");
+        case ~b(0100000): { // fpu_inst_fdiv
+            assert(source != 0.0);
+            slog("inst_fdiv dest = %Lf source = %Lf\n", dest, source);
+            
+            result = dest / source;
+            break;
+        }
+            
+            
+        case ~b(1100011):
+        case ~b(1100111):
+            assert(!"can't handle");
+        case ~b(0100011): { // fpu_inst_fmul
+            slog("inst_fmul dest = %Lf source = %Lf\n", dest, source);
+            result = source * dest;
+            break;
+        }
+            
+        case ~b(1011010):
+        case ~b(1011110):
+            assert(!"fneg: can't handle");
+        case ~b(0011010): // fneg
+            slog("inst_fneg dest = %Lf source = %Lf\n", dest, source);
+            result = -source;
+            break;
+            
+        case ~b(1000001):
+        case ~b(1000101):
+            assert(!"can't handle");
+        case ~b(0000100): { // fpu_inst_fsqrt
+            slog("inst_fsqrt dest = %Lf source = %Lf\n", dest, source);
+            result = sqrtl(source);
+            break;
+        }
+            
+        case ~b(1101000):
+        case ~b(1101100):
+            assert(!"can't handle");
+        case ~b(0101000): { // fpu_inst_fsub
+            slog("inst_fsub dest = %Lf source = %Lf\n", dest, source);
+            result = dest - source;
+            break;
+        }
+            
+        case ~b(0110000) ... ~b(0110111):
+            assert(!"fpu_inst_fsincos;");
+        
+        default:
+            assert(!"inst_fmath: unknown instruction");
+    }
+    
+    // Finalize the read, if source was in memory
+    if (src_in_ea) {
+        fpu_read_ea_commit(M, source_specifier);
+    }
+    
+    // Only write back the result if necessary (fcmp doesn't do this)
+    if (do_write_back_result) {
+        slog("inst_fmath: result = %Lf\n", result);
+        fpu_set_reg_cc(result, dest_register);
+    }
+}
+
+
+
+
+// Setup the jump table for fpu instructions
+// XXX: come up with a better, unified system for decoding instructions
+void fpu_setup_jump_table()
+{
+    uint32_t i;
+    
+    
+    fpu_inst_table[fpu_inst_fnop].emu = inst_fnop;
+    fpu_inst_table[fpu_inst_fnop].dis = dis_fnop;
+    
+    fpu_inst_table[fpu_inst_fbcc].emu = inst_fbcc;
+    fpu_inst_table[fpu_inst_fbcc].dis = dis_fbcc;
+    
+    fpu_inst_table[fpu_inst_fmovecr].emu = inst_fmovecr;
+    fpu_inst_table[fpu_inst_fmovecr].dis = dis_fmovecr;
+    
+    fpu_inst_table[fpu_inst_fmove].emu = inst_fmove;
+    fpu_inst_table[fpu_inst_fmove].dis = dis_fmove;
+    
+    fpu_inst_table[fpu_inst_fmovem].emu = inst_fmovem;
+    fpu_inst_table[fpu_inst_fmovem].dis = dis_fmovem;
+    
+    fpu_inst_table[fpu_inst_fmovem_control].emu = inst_fmovem_control;
+    fpu_inst_table[fpu_inst_fmovem_control].dis = dis_fmovem_control;
+    
+    fpu_inst_table[fpu_inst_frestore].emu = inst_frestore;
+    fpu_inst_table[fpu_inst_frestore].dis = dis_frestore;
+
+    fpu_inst_table[fpu_inst_fsave].emu = inst_fsave;
+    fpu_inst_table[fpu_inst_fsave].dis = dis_fsave;
+    
+    const fpu_inst_name_t _fmath[] = {
+        fpu_inst_fsincos,
+        fpu_inst_fint,
+        fpu_inst_fsinh,
+        fpu_inst_fintrz,
+        fpu_inst_flognp1,
+        fpu_inst_fetoxm1,
+        fpu_inst_ftanh,
+        fpu_inst_fatan,
+        fpu_inst_fatanh,
+        fpu_inst_fsin,
+        fpu_inst_ftan,
+        fpu_inst_fetox,
+        fpu_inst_ftwotox,
+        fpu_inst_ftentox,
+        fpu_inst_flogn,
+        fpu_inst_flog10,
+        fpu_inst_flog2,
+        fpu_inst_fcosh,
+        fpu_inst_facos,
+        fpu_inst_fcos,
+        fpu_inst_fgetexp,
+        fpu_inst_fgetman,
+        fpu_inst_fmod,
+        fpu_inst_fsgldiv,
+        fpu_inst_fsglmul,
+        fpu_inst_frem,
+        fpu_inst_fscale,
+        fpu_inst_fcmp,
+        fpu_inst_ftst,
+        fpu_inst_fabs,
+        fpu_inst_fadd,
+        fpu_inst_fdiv,
+        fpu_inst_fmul,
+        fpu_inst_fneg,
+        fpu_inst_fsqrt,
+        fpu_inst_fsub
+    };
+    
+    for (i=0; i < sizeof(_fmath) / sizeof(fpu_inst_name_t); i++) {
+        fpu_inst_table[_fmath[i]].emu = inst_fmath;
+        fpu_inst_table[_fmath[i]].dis = dis_fmath;
+    }
+}
+
+
+