Use the new code generator. The gain is only 10%; the bottlenecks are elsewhere.

Optimize AltiVec vector splat instructions following Agner Fog's guide.
gbeauche 2006-07-17 06:56:38 +00:00
parent ceb43ce19a
commit e07e2196e3
10 changed files with 721 additions and 307 deletions


@@ -29,7 +29,7 @@ int __op_jmp0, __op_jmp1;
#include "basic-dyngen-ops.hpp"
basic_dyngen::basic_dyngen(dyngen_cpu_base cpu, int cache_size)
: parent_cpu(cpu), basic_jit_cache(cache_size)
: parent_cpu(cpu), jit_codegen(cache_size)
{
execute_func = gen_start();
gen_op_execute();


@@ -22,7 +22,7 @@
#define BASIC_DYNGEN_H
#include "cpu/jit/jit-config.hpp"
#include "cpu/jit/jit-cache.hpp"
#include "cpu/jit/jit-codegen.hpp"
// Set jump target address
static inline void dg_set_jmp_target(uint8 *jmp_addr, uint8 *addr)
@@ -57,7 +57,7 @@ typedef basic_cpu *dyngen_cpu_base;
#endif
class basic_dyngen
: public basic_jit_cache
: public jit_codegen
{
uint8 *execute_func;
uint8 *gen_code_start;


@@ -54,8 +54,9 @@ protected:
// Initialize user code start
void set_code_start(uint8 *ptr);
// Get & increase current position
// Increase/set/get current position
void inc_code_ptr(int offset) { code_p += offset; }
void set_code_ptr(uint8 *ptr) { code_p = ptr; }
public:
uint8 *code_ptr() const { return code_p; }


@@ -374,6 +374,7 @@ private:
// Dynamic translation engine
friend class powerpc_dyngen_helper;
friend class powerpc_dyngen;
friend class powerpc_jit;
powerpc_jit codegen;
block_info *compile_block(uint32 entry);
#if DYNGEN_DIRECT_BLOCK_CHAINING


@@ -1709,140 +1709,6 @@ void op_mtvscr_V0(void)
#define __sse_clobbers(reglist...)
#endif
// SSE2 instructions
#define DEFINE_OP(NAME, OP, VA, VB) \
void op_sse2_##NAME(void) \
{ \
asm volatile ("movdqa (%1),%%xmm0\n" \
#OP " (%2),%%xmm0\n" \
"movdqa %%xmm0,(%0)\n" \
: : "r" (reg_VD), "r" (reg_##VA), "r" (reg_##VB) \
: __sse_clobbers("xmm0")); \
}
DEFINE_OP(vcmpequb, pcmpeqb, V0, V1);
DEFINE_OP(vcmpequh, pcmpeqw, V0, V1);
DEFINE_OP(vcmpequw, pcmpeqd, V0, V1);
DEFINE_OP(vcmpgtsb, pcmpgtb, V0, V1);
DEFINE_OP(vcmpgtsh, pcmpgtw, V0, V1);
DEFINE_OP(vcmpgtsw, pcmpgtd, V0, V1);
DEFINE_OP(vaddubm, paddb, V0, V1);
DEFINE_OP(vadduhm, paddw, V0, V1);
DEFINE_OP(vadduwm, paddd, V0, V1);
DEFINE_OP(vsububm, psubb, V0, V1);
DEFINE_OP(vsubuhm, psubw, V0, V1);
DEFINE_OP(vsubuwm, psubd, V0, V1);
DEFINE_OP(vand, pand, V0, V1);
DEFINE_OP(vandc, pandn, V1, V0);
DEFINE_OP(vor, por, V0, V1);
DEFINE_OP(vxor, pxor, V0, V1);
DEFINE_OP(vavgub, pavgb, V0, V1);
DEFINE_OP(vavguh, pavgw, V0, V1);
#undef DEFINE_OP
#define DEFINE_OP(SH) \
void op_sse2_vsldoi_##SH(void) \
{ \
asm volatile ("movdqa (%1),%%xmm0\n" \
"movdqa (%2),%%xmm1\n" \
"pshufd %3,%%xmm0,%%xmm0\n" \
"pshufd %3,%%xmm1,%%xmm1\n" \
"pslldq %4,%%xmm0\n" \
"psrldq %5,%%xmm1\n" \
"por %%xmm1,%%xmm0\n" \
"pshufd %3,%%xmm0,%%xmm0\n" \
"movdqa %%xmm0,(%0)\n" \
: : \
"r" (reg_VD), "r" (reg_V0), "r" (reg_V1), \
"i" (0x1b), "i" (SH), "i" (16 - SH) \
: __sse_clobbers("xmm0", "xmm1")); \
}
DEFINE_OP(1);
DEFINE_OP(2);
DEFINE_OP(3);
DEFINE_OP(4);
DEFINE_OP(5);
DEFINE_OP(6);
DEFINE_OP(7);
DEFINE_OP(8);
DEFINE_OP(9);
DEFINE_OP(10);
DEFINE_OP(11);
DEFINE_OP(12);
DEFINE_OP(13);
DEFINE_OP(14);
DEFINE_OP(15);
#undef DEFINE_OP
// SSE instructions
#define DEFINE_OP(NAME, OP, VA, VB) \
void op_sse_##NAME(void) \
{ \
asm volatile ("movaps (%1),%%xmm0\n" \
#OP " (%2),%%xmm0\n" \
"movaps %%xmm0,(%0)\n" \
: : "r" (reg_VD), "r" (reg_##VA), "r" (reg_##VB) \
: __sse_clobbers("xmm0")); \
}
DEFINE_OP(vcmpeqfp, cmpeqps, V0, V1);
DEFINE_OP(vcmpgefp, cmpleps, V1, V0);
DEFINE_OP(vcmpgtfp, cmpltps, V1, V0);
DEFINE_OP(vaddfp, addps, V0, V1);
DEFINE_OP(vsubfp, subps, V0, V1);
DEFINE_OP(vmaxfp, maxps, V0, V1);
DEFINE_OP(vminfp, minps, V0, V1);
DEFINE_OP(vand, andps, V0, V1);
DEFINE_OP(vandc, andnps, V1, V0);
DEFINE_OP(vor, orps, V0, V1);
DEFINE_OP(vxor, xorps, V0, V1);
DEFINE_OP(vminub, pminub, V0, V1);
DEFINE_OP(vmaxub, pmaxub, V0, V1);
DEFINE_OP(vminsh, pminsw, V0, V1);
DEFINE_OP(vmaxsh, pmaxsw, V0, V1);
#undef DEFINE_OP
void op_sse_vmaddfp(void)
{
asm volatile ("movaps (%1),%%xmm0\n"
"mulps (%3),%%xmm0\n"
"addps (%2),%%xmm0\n"
"movaps %%xmm0,(%0)\n"
: : "r" (reg_VD), "r" (reg_V0), "r" (reg_V1), "r" (reg_V2)
: __sse_clobbers("xmm0"));
}
void op_sse_vnmsubfp(void)
{
asm volatile ("movaps (%1),%%xmm0\n"
"xorps %%xmm1,%%xmm1\n"
"mulps (%3),%%xmm0\n"
"subps (%2),%%xmm0\n"
"subps %%xmm0,%%xmm1\n"
"movaps %%xmm1,(%0)\n"
: : "r" (reg_VD), "r" (reg_V0), "r" (reg_V1), "r" (reg_V2)
: __sse_clobbers("xmm0", "xmm1"));
}
#define DEFINE_OP(VD, VS) \
void op_sse_mov_##VD##_##VS(void) \
{ \
asm volatile ("movaps (%1),%%xmm0\n" \
"movaps %%xmm0,(%0)\n" \
: : "r" (reg_##VD), "r" (reg_##VS) \
: __sse_clobbers("xmm0")); \
}
DEFINE_OP(VD, V0);
DEFINE_OP(VD, V1);
DEFINE_OP(VD, V2);
#undef DEFINE_OP
// MMX instructions
void op_emms(void)
{


@@ -307,29 +307,3 @@ void powerpc_dyngen::gen_store_vect_VS_T0(int vS)
gen_load_ad_VD_VR(vS);
gen_op_store_vect_VD_T0();
}
void powerpc_dyngen::gen_sse2_vsldoi_VD_V0_V1(int SH)
{
#if defined(__i386__) || defined(__x86_64__)
switch (SH) {
#define GEN_OP(SH) case SH: gen_op_sse2_vsldoi_##SH(); break
GEN_OP(1);
GEN_OP(2);
GEN_OP(3);
GEN_OP(4);
GEN_OP(5);
GEN_OP(6);
GEN_OP(7);
GEN_OP(8);
GEN_OP(9);
GEN_OP(10);
GEN_OP(11);
GEN_OP(12);
GEN_OP(13);
GEN_OP(14);
GEN_OP(15);
#undef GEN_OP
default: abort();
}
#endif
}


@@ -234,7 +234,6 @@ public:
DEFINE_ALIAS(record_cr6_VD,0);
DEFINE_ALIAS(mfvscr_VD,0);
DEFINE_ALIAS(mtvscr_V0,0);
void gen_sse2_vsldoi_VD_V0_V1(int SH);
#undef DEFINE_ALIAS
#undef DEFINE_ALIAS_0


@@ -19,8 +19,11 @@
*/
#include "sysdeps.h"
#include "cpu/jit/dyngen-exec.h"
#include "cpu/ppc/ppc-jit.hpp"
#include "cpu/ppc/ppc-cpu.hpp"
#include "cpu/ppc/ppc-instructions.hpp"
#include "cpu/ppc/ppc-operands.hpp"
#include "utils/utils-cpuinfo.hpp"
#include "utils/utils-sentinel.hpp"
@@ -40,7 +43,6 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size)
static const jit_info_t jit_not_available = {
-1,
(gen_handler_t)&powerpc_jit::gen_not_available,
0
};
for (int i = 0; i < PPC_I(MAX); i++)
jit_info[i] = &jit_not_available;
@@ -57,28 +59,54 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size)
DEFINE_OP(VANDC, 2, vandc_VD_V0_V1),
DEFINE_OP(VNOR, 2, vnor_VD_V0_V1),
DEFINE_OP(VOR, 2, vor_VD_V0_V1),
DEFINE_OP(VXOR, 2, vxor_VD_V0_V1)
DEFINE_OP(VXOR, 2, vxor_VD_V0_V1),
DEFINE_OP(MFVSCR, 1, mfvscr_VD),
DEFINE_OP(MTVSCR, 1, mtvscr_V0),
#undef DEFINE_OP
#define DEFINE_OP(MNEMO, GEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_generic_##GEN_OP, }
DEFINE_OP(LVX, load),
DEFINE_OP(LVXL, load),
DEFINE_OP(LVEWX, load_word),
DEFINE_OP(STVX, store),
DEFINE_OP(STVXL, store),
DEFINE_OP(STVEWX, store_word),
#undef DEFINE_OP
};
for (int i = 0; i < sizeof(gen_vector) / sizeof(gen_vector[0]); i++)
jit_info[gen_vector[i].mnemo] = &gen_vector[i];
#if defined(__i386__) || defined(__x86_64__)
// x86 optimized handlers
static const jit_info_t x86_vector[] = {
#define DEFINE_OP(MNEMO, GEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_x86_##GEN_OP, }
DEFINE_OP(MTVSCR, mtvscr),
DEFINE_OP(MFVSCR, mfvscr),
DEFINE_OP(LVX, lvx),
DEFINE_OP(LVXL, lvx),
DEFINE_OP(STVX, stvx),
DEFINE_OP(STVXL, stvx)
#undef DEFINE_OP
};
for (int i = 0; i < sizeof(x86_vector) / sizeof(x86_vector[0]); i++)
jit_info[x86_vector[i].mnemo] = &x86_vector[i];
// MMX optimized handlers
static const jit_info_t mmx_vector[] = {
#define DEFINE_OP(MNEMO, GEN_OP, DYNGEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_mmx_##GEN_OP, &powerpc_dyngen::gen_op_mmx_##DYNGEN_OP }
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_mmx_arith_##GEN_OP, &powerpc_dyngen::gen_op_mmx_##DYNGEN_OP }
DEFINE_OP(VADDUBM, 2, vaddubm),
DEFINE_OP(VADDUHM, 2, vadduhm),
DEFINE_OP(VADDUWM, 2, vadduwm),
DEFINE_OP(VAND, 2, vand),
DEFINE_OP(VANDC, 2, vandc),
DEFINE_OP(VCMPEQUB, 2, vcmpequb),
DEFINE_OP(VCMPEQUH, 2, vcmpequh),
DEFINE_OP(VCMPEQUW, 2, vcmpequw),
DEFINE_OP(VCMPGTSB, 2, vcmpgtsb),
DEFINE_OP(VCMPGTSH, 2, vcmpgtsh),
DEFINE_OP(VCMPGTSW, 2, vcmpgtsw),
DEFINE_OP(VCMPEQUB, c, vcmpequb),
DEFINE_OP(VCMPEQUH, c, vcmpequh),
DEFINE_OP(VCMPEQUW, c, vcmpequw),
DEFINE_OP(VCMPGTSB, c, vcmpgtsb),
DEFINE_OP(VCMPGTSH, c, vcmpgtsh),
DEFINE_OP(VCMPGTSW, c, vcmpgtsw),
DEFINE_OP(VOR, 2, vor),
DEFINE_OP(VSUBUBM, 2, vsububm),
DEFINE_OP(VSUBUHM, 2, vsubuhm),
@@ -95,32 +123,38 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size)
static const jit_info_t sse_vector[] = {
// new MMX instructions brought into SSE capable CPUs
#define DEFINE_OP(MNEMO, GEN_OP, DYNGEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_mmx_##GEN_OP, &powerpc_dyngen::gen_op_mmx_##DYNGEN_OP }
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_mmx_arith_##GEN_OP, &powerpc_dyngen::gen_op_mmx_##DYNGEN_OP }
DEFINE_OP(VMAXSH, 2, vmaxsh),
DEFINE_OP(VMAXUB, 2, vmaxub),
DEFINE_OP(VMINSH, 2, vminsh),
DEFINE_OP(VMINUB, 2, vminub),
#undef DEFINE_OP
// full SSE instructions
#define DEFINE_OP(MNEMO, GEN_OP, DYNGEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_generic_##GEN_OP, &powerpc_dyngen::gen_op_sse_##DYNGEN_OP }
DEFINE_OP(VADDFP, 2, vaddfp),
DEFINE_OP(VAND, 2, vand),
DEFINE_OP(VANDC, 2, vandc),
DEFINE_OP(VCMPEQFP, 2, vcmpeqfp),
DEFINE_OP(VCMPGEFP, 2, vcmpgefp),
DEFINE_OP(VCMPGTFP, 2, vcmpgtfp),
DEFINE_OP(VMADDFP, 3, vmaddfp),
DEFINE_OP(VMAXFP, 2, vmaxfp),
DEFINE_OP(VMINFP, 2, vminfp),
DEFINE_OP(VNMSUBFP, 3, vnmsubfp),
DEFINE_OP(VOR, 2, vor),
DEFINE_OP(VSUBFP, 2, vsubfp),
DEFINE_OP(VXOR, 2, vxor),
DEFINE_OP(VMINUB, 2, vminub),
DEFINE_OP(VMAXUB, 2, vmaxub),
DEFINE_OP(VMINSH, 2, vminsh),
DEFINE_OP(VMAXSH, 2, vmaxsh)
#define DEFINE_OP(MNEMO, GEN_OP, TYPE_OP, SSE_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse_arith_##GEN_OP, (X86_INSN_SSE_##TYPE_OP << 8) | X86_SSE_##SSE_OP }
DEFINE_OP(VADDFP, 2, PS,ADD),
DEFINE_OP(VAND, 2, PS,AND),
DEFINE_OP(VANDC, s, PS,ANDN),
DEFINE_OP(VMAXFP, 2, PS,MAX),
DEFINE_OP(VMINFP, 2, PS,MIN),
DEFINE_OP(VOR, 2, PS,OR),
DEFINE_OP(VSUBFP, 2, PS,SUB),
DEFINE_OP(VXOR, 2, PS,XOR),
DEFINE_OP(VMINUB, 2, PI,PMINUB),
DEFINE_OP(VMAXUB, 2, PI,PMAXUB),
DEFINE_OP(VMINSH, 2, PI,PMINSW),
DEFINE_OP(VMAXSH, 2, PI,PMAXSW),
#undef DEFINE_OP
#define DEFINE_OP(MNEMO, COND) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse_arith_c, X86_SSE_CC_##COND }
DEFINE_OP(VCMPEQFP, EQ),
DEFINE_OP(VCMPGEFP, GE),
DEFINE_OP(VCMPGTFP, GT),
#undef DEFINE_OP
#define DEFINE_OP(MNEMO, GEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse_##GEN_OP }
DEFINE_OP(VMADDFP, vmaddfp),
DEFINE_OP(VNMSUBFP, vnmsubfp)
#undef DEFINE_OP
};
@@ -129,29 +163,39 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size)
jit_info[sse_vector[i].mnemo] = &sse_vector[i];
}
// generic altivec handlers
// SSE2 optimized handlers
static const jit_info_t sse2_vector[] = {
#define DEFINE_OP(MNEMO, GEN_OP, DYNGEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_generic_##GEN_OP, &powerpc_dyngen::gen_op_sse2_##DYNGEN_OP }
DEFINE_OP(VADDUBM, 2, vaddubm),
DEFINE_OP(VADDUHM, 2, vadduhm),
DEFINE_OP(VADDUWM, 2, vadduwm),
DEFINE_OP(VSUBUBM, 2, vsububm),
DEFINE_OP(VSUBUHM, 2, vsubuhm),
DEFINE_OP(VSUBUWM, 2, vsubuwm),
DEFINE_OP(VAND, 2, vand),
DEFINE_OP(VANDC, 2, vandc),
DEFINE_OP(VOR, 2, vor),
DEFINE_OP(VXOR, 2, vxor),
DEFINE_OP(VCMPEQUB, 2, vcmpequb),
DEFINE_OP(VCMPEQUH, 2, vcmpequh),
DEFINE_OP(VCMPEQUW, 2, vcmpequw),
DEFINE_OP(VCMPGTSB, 2, vcmpgtsb),
DEFINE_OP(VCMPGTSH, 2, vcmpgtsh),
DEFINE_OP(VCMPGTSW, 2, vcmpgtsw),
#define DEFINE_OP(MNEMO, GEN_OP, TYPE_OP, SSE_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse2_arith_##GEN_OP, (X86_INSN_SSE_##TYPE_OP << 8) | X86_SSE_##SSE_OP }
DEFINE_OP(VADDUBM, 2, PI,PADDB),
DEFINE_OP(VADDUHM, 2, PI,PADDW),
DEFINE_OP(VADDUWM, 2, PI,PADDD),
DEFINE_OP(VSUBUBM, 2, PI,PSUBB),
DEFINE_OP(VSUBUHM, 2, PI,PSUBW),
DEFINE_OP(VSUBUWM, 2, PI,PSUBD),
DEFINE_OP(VAND, 2, PI,PAND),
DEFINE_OP(VANDC, s, PI,PANDN),
DEFINE_OP(VOR, 2, PI,POR),
DEFINE_OP(VXOR, 2, PI,PXOR),
DEFINE_OP(VCMPEQUB, c, PI,PCMPEQB),
DEFINE_OP(VCMPEQUH, c, PI,PCMPEQW),
DEFINE_OP(VCMPEQUW, c, PI,PCMPEQD),
DEFINE_OP(VCMPGTSB, c, PI,PCMPGTB),
DEFINE_OP(VCMPGTSH, c, PI,PCMPGTW),
DEFINE_OP(VCMPGTSW, c, PI,PCMPGTD),
DEFINE_OP(VREFP, 2, PS,RCP),
DEFINE_OP(VRSQRTEFP,2, PS,RSQRT),
#undef DEFINE_OP
#define DEFINE_OP(MNEMO, GEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse2_##GEN_OP, }
DEFINE_OP(VSLDOI, vsldoi),
DEFINE_OP(VSPLTB, vspltb),
DEFINE_OP(VSPLTH, vsplth),
DEFINE_OP(VSPLTW, vspltw),
DEFINE_OP(VSPLTISB, vspltisb),
DEFINE_OP(VSPLTISH, vspltish),
DEFINE_OP(VSPLTISW, vspltisw)
#undef DEFINE_OP
{ PPC_I(VSLDOI),
(gen_handler_t)&powerpc_jit::gen_sse2_vsldoi, 0 }
};
if (cpuinfo_check_sse2()) {
@@ -163,72 +207,564 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size)
}
// Dispatch mid-level code generators
bool powerpc_jit::gen_vector_2(int mnemo, int vD, int vA, int vB, bool Rc)
bool powerpc_jit::gen_vector_1(int mnemo, int vD)
{
return (this->*((bool (powerpc_jit::*)(int, bool, int, int, int))jit_info[mnemo]->handler))(mnemo, Rc, vD, vA, vB);
return (this->*((bool (powerpc_jit::*)(int, int))jit_info[mnemo]->handler))(mnemo, vD);
}
bool powerpc_jit::gen_vector_3(int mnemo, int vD, int vA, int vB, int vC, bool Rc)
bool powerpc_jit::gen_vector_2(int mnemo, int vD, int vA, int vB)
{
return (this->*((bool (powerpc_jit::*)(int, bool, int, int, int, int))jit_info[mnemo]->handler))(mnemo, Rc, vD, vA, vB, vC);
return (this->*((bool (powerpc_jit::*)(int, int, int, int))jit_info[mnemo]->handler))(mnemo, vD, vA, vB);
}
bool powerpc_jit::gen_vector_3(int mnemo, int vD, int vA, int vB, int vC)
{
return (this->*((bool (powerpc_jit::*)(int, int, int, int, int))jit_info[mnemo]->handler))(mnemo, vD, vA, vB, vC);
}
bool powerpc_jit::gen_vector_compare(int mnemo, int vD, int vA, int vB, bool Rc)
{
return (this->*((bool (powerpc_jit::*)(int, int, int, int, bool))jit_info[mnemo]->handler))(mnemo, vD, vA, vB, Rc);
}
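// Illustrative sketch (not part of this commit): the dispatchers above store
// a single generic member-function pointer per mnemonic and cast it back to
// the arity it was registered with before the ->* call. Calling through the
// re-cast pointer is formally undefined behavior, but it is exactly the
// contract the jit_info[] table relies on. A minimal standalone model:
struct demo_jit {
	typedef bool (demo_jit::*gen_handler_t)(int);
	gen_handler_t handler;
	bool gen_demo_2(int mnemo, int a, int b) { return a == b; }
	bool dispatch_2(int mnemo, int a, int b) {
		// cast back to the real signature, then invoke
		return (this->*((bool (demo_jit::*)(int, int, int))handler))(mnemo, a, b);
	}
};
// usage: j.handler = (demo_jit::gen_handler_t)&demo_jit::gen_demo_2;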
bool powerpc_jit::gen_not_available(int mnemo, bool Rc)
bool powerpc_jit::gen_not_available(int mnemo)
{
return false;
}
bool powerpc_jit::gen_vector_generic_2(int mnemo, bool Rc, int vD, int vA, int vB)
bool powerpc_jit::gen_vector_generic_1(int mnemo, int vD)
{
gen_load_ad_VD_VR(vD);
(this->*(jit_info[mnemo]->o.dyngen_handler))();
return true;
}
bool powerpc_jit::gen_vector_generic_2(int mnemo, int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
jit_info[mnemo]->dyngen_handler(this);
if (Rc)
gen_record_cr6_VD();
(this->*(jit_info[mnemo]->o.dyngen_handler))();
return true;
}
bool powerpc_jit::gen_vector_generic_3(int mnemo, bool Rc, int vD, int vA, int vB, int vC)
bool powerpc_jit::gen_vector_generic_3(int mnemo, int vD, int vA, int vB, int vC)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_load_ad_V2_VR(vC);
jit_info[mnemo]->dyngen_handler(this);
(this->*(jit_info[mnemo]->o.dyngen_handler))();
return true;
}
bool powerpc_jit::gen_vector_generic_c(int mnemo, int vD, int vA, int vB, bool Rc)
{
gen_vector_generic_2(mnemo, vD, vA, vB);
if (Rc)
gen_record_cr6_VD();
return true;
}
bool powerpc_jit::gen_vector_mmx_2(int mnemo, bool Rc, int vD, int vA, int vB)
bool powerpc_jit::gen_vector_generic_load(int mnemo, int vD, int rA, int rB)
{
// NOTE: T0/VD are clobbered in the following instructions!
gen_load_T0_GPR(rB);
if (rA != 0) {
gen_load_T1_GPR(rA);
gen_add_32_T0_T1();
}
gen_load_vect_VD_T0(vD);
return true;
}
bool powerpc_jit::gen_vector_generic_store(int mnemo, int vS, int rA, int rB)
{
// NOTE: T0/VS are clobbered in the following instructions!
gen_load_T0_GPR(rB);
if (rA != 0) {
gen_load_T1_GPR(rA);
gen_add_32_T0_T1();
}
gen_store_vect_VS_T0(vS);
return true;
}
bool powerpc_jit::gen_vector_generic_load_word(int mnemo, int vD, int rA, int rB)
{
// NOTE: T0/VD are clobbered in the following instructions!
gen_load_T0_GPR(rB);
if (rA != 0) {
gen_load_T1_GPR(rA);
gen_add_32_T0_T1();
}
gen_load_word_VD_T0(vD);
return true;
}
bool powerpc_jit::gen_vector_generic_store_word(int mnemo, int vS, int rA, int rB)
{
// NOTE: T0/VS are clobbered in the following instructions!
gen_load_T0_GPR(rB);
if (rA != 0) {
gen_load_T1_GPR(rA);
gen_add_32_T0_T1();
}
gen_store_word_VS_T0(vS);
return true;
}
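// Illustrative sketch (not part of this commit; uint32_t stands in for the
// emulator's uint32): the effective address all four generic helpers above
// compute before handing control to the dyngen op. rA == 0 selects a literal
// zero, not GPR0, per the PowerPC ISA.
#include <stdint.h>
static inline uint32_t vector_ea(const uint32_t *gpr, int rA, int rB)
{
	uint32_t ea = gpr[rB];
	if (rA != 0)
		ea += gpr[rA];
	return ea;	// lvx/stvx additionally mask to 16-byte alignment
}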
#define xPPC_FIELD(M) ((uintptr)&((powerpc_cpu *)0)->M)
#define xPPC_GPR(N) xPPC_FIELD(gpr(N))
#define xPPC_VR(N) xPPC_FIELD(vr(N))
#define xPPC_CR xPPC_FIELD(cr())
#define xPPC_VSCR xPPC_FIELD(vscr())
#if defined(__i386__) || defined(__x86_64__)
/*
* X86 optimizations
*/
// mtvscr
bool powerpc_jit::gen_x86_mtvscr(int mnemo, int vD)
{
gen_mov_32(x86_memory_operand(xPPC_VR(vD) + 3*4, REG_CPU_ID), REG_T0_ID);
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_VSCR, REG_CPU_ID));
return true;
}
// mfvscr
bool powerpc_jit::gen_x86_mfvscr(int mnemo, int vB)
{
gen_xor_32(REG_T0_ID, REG_T0_ID);
gen_mov_32(x86_memory_operand(xPPC_VSCR, REG_CPU_ID), REG_T1_ID);
#if SIZEOF_VOID_P == 8
gen_mov_64(REG_T0_ID, x86_memory_operand(xPPC_VR(vB) + 0*4, REG_CPU_ID));
#else
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_VR(vB) + 0*4, REG_CPU_ID));
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_VR(vB) + 1*4, REG_CPU_ID));
#endif
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_VR(vB) + 2*4, REG_CPU_ID));
gen_mov_32(REG_T1_ID, x86_memory_operand(xPPC_VR(vB) + 3*4, REG_CPU_ID));
return true;
}
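// Illustrative sketch (not part of this commit): memory-level model of the
// mfvscr sequence above. Words 0..2 of the destination vector are zeroed and
// VSCR lands in word 3, the least significant word in AltiVec element order.
#include <stdint.h>
static inline void model_mfvscr(uint32_t vd[4], uint32_t vscr)
{
	vd[0] = vd[1] = vd[2] = 0;
	vd[3] = vscr;
}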
// lvx, lvxl
bool powerpc_jit::gen_x86_lvx(int mnemo, int vD, int rA, int rB)
{
gen_mov_32(x86_memory_operand(xPPC_GPR(rB), REG_CPU_ID), REG_T0_ID);
if (rA != 0)
gen_add_32(x86_memory_operand(xPPC_GPR(rA), REG_CPU_ID), REG_T0_ID);
gen_and_32(x86_immediate_operand(-16), REG_T0_ID);
#if SIZEOF_VOID_P == 8
gen_mov_64(x86_memory_operand(0, REG_T0_ID), REG_T1_ID);
gen_mov_64(x86_memory_operand(8, REG_T0_ID), REG_T2_ID);
gen_bswap_64(REG_T1_ID);
gen_bswap_64(REG_T2_ID);
gen_rol_64(x86_immediate_operand(32), REG_T1_ID);
gen_rol_64(x86_immediate_operand(32), REG_T2_ID);
gen_mov_64(REG_T1_ID, x86_memory_operand(xPPC_VR(vD) + 0, REG_CPU_ID));
gen_mov_64(REG_T2_ID, x86_memory_operand(xPPC_VR(vD) + 8, REG_CPU_ID));
#else
gen_mov_32(x86_memory_operand(0*4, REG_T0_ID), REG_T1_ID);
gen_mov_32(x86_memory_operand(1*4, REG_T0_ID), REG_T2_ID);
gen_bswap_32(REG_T1_ID);
gen_bswap_32(REG_T2_ID);
gen_mov_32(REG_T1_ID, x86_memory_operand(xPPC_VR(vD) + 0*4, REG_CPU_ID));
gen_mov_32(REG_T2_ID, x86_memory_operand(xPPC_VR(vD) + 1*4, REG_CPU_ID));
gen_mov_32(x86_memory_operand(2*4, REG_T0_ID), REG_T1_ID);
gen_mov_32(x86_memory_operand(3*4, REG_T0_ID), REG_T2_ID);
gen_bswap_32(REG_T1_ID);
gen_bswap_32(REG_T2_ID);
gen_mov_32(REG_T1_ID, x86_memory_operand(xPPC_VR(vD) + 2*4, REG_CPU_ID));
gen_mov_32(REG_T2_ID, x86_memory_operand(xPPC_VR(vD) + 3*4, REG_CPU_ID));
#endif
return true;
}
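// Illustrative sketch (not part of this commit, assuming GCC's
// __builtin_bswap64): why the 64-bit path above pairs bswap with a 32-bit
// rotate. bswap reverses all eight bytes, which byte-swaps each 32-bit lane
// but also exchanges the two lanes; the rotate puts the lanes back in place.
#include <stdint.h>
static inline uint64_t bswap_two_words(uint64_t x)
{
	x = __builtin_bswap64(x);	// reverse bytes; lanes now swapped
	return (x << 32) | (x >> 32);	// rol $32: restore lane order
}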
// stvx, stvxl
bool powerpc_jit::gen_x86_stvx(int mnemo, int vS, int rA, int rB)
{
// NOTE: primitive scheduling
gen_mov_32(x86_memory_operand(xPPC_GPR(rB), REG_CPU_ID), REG_T0_ID);
#if SIZEOF_VOID_P == 8
gen_mov_64(x86_memory_operand(xPPC_VR(vS) + 0, REG_CPU_ID), REG_T1_ID);
gen_mov_64(x86_memory_operand(xPPC_VR(vS) + 8, REG_CPU_ID), REG_T2_ID);
if (rA != 0)
gen_add_32(x86_memory_operand(xPPC_GPR(rA), REG_CPU_ID), REG_T0_ID);
gen_bswap_64(REG_T1_ID);
gen_bswap_64(REG_T2_ID);
gen_and_32(x86_immediate_operand(-16), REG_T0_ID);
gen_rol_64(x86_immediate_operand(32), REG_T1_ID);
gen_rol_64(x86_immediate_operand(32), REG_T2_ID);
gen_mov_64(REG_T1_ID, x86_memory_operand(0, REG_T0_ID));
gen_mov_64(REG_T2_ID, x86_memory_operand(8, REG_T0_ID));
#else
gen_mov_32(x86_memory_operand(xPPC_VR(vS) + 0*4, REG_CPU_ID), REG_T1_ID);
gen_mov_32(x86_memory_operand(xPPC_VR(vS) + 1*4, REG_CPU_ID), REG_T2_ID);
if (rA != 0)
gen_add_32(x86_memory_operand(xPPC_GPR(rA), REG_CPU_ID), REG_T0_ID);
gen_bswap_32(REG_T1_ID);
gen_bswap_32(REG_T2_ID);
gen_and_32(x86_immediate_operand(-16), REG_T0_ID);
gen_mov_32(REG_T1_ID, x86_memory_operand(0*4, REG_T0_ID));
gen_mov_32(REG_T2_ID, x86_memory_operand(1*4, REG_T0_ID));
gen_mov_32(x86_memory_operand(xPPC_VR(vS) + 2*4, REG_CPU_ID), REG_T1_ID);
gen_mov_32(x86_memory_operand(xPPC_VR(vS) + 3*4, REG_CPU_ID), REG_T2_ID);
gen_bswap_32(REG_T1_ID);
gen_bswap_32(REG_T2_ID);
gen_mov_32(REG_T1_ID, x86_memory_operand(2*4, REG_T0_ID));
gen_mov_32(REG_T2_ID, x86_memory_operand(3*4, REG_T0_ID));
#endif
return true;
}
/*
* MMX optimizations
*/
// Generic MMX arith
bool powerpc_jit::gen_mmx_arith_2(int mnemo, int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
jit_info[mnemo]->dyngen_handler(this);
if (Rc)
gen_record_cr6_VD();
(this->*(jit_info[mnemo]->o.dyngen_handler))();
gen_op_emms();
return true;
#endif
return false;
}
bool powerpc_jit::gen_sse2_vsldoi(int mnemo, bool Rc, int vD, int vA, int vB, int SH)
// MMX comparison
bool powerpc_jit::gen_mmx_arith_c(int mnemo, int vD, int vA, int vB, bool Rc)
{
#if defined(__i386__) || defined(__x86_64__)
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
if (SH == 0)
gen_op_sse_mov_VD_V0();
else {
gen_load_ad_V1_VR(vB);
powerpc_dyngen::gen_sse2_vsldoi_VD_V0_V1(SH);
gen_mmx_arith_2(mnemo, vD, vA, vB);
if (Rc)
gen_record_cr6_VD();
return true;
}
/*
* SSE optimizations
*/
// Record CR6 (vD contains the result of the CMP instruction)
void powerpc_jit::gen_sse_record_cr6(int vD)
{
gen_xor_32(REG_T0_ID, REG_T0_ID); // xor %t0,%t0
gen_xor_32(REG_T1_ID, REG_T1_ID); // xor %t1,%t1
gen_insn(X86_INSN_SSE_PS, X86_SSE_MOVMSK, vD, REG_T2_ID); // movmskps %v0,%t2
gen_cmp_32(x86_immediate_operand(0), REG_T2_ID); // cmp $0,%t2
gen_setcc(X86_CC_Z, REG_T0_ID); // sete %t0
gen_cmp_32(x86_immediate_operand(0xf), REG_T2_ID); // cmp $0xf,%t2
gen_setcc(X86_CC_E, REG_T1_ID); // sete %t1
gen_lea_32(x86_memory_operand(0, REG_T0_ID, REG_T1_ID, 4), REG_T2_ID); // %t2 = %t0 + %t1*4
gen_mov_32(x86_memory_operand(xPPC_CR, REG_CPU_ID), REG_T0_ID); // mov xPPC_CR(%cpu),%t0
gen_shl_32(x86_immediate_operand(5), REG_T2_ID); // %t2 holds new cr6
gen_and_32(x86_immediate_operand(0xffffff0f), REG_T0_ID); // and $0xffffff0f,%t0
gen_or_32(REG_T2_ID, REG_T0_ID); // or %t2,%t0
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_CR, REG_CPU_ID)); // mov %t0,xPPC_CR(%cpu)
}
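// Illustrative sketch (not part of this commit): host-side model of the CR6
// update emitted above. movmskps gathers one sign bit per float lane; CR6
// occupies bits 7..4 of the CR image, with "all elements true" in bit 7 and
// "all elements false" in bit 5, matching the lea/shl arithmetic.
#include <stdint.h>
static inline uint32_t model_record_cr6(uint32_t cr, uint32_t mask4)
{
	const uint32_t all_false = (mask4 == 0x0);	// sete after cmp $0
	const uint32_t all_true  = (mask4 == 0xf);	// sete after cmp $0xf
	const uint32_t cr6 = (all_false + all_true * 4) << 5;	// 0x20, 0x80 or 0
	return (cr & 0xffffff0f) | cr6;
}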
// Generic SSE arith
bool powerpc_jit::gen_sse_arith_2(int mnemo, int vD, int vA, int vB)
{
gen_movaps(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
const uint16 insn = jit_info[mnemo]->o.value;
gen_insn(insn >> 8, insn & 0xff, x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID);
gen_movaps(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
return true;
}
// Generic SSE arith with swapped operands (ANDPS)
bool powerpc_jit::gen_sse_arith_s(int mnemo, int vD, int vA, int vB)
{
return gen_sse_arith_2(mnemo, vD, vB, vA);
}
// SSE comparison (CMPPS)
bool powerpc_jit::gen_sse_arith_c(int mnemo, int vD, int vA, int vB, bool Rc)
{
// NOTE: this uses swapped operands for GT, GE (no change for EQ)
gen_movaps(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID);
gen_cmpps(jit_info[mnemo]->o.value, x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
gen_movaps(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
if (Rc)
gen_sse_record_cr6(REG_V0_ID);
return true;
}
// vmaddfp
bool powerpc_jit::gen_sse_vmaddfp(int mnemo, int vD, int vA, int vB, int vC)
{
gen_movaps(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
gen_mulps(x86_memory_operand(xPPC_VR(vC), REG_CPU_ID), REG_V0_ID);
gen_addps(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID);
gen_movaps(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
return true;
}
// vnmsubfp
bool powerpc_jit::gen_sse_vnmsubfp(int mnemo, int vD, int vA, int vB, int vC)
{
gen_movaps(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
gen_xorps(REG_V1_ID, REG_V1_ID);
gen_mulps(x86_memory_operand(xPPC_VR(vC), REG_CPU_ID), REG_V0_ID);
gen_subps(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID);
gen_subps(REG_V0_ID, REG_V1_ID);
gen_movaps(REG_V1_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
return true;
}
/*
* SSE2 optimizations
*/
// Record CR6 (vD contains the result of the CMP instruction)
void powerpc_jit::gen_sse2_record_cr6(int vD)
{
gen_xor_32(REG_T0_ID, REG_T0_ID); // xor %t0,%t0
gen_xor_32(REG_T1_ID, REG_T1_ID); // xor %t1,%t1
gen_pmovmskb(vD, REG_T2_ID); // pmovmskb %v0,%t2
gen_cmp_32(x86_immediate_operand(0), REG_T2_ID); // cmp $0,%t2
gen_setcc(X86_CC_Z, REG_T0_ID); // sete %t0
gen_cmp_32(x86_immediate_operand(0xffff), REG_T2_ID); // cmp $0xffff,%t2
gen_setcc(X86_CC_E, REG_T1_ID); // sete %t1
gen_lea_32(x86_memory_operand(0, REG_T0_ID, REG_T1_ID, 4), REG_T2_ID); // %t2 = %t0 + %t1*4
gen_mov_32(x86_memory_operand(xPPC_CR, REG_CPU_ID), REG_T0_ID); // mov xPPC_CR(%cpu),%t0
gen_shl_32(x86_immediate_operand(5), REG_T2_ID); // %t2 holds new cr6
gen_and_32(x86_immediate_operand(0xffffff0f), REG_T0_ID); // and $0xffffff0f,%t0
gen_or_32(REG_T2_ID, REG_T0_ID); // or %t2,%t0
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_CR, REG_CPU_ID)); // mov %t0,xPPC_CR(%cpu)
}
// Generic SSE2 arith
bool powerpc_jit::gen_sse2_arith_2(int mnemo, int vD, int vA, int vB)
{
gen_movdqa(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
const uint16 insn = jit_info[mnemo]->o.value;
gen_insn(insn >> 8, insn & 0xff, x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID);
gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
return true;
}
// Generic SSE2 arith with swapped operands (PANDN)
bool powerpc_jit::gen_sse2_arith_s(int mnemo, int vD, int vA, int vB)
{
return gen_sse2_arith_2(mnemo, vD, vB, vA);
}
// SSE2 comparison (PCMPEQ, PCMPGT)
bool powerpc_jit::gen_sse2_arith_c(int mnemo, int vD, int vA, int vB, bool Rc)
{
gen_sse2_arith_2(mnemo, vD, vA, vB);
if (Rc)
gen_sse2_record_cr6(REG_V0_ID);
return true;
}
// vsldoi
bool powerpc_jit::gen_sse2_vsldoi(int mnemo, int vD, int vA, int vB, int SH)
{
// Optimize out vsldoi vX,vX,vB,0
if (SH == 0 && vA == vD)
return true;
gen_movdqa(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
if (SH) {
gen_movdqa(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V1_ID);
gen_pshufd(x86_immediate_operand(0x1b), REG_V0_ID, REG_V0_ID);
gen_pshufd(x86_immediate_operand(0x1b), REG_V1_ID, REG_V1_ID);
gen_pslldq(x86_immediate_operand(SH), REG_V0_ID);
gen_psrldq(x86_immediate_operand(16 - SH), REG_V1_ID);
gen_por(REG_V1_ID, REG_V0_ID);
gen_pshufd(x86_immediate_operand(0x1b), REG_V0_ID, REG_V0_ID);
}
gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
return true;
}
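// Illustrative sketch (not part of this commit): the emitted vsldoi sequence
// for a fixed SH of 4, as SSE2 intrinsics. The dword reversal (pshufd $0x1b)
// maps the big-endian AltiVec byte order onto x86's little-endian register
// layout so the byte shifts move the right way, then reverses back.
#include <emmintrin.h>
static __m128i vsldoi_sh4(__m128i va, __m128i vb)
{
	va = _mm_shuffle_epi32(va, 0x1b);	// reverse dwords
	vb = _mm_shuffle_epi32(vb, 0x1b);
	va = _mm_slli_si128(va, 4);		// pslldq $SH
	vb = _mm_srli_si128(vb, 12);		// psrldq $(16 - SH)
	va = _mm_or_si128(va, vb);		// splice the two halves
	return _mm_shuffle_epi32(va, 0x1b);	// back to host order
}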
/*
* Vector splat instructions
*
* Reference: "Optimizing subroutines in assembly language", Agner Fog, table 13.6
*/
void powerpc_jit::gen_sse2_vsplat(int vD, int rValue)
{
gen_movd_lx(rValue, REG_V0_ID);
gen_pshufd(x86_immediate_operand(0), REG_V0_ID, REG_V0_ID);
gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
}
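// Illustrative sketch (not part of this commit): the splat primitive above as
// intrinsics. movd places the GPR value in lane 0; pshufd $0 broadcasts it.
#include <emmintrin.h>
#include <stdint.h>
static __m128i splat32(uint32_t x)
{
	__m128i v = _mm_cvtsi32_si128((int)x);	// movd
	return _mm_shuffle_epi32(v, 0);		// all four lanes = lane 0
}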
// vspltisb
bool powerpc_jit::gen_sse2_vspltisb(int mnemo, int vD, int SIMM)
{
switch (SIMM) {
case 0:
gen_pxor(REG_V0_ID, REG_V0_ID);
goto commit;
case 1:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(15), REG_V0_ID);
gen_packuswb(REG_V0_ID, REG_V0_ID);
goto commit;
case 2:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(15), REG_V0_ID);
gen_psllw(x86_immediate_operand(1), REG_V0_ID);
gen_packuswb(REG_V0_ID, REG_V0_ID);
goto commit;
case 3:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(14), REG_V0_ID);
gen_packuswb(REG_V0_ID, REG_V0_ID);
goto commit;
case 4:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(15), REG_V0_ID);
gen_psllw(x86_immediate_operand(2), REG_V0_ID);
gen_packuswb(REG_V0_ID, REG_V0_ID);
goto commit;
case -1:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
goto commit;
case -2:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psllw(x86_immediate_operand(1), REG_V0_ID);
gen_packsswb(REG_V0_ID, REG_V0_ID);
goto commit;
{
commit:
gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
break;
}
default:
const uint32 value = ((uint8)SIMM) * 0x01010101;
gen_mov_32(x86_immediate_operand(value), REG_T0_ID);
gen_sse2_vsplat(vD, REG_T0_ID);
break;
}
return true;
#endif
return false;
}
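// Illustrative sketch (not part of this commit): the register-only constant
// generation the cases above encode, per Agner Fog's table. pcmpeq of a
// register with itself yields all ones; shifts and a saturating pack then
// carve out the wanted constant without any memory load.
#include <emmintrin.h>
static __m128i splat8_const_1(void)		// vspltisb v,1
{
	__m128i v = _mm_setzero_si128();
	v = _mm_cmpeq_epi16(v, v);		// all bits set
	v = _mm_srli_epi16(v, 15);		// halfwords = 0x0001
	return _mm_packus_epi16(v, v);		// bytes = 0x01
}
static __m128i splat16_const_m2(void)		// vspltish v,-2
{
	__m128i v = _mm_setzero_si128();
	v = _mm_cmpeq_epi16(v, v);		// halfwords = 0xFFFF
	return _mm_slli_epi16(v, 1);		// halfwords = 0xFFFE == -2
}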
// vspltish
bool powerpc_jit::gen_sse2_vspltish(int mnemo, int vD, int SIMM)
{
switch (SIMM) {
case 0:
gen_pxor(REG_V0_ID, REG_V0_ID);
goto commit;
case 1:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(15), REG_V0_ID);
goto commit;
case 2:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(15), REG_V0_ID);
gen_psllw(x86_immediate_operand(1), REG_V0_ID);
goto commit;
case 3:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(14), REG_V0_ID);
goto commit;
case 4:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(15), REG_V0_ID);
gen_psllw(x86_immediate_operand(2), REG_V0_ID);
goto commit;
case -1:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
goto commit;
case -2:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psllw(x86_immediate_operand(1), REG_V0_ID);
goto commit;
{
commit:
gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
break;
}
default:
const uint32 value = ((uint16)SIMM) * 0x10001;
gen_mov_32(x86_immediate_operand(value), REG_T0_ID);
gen_sse2_vsplat(vD, REG_T0_ID);
break;
}
return true;
}
// vspltisw
bool powerpc_jit::gen_sse2_vspltisw(int mnemo, int vD, int SIMM)
{
switch (SIMM) {
case 0:
gen_pxor(REG_V0_ID, REG_V0_ID);
goto commit;
case 1:
gen_pcmpeqd(REG_V0_ID, REG_V0_ID);
gen_psrld(x86_immediate_operand(31), REG_V0_ID);
goto commit;
case 2:
gen_pcmpeqd(REG_V0_ID, REG_V0_ID);
gen_psrld(x86_immediate_operand(31), REG_V0_ID);
gen_pslld(x86_immediate_operand(1), REG_V0_ID);
goto commit;
case 3:
gen_pcmpeqd(REG_V0_ID, REG_V0_ID);
gen_psrld(x86_immediate_operand(30), REG_V0_ID);
goto commit;
case 4:
gen_pcmpeqd(REG_V0_ID, REG_V0_ID);
gen_psrld(x86_immediate_operand(31), REG_V0_ID);
gen_pslld(x86_immediate_operand(2), REG_V0_ID);
goto commit;
case -1:
gen_pcmpeqd(REG_V0_ID, REG_V0_ID);
goto commit;
case -2:
gen_pcmpeqd(REG_V0_ID, REG_V0_ID);
gen_pslld(x86_immediate_operand(1), REG_V0_ID);
goto commit;
{
commit:
gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
break;
}
default:
const uint32 value = SIMM;
gen_mov_32(x86_immediate_operand(value), REG_T0_ID);
gen_sse2_vsplat(vD, REG_T0_ID);
}
return true;
}
// vspltb
bool powerpc_jit::gen_sse2_vspltb(int mnemo, int vD, int UIMM, int vB)
{
const int N = ev_mixed::byte_element(UIMM & 15);
gen_mov_zx_8_32(x86_memory_operand(xPPC_VR(vB) + N * 1, REG_CPU_ID), REG_T0_ID);
gen_imul_32(x86_immediate_operand(0x01010101), REG_T0_ID, REG_T0_ID);
gen_sse2_vsplat(vD, REG_T0_ID);
return true;
}
// vsplth
bool powerpc_jit::gen_sse2_vsplth(int mnemo, int vD, int UIMM, int vB)
{
const int N = ev_mixed::half_element(UIMM & 7);
gen_mov_zx_16_32(x86_memory_operand(xPPC_VR(vB) + N * 2, REG_CPU_ID), REG_T0_ID);
gen_imul_32(x86_immediate_operand(0x10001), REG_T0_ID, REG_T0_ID);
gen_sse2_vsplat(vD, REG_T0_ID);
return true;
}
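// Illustrative sketch (not part of this commit): vspltb/vsplth above widen
// the selected element to 32 bits by multiplication before the four-lane
// splat; vspltw below can feed its word through unchanged.
#include <stdint.h>
static inline uint32_t replicate_byte(uint32_t b) { return (b & 0xff) * 0x01010101u; }
static inline uint32_t replicate_half(uint32_t h) { return (h & 0xffff) * 0x00010001u; }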
// vspltw
bool powerpc_jit::gen_sse2_vspltw(int mnemo, int vD, int UIMM, int vB)
{
const int N = UIMM & 3;
gen_mov_32(x86_memory_operand(xPPC_VR(vB) + N * 4, REG_CPU_ID), REG_T0_ID);
gen_sse2_vsplat(vD, REG_T0_ID);
return true;
}
#endif


@@ -30,25 +30,68 @@ struct powerpc_jit
// Default constructor
powerpc_jit(dyngen_cpu_base cpu, int cache_size = -1);
bool gen_vector_2(int mnemo, int vD, int vA, int vB, bool Rc = false);
bool gen_vector_3(int mnemo, int vD, int vA, int vB, int vC, bool Rc = false);
bool gen_vector_1(int mnemo, int vD);
bool gen_vector_2(int mnemo, int vD, int vA, int vB);
bool gen_vector_3(int mnemo, int vD, int vA, int vB, int vC);
bool gen_vector_compare(int mnemo, int vD, int vA, int vB, bool Rc);
private:
// Mid-level code generator info
typedef bool (powerpc_jit::*gen_handler_t)(int, bool);
typedef void (powerpc_dyngen::*dyngen_handler_t)(void);
union jit_option_t {
jit_option_t() { }
uintptr value;
jit_option_t(uintptr v) : value(v) { }
dyngen_handler_t dyngen_handler;
jit_option_t(dyngen_handler_t const & h) : dyngen_handler(h) { }
};
struct jit_info_t {
int mnemo;
gen_handler_t handler;
powerpc_dyngen::gen_handler_t dyngen_handler;
jit_option_t o;
};
static const jit_info_t *jit_info[];
private:
bool gen_not_available(int mnemo, bool Rc);
bool gen_vector_generic_2(int mnemo, bool Rc, int vD, int vA, int vB);
bool gen_vector_generic_3(int mnemo, bool Rc, int vD, int vA, int vB, int vC);
bool gen_vector_mmx_2(int mnemo, bool Rc, int vD, int vA, int vB);
bool gen_sse2_vsldoi(int mnemo, bool Rc, int vD, int vA, int vB, int SH);
bool gen_not_available(int mnemo);
bool gen_vector_generic_1(int mnemo, int vD);
bool gen_vector_generic_2(int mnemo, int vD, int vA, int vB);
bool gen_vector_generic_3(int mnemo, int vD, int vA, int vB, int vC);
bool gen_vector_generic_c(int mnemo, int vD, int vA, int vB, bool Rc);
bool gen_vector_generic_load(int mnemo, int vD, int rA, int rB);
bool gen_vector_generic_store(int mnemo, int vS, int rA, int rB);
bool gen_vector_generic_load_word(int mnemo, int vD, int rA, int rB);
bool gen_vector_generic_store_word(int mnemo, int vS, int rA, int rB);
#if defined(__i386__) || defined(__x86_64__)
bool gen_x86_lvx(int mnemo, int vD, int rA, int rB);
bool gen_x86_lvewx(int mnemo, int vD, int rA, int rB);
bool gen_x86_stvx(int mnemo, int vS, int rA, int rB);
bool gen_x86_stvewx(int mnemo, int vS, int rA, int rB);
bool gen_x86_mtvscr(int mnemo, int vD);
bool gen_x86_mfvscr(int mnemo, int vB);
bool gen_mmx_arith_2(int mnemo, int vD, int vA, int vB);
bool gen_mmx_arith_c(int mnemo, int vD, int vA, int vB, bool Rc);
void gen_sse_record_cr6(int vD);
bool gen_sse_arith_2(int mnemo, int vD, int vA, int vB);
bool gen_sse_arith_s(int mnemo, int vD, int vA, int vB);
bool gen_sse_arith_c(int mnemo, int vD, int vA, int vB, bool Rc);
bool gen_sse_vmaddfp(int mnemo, int vD, int vA, int vB, int vC);
bool gen_sse_vnmsubfp(int mnemo, int vD, int vA, int vB, int vC);
void gen_sse2_record_cr6(int vD);
bool gen_sse2_arith_2(int mnemo, int vD, int vA, int vB);
bool gen_sse2_arith_s(int mnemo, int vD, int vA, int vB);
bool gen_sse2_arith_c(int mnemo, int vD, int vA, int vB, bool Rc);
bool gen_sse2_vsldoi(int mnemo, int vD, int vA, int vB, int SH);
void gen_sse2_vsplat(int vD, int rValue);
bool gen_sse2_vspltisb(int mnemo, int vD, int SIMM);
bool gen_sse2_vspltish(int mnemo, int vD, int SIMM);
bool gen_sse2_vspltisw(int mnemo, int vD, int SIMM);
bool gen_sse2_vspltb(int mnemo, int vD, int UIMM, int vB);
bool gen_sse2_vsplth(int mnemo, int vD, int UIMM, int vB);
bool gen_sse2_vspltw(int mnemo, int vD, int UIMM, int vB);
#endif
};
#endif /* PPC_JIT_H */


@@ -1346,67 +1346,16 @@ powerpc_cpu::compile_block(uint32 entry_point)
break;
}
#endif
// NOTE: A0/VD are clobbered in the following instructions!
case PPC_I(LVEWX):
case PPC_I(LVX):
case PPC_I(LVXL):
{
const int rA = rA_field::extract(opcode);
const int rB = rB_field::extract(opcode);
const int vD = vD_field::extract(opcode);
dg.gen_load_T0_GPR(rB);
if (rA != 0) {
dg.gen_load_T1_GPR(rA);
dg.gen_add_32_T0_T1();
}
switch (ii->mnemo) {
case PPC_I(LVEWX): dg.gen_load_word_VD_T0(vD); break;
case PPC_I(LVX): dg.gen_load_vect_VD_T0(vD); break;
case PPC_I(LVXL): dg.gen_load_vect_VD_T0(vD); break;
}
break;
}
case PPC_I(STVEWX):
case PPC_I(STVX):
case PPC_I(STVXL):
{
const int rA = rA_field::extract(opcode);
const int rB = rB_field::extract(opcode);
const int vS = vS_field::extract(opcode);
dg.gen_load_T0_GPR(rB);
if (rA != 0) {
dg.gen_load_T1_GPR(rA);
dg.gen_add_32_T0_T1();
}
switch (ii->mnemo) {
case PPC_I(STVEWX): dg.gen_store_word_VS_T0(vS); break;
case PPC_I(STVX): dg.gen_store_vect_VS_T0(vS); break;
case PPC_I(STVXL): dg.gen_store_vect_VS_T0(vS); break;
}
break;
}
case PPC_I(MFVSCR):
{
dg.gen_load_ad_VD_VR(vD_field::extract(opcode));
dg.gen_mfvscr_VD();
break;
}
case PPC_I(MTVSCR):
{
dg.gen_load_ad_V0_VR(vB_field::extract(opcode));
dg.gen_mtvscr_V0();
break;
}
case PPC_I(VSLDOI):
{
const int vD = vD_field::extract(opcode);
const int vA = vA_field::extract(opcode);
const int vB = vB_field::extract(opcode);
const int SH = vSH_field::extract(opcode);
if (!dg.gen_vector_3(ii->mnemo, vD, vA, vB, SH))
goto do_generic;
break;
}
assert(vD_field::mask() == vS_field::mask());
assert(vA_field::mask() == rA_field::mask());
assert(vB_field::mask() == rB_field::mask());
// fall-through
case PPC_I(VCMPEQFP):
case PPC_I(VCMPEQUB):
case PPC_I(VCMPEQUH):
@@ -1420,7 +1369,7 @@ powerpc_cpu::compile_block(uint32 entry_point)
const int vD = vD_field::extract(opcode);
const int vA = vA_field::extract(opcode);
const int vB = vB_field::extract(opcode);
if (!dg.gen_vector_2(ii->mnemo, vD, vA, vB, vRc_field::test(opcode)))
if (!dg.gen_vector_compare(ii->mnemo, vD, vA, vB, vRc_field::test(opcode)))
goto do_generic;
break;
}
@@ -1443,6 +1392,8 @@ powerpc_cpu::compile_block(uint32 entry_point)
case PPC_I(VSUBUHM):
case PPC_I(VSUBUWM):
case PPC_I(VXOR):
case PPC_I(VREFP):
case PPC_I(VRSQRTEFP):
{
const int vD = vD_field::extract(opcode);
const int vA = vA_field::extract(opcode);
@@ -1462,6 +1413,49 @@ powerpc_cpu::compile_block(uint32 entry_point)
goto do_generic;
break;
}
case PPC_I(VSLDOI):
{
const int vD = vD_field::extract(opcode);
const int vA = vA_field::extract(opcode);
const int vB = vB_field::extract(opcode);
const int SH = vSH_field::extract(opcode);
if (!dg.gen_vector_3(ii->mnemo, vD, vA, vB, SH))
goto do_generic;
break;
}
case PPC_I(MFVSCR):
{
if (!dg.gen_vector_1(ii->mnemo, vD_field::extract(opcode)))
goto do_generic;
break;
}
case PPC_I(MTVSCR):
{
if (!dg.gen_vector_1(ii->mnemo, vB_field::extract(opcode)))
goto do_generic;
break;
}
case PPC_I(VSPLTISB):
case PPC_I(VSPLTISH):
case PPC_I(VSPLTISW):
{
const int vD = vD_field::extract(opcode);
const int SIMM = op_sign_extend_5_32::apply(vUIMM_field::extract(opcode));
if (!dg.gen_vector_2(ii->mnemo, vD, SIMM, 0))
goto do_generic;
break;
}
case PPC_I(VSPLTB):
case PPC_I(VSPLTH):
case PPC_I(VSPLTW):
{
const int vD = vD_field::extract(opcode);
const int UIMM = vUIMM_field::extract(opcode);
const int vB = vB_field::extract(opcode);
if (!dg.gen_vector_2(ii->mnemo, vD, UIMM, vB))
goto do_generic;
break;
}
default: // Direct call to instruction handler
{
typedef void (*func_t)(dyngen_cpu_base, uint32);