From 05a7453d54919624ae1beb0393b9eaa3ac336069 Mon Sep 17 00:00:00 2001 From: gbeauche <> Date: Sun, 13 Mar 2005 12:49:30 +0000 Subject: [PATCH] MMX/SSE/SSE2 optimizations are now converted to full inline assembly code, i.e. they avoid the use of (possibly broken) GCC intrinsics. Add some SSE2 optimizations. Translate VSLDOI, MFVSCR, MTVSCR instructions. AltiVec Fractal Carbon now shows more than 1 GFlops performance! --- SheepShaver/src/Unix/configure.ac | 11 - .../kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp | 365 ++++++++---------- .../src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp | 79 +++- .../src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp | 3 + .../src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp | 31 +- 5 files changed, 263 insertions(+), 226 deletions(-) diff --git a/SheepShaver/src/Unix/configure.ac b/SheepShaver/src/Unix/configure.ac index 7e4688e8..be543dd1 100644 --- a/SheepShaver/src/Unix/configure.ac +++ b/SheepShaver/src/Unix/configure.ac @@ -1063,17 +1063,6 @@ if [[ "x$EMULATED_PPC" = "xyes" ]]; then else DYNGEN_OP_FLAGS="$DYNGEN_OP_FLAGS -malign-functions=0" fi - saved_CPPFLAGS=$CPPFLAGS - CPPFLAGS="$CPPFLAGS -mmmx" - AC_CHECK_HEADERS(mmintrin.h, [DYNGEN_OP_FLAGS="$DYNGEN_OP_FLAGS -mmmx"]) - CPPFLAGS="$CPPFLAGS -msse" - AC_CHECK_HEADERS(xmmintrin.h, [DYNGEN_OP_FLAGS="$DYNGEN_OP_FLAGS -msse"]) - CPPFLAGS="$CPPFLAGS -msse2" - AC_CHECK_HEADERS(emmintrin.h, [DYNGEN_OP_FLAGS="$DYNGEN_OP_FLAGS -msse2"]) - CPPFLAGS=$saved_CPPFLAGS - ;; - x86_64) - AC_CHECK_HEADERS(mmintrin.h xmmintrin.h emmintrin.h) ;; esac DYNGEN_OP_FLAGS="$DYNGEN_OP_FLAGS -finline-limit=10000 -g0" diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp index 2a388e94..57a547e8 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp @@ -86,6 +86,8 @@ struct powerpc_dyngen_helper { static inline void set_xer(uint32 value) { CPU->xer().set(value); } static inline uint32 get_vrsave() { 
return CPU->vrsave(); } static inline void set_vrsave(uint32 value) { CPU->vrsave() = value; } + static inline uint32 get_vscr() { return CPU->vscr().get(); } + static inline void set_vscr(uint32 value) { CPU->vscr().set(value); } static inline void record(int crf, int32 v) { CPU->record_cr(crf, v); } static inline powerpc_cr_register & cr() { return CPU->cr(); } static inline powerpc_xer_register & xer() { return CPU->xer(); } @@ -1563,6 +1565,19 @@ void op_record_cr6_VD(void) dyngen_barrier(); } +void op_mfvscr_VD(void) +{ + VD.w[0] = 0; + VD.w[1] = 0; + VD.w[2] = 0; + VD.w[3] = powerpc_dyngen_helper::get_vscr(); +} + +void op_mtvscr_V0(void) +{ + powerpc_dyngen_helper::set_vscr(V0.w[3]); +} + #undef VNONE #undef V16QI #undef V8HI @@ -1571,235 +1586,183 @@ void op_record_cr6_VD(void) #undef V4SF /** - * SSE optimizations + * X86 SIMD optimizations **/ -#if defined(__SSE__) && defined(HAVE_XMMINTRIN_H) -#include -#undef VD -#define VD *((__m128 *)reg_VD) -#undef V0 -#define V0 *((__m128 *)reg_V0) -#undef V1 -#define V1 *((__m128 *)reg_V1) -#undef V2 -#define V2 *((__m128 *)reg_V2) +#if defined(__i386__) || defined(__x86_64__) +#undef VD +#undef V0 +#undef V1 +#undef V2 -void op_sse_nop(void) -{ - asm volatile ("nop"); +// SSE2 instructions +#define DEFINE_OP(NAME, OP, VA, VB) \ +void op_sse2_##NAME(void) \ +{ \ + asm volatile ("movdqa (%1),%%xmm0\n" \ + #OP " (%2),%%xmm0\n" \ + "movaps %%xmm0,(%0)\n" \ + : : "r" (reg_VD), "r" (reg_##VA), "r" (reg_##VB) : "xmm0"); \ } -void op_sse_vcmpeqfp(void) -{ - VD = _mm_cmpeq_ps(V0, V1); +DEFINE_OP(vcmpequb, pcmpeqb, V0, V1); +DEFINE_OP(vcmpequh, pcmpeqw, V0, V1); +DEFINE_OP(vcmpequw, pcmpeqd, V0, V1); +DEFINE_OP(vcmpgtsb, pcmpgtb, V0, V1); +DEFINE_OP(vcmpgtsh, pcmpgtw, V0, V1); +DEFINE_OP(vcmpgtsw, pcmpgtd, V0, V1); +DEFINE_OP(vaddubm, paddb, V0, V1); +DEFINE_OP(vadduhm, paddw, V0, V1); +DEFINE_OP(vadduwm, paddd, V0, V1); +DEFINE_OP(vsububm, psubb, V0, V1); +DEFINE_OP(vsubuhm, psubw, V0, V1); +DEFINE_OP(vsubuwm, psubd, V0, 
V1); +DEFINE_OP(vand, pand, V0, V1); +DEFINE_OP(vandc, pandn, V1, V0); +DEFINE_OP(vor, por, V0, V1); +DEFINE_OP(vxor, pxor, V0, V1); +DEFINE_OP(vavgub, pavgb, V0, V1); +DEFINE_OP(vavguh, pavgw, V0, V1); + +#undef DEFINE_OP + +#define DEFINE_OP(SH) \ +void op_sse2_vsldoi_##SH(void) \ +{ \ + asm volatile ("movdqa (%1),%%xmm0\n" \ + "movdqa (%2),%%xmm1\n" \ + "pshufd %3,%%xmm0,%%xmm0\n" \ + "pshufd %3,%%xmm1,%%xmm1\n" \ + "pslldq %4,%%xmm0\n" \ + "psrldq %5,%%xmm1\n" \ + "pshufd %3,%%xmm0,%%xmm0\n" \ + "pshufd %3,%%xmm1,%%xmm1\n" \ + "por %%xmm1,%%xmm0\n" \ + "movaps %%xmm0,(%0)\n" \ + : : \ + "r" (reg_VD), "r" (reg_V0), "r" (reg_V1), \ + "i" (0x1b), "i" (SH), "i" (16 - SH) \ + : "xmm0", "xmm1"); \ } -void op_sse_vcmpgefp(void) -{ - VD = _mm_cmpge_ps(V0, V1); +DEFINE_OP(1); +DEFINE_OP(2); +DEFINE_OP(3); +DEFINE_OP(4); +DEFINE_OP(5); +DEFINE_OP(6); +DEFINE_OP(7); +DEFINE_OP(8); +DEFINE_OP(9); +DEFINE_OP(10); +DEFINE_OP(11); +DEFINE_OP(12); +DEFINE_OP(13); +DEFINE_OP(14); +DEFINE_OP(15); + +#undef DEFINE_OP + +// SSE instructions +#define DEFINE_OP(NAME, OP, VA, VB) \ +void op_sse_##NAME(void) \ +{ \ + asm volatile ("movaps (%1),%%xmm0\n" \ + #OP " (%2),%%xmm0\n" \ + "movaps %%xmm0,(%0)\n" \ + : : "r" (reg_VD), "r" (reg_##VA), "r" (reg_##VB) : "xmm0"); \ } -void op_sse_vcmpgtfp(void) -{ - VD = _mm_cmpgt_ps(V0, V1); -} +DEFINE_OP(vcmpeqfp, cmpeqps, V0, V1); +DEFINE_OP(vcmpgefp, cmpleps, V1, V0); +DEFINE_OP(vcmpgtfp, cmpltps, V1, V0); +DEFINE_OP(vaddfp, addps, V0, V1); +DEFINE_OP(vsubfp, subps, V0, V1); +DEFINE_OP(vmaxfp, maxps, V0, V1); +DEFINE_OP(vminfp, minps, V0, V1); +DEFINE_OP(vand, andps, V0, V1); +DEFINE_OP(vandc, andnps, V1, V0); +DEFINE_OP(vor, orps, V0, V1); +DEFINE_OP(vxor, xorps, V0, V1); +DEFINE_OP(vminub, pminub, V0, V1); +DEFINE_OP(vmaxub, pmaxub, V0, V1); +DEFINE_OP(vminsh, pminsw, V0, V1); +DEFINE_OP(vmaxsh, pmaxsw, V0, V1); -void op_sse_vaddfp(void) -{ - VD = _mm_add_ps(V0, V1); -} - -void op_sse_vsubfp(void) -{ - VD = _mm_sub_ps(V0, V1); -} +#undef 
DEFINE_OP void op_sse_vmaddfp(void) { - VD = _mm_add_ps(_mm_mul_ps(V0, V2), V1); + asm volatile ("movaps (%1),%%xmm0\n" + "mulps (%3),%%xmm0\n" + "addps (%2),%%xmm0\n" + "movaps %%xmm0,(%0)\n" + : : "r" (reg_VD), "r" (reg_V0), "r" (reg_V1), "r" (reg_V2) : "xmm0"); } void op_sse_vnmsubfp(void) { - VD = _mm_sub_ps(_mm_setzero_ps(), _mm_sub_ps(_mm_mul_ps(V0, V2), V1)); + asm volatile ("movaps (%1),%%xmm0\n" + "xorps %%xmm1,%%xmm1\n" + "mulps (%3),%%xmm0\n" + "subps (%2),%%xmm0\n" + "subps %%xmm0,%%xmm1\n" + "movaps %%xmm1,(%0)\n" + : : "r" (reg_VD), "r" (reg_V0), "r" (reg_V1), "r" (reg_V2) : "xmm0", "xmm1"); } -void op_sse_vmaxfp(void) -{ - VD = _mm_max_ps(V0, V1); +#define DEFINE_OP(VD, VS) \ +void op_sse_mov_##VD##_##VS(void) \ +{ \ + asm volatile ("movaps (%1),%%xmm0\n" \ + "movaps %%xmm0,(%0)\n" \ + : : "r" (reg_##VD), "r" (reg_##VS) : "xmm0"); \ } -void op_sse_vminfp(void) -{ - VD = _mm_min_ps(V0, V1); -} +DEFINE_OP(VD, V0); +DEFINE_OP(VD, V1); +DEFINE_OP(VD, V2); -void op_sse_vand(void) -{ - VD = _mm_and_ps(V0, V1); -} - -void op_sse_vandc(void) -{ - VD = _mm_andnot_ps(V1, V0); -} - -void op_sse_vor(void) -{ - VD = _mm_or_ps(V0, V1); -} - -void op_sse_vxor(void) -{ - VD = _mm_xor_ps(V0, V1); -} -#endif - -/** - * MMX optimizations - **/ - -#if defined(__MMX__) && defined(HAVE_MMINTRIN_H) -#include -#undef VD -#define VD ((__m64 *)reg_VD) -#undef V0 -#define V0 ((__m64 *)reg_V0) -#undef V1 -#define V1 ((__m64 *)reg_V1) -#undef V2 -#define V2 ((__m64 *)reg_V2) - -void op_mmx_nop(void) -{ - asm volatile ("nop"); -} +#undef DEFINE_OP +// MMX instructions void op_emms(void) { - _mm_empty(); + asm volatile ("emms"); } -void op_mmx_vcmpequb(void) -{ - VD[0] = _mm_cmpeq_pi8(V0[0], V1[0]); - VD[1] = _mm_cmpeq_pi8(V0[1], V1[1]); +#define DEFINE_OP(NAME, OP, VA, VB) \ +void op_mmx_##NAME(void) \ +{ \ + asm volatile ("movq (%1),%%mm0\n" \ + "movq 8(%1),%%mm1\n" \ + #OP " (%2),%%mm0\n" \ + #OP " 8(%2),%%mm1\n" \ + "movq %%mm0,(%0)\n" \ + "movq %%mm1,8(%0)\n" \ + : : "r" 
(reg_VD), "r" (reg_##VA), "r" (reg_##VB) : "mm0", "mm1"); \ } -void op_mmx_vcmpequh(void) -{ - VD[0] = _mm_cmpeq_pi16(V0[0], V1[0]); - VD[1] = _mm_cmpeq_pi16(V0[1], V1[1]); -} +DEFINE_OP(vcmpequb, pcmpeqb, V0, V1); +DEFINE_OP(vcmpequh, pcmpeqw, V0, V1); +DEFINE_OP(vcmpequw, pcmpeqd, V0, V1); +DEFINE_OP(vcmpgtsb, pcmpgtb, V0, V1); +DEFINE_OP(vcmpgtsh, pcmpgtw, V0, V1); +DEFINE_OP(vcmpgtsw, pcmpgtd, V0, V1); +DEFINE_OP(vaddubm, paddb, V0, V1); +DEFINE_OP(vadduhm, paddw, V0, V1); +DEFINE_OP(vadduwm, paddd, V0, V1); +DEFINE_OP(vsububm, psubb, V0, V1); +DEFINE_OP(vsubuhm, psubw, V0, V1); +DEFINE_OP(vsubuwm, psubd, V0, V1); +DEFINE_OP(vand, pand, V0, V1); +DEFINE_OP(vandc, pandn, V1, V0); +DEFINE_OP(vor, por, V0, V1); +DEFINE_OP(vxor, pxor, V0, V1); +DEFINE_OP(vmaxub, pmaxub, V0, V1); +DEFINE_OP(vminub, pminub, V0, V1); +DEFINE_OP(vmaxsh, pmaxsw, V0, V1); +DEFINE_OP(vminsh, pminsw, V0, V1); -void op_mmx_vcmpequw(void) -{ - VD[0] = _mm_cmpeq_pi32(V0[0], V1[0]); - VD[1] = _mm_cmpeq_pi32(V0[1], V1[1]); -} - -void op_mmx_vcmpgtsb(void) -{ - VD[0] = _mm_cmpgt_pi8(V0[0], V1[0]); - VD[1] = _mm_cmpgt_pi8(V0[1], V1[1]); -} - -void op_mmx_vcmpgtsh(void) -{ - VD[0] = _mm_cmpgt_pi16(V0[0], V1[0]); - VD[1] = _mm_cmpgt_pi16(V0[1], V1[1]); -} - -void op_mmx_vcmpgtsw(void) -{ - VD[0] = _mm_cmpgt_pi32(V0[0], V1[0]); - VD[1] = _mm_cmpgt_pi32(V0[1], V1[1]); -} - -void op_mmx_vaddubm(void) -{ - VD[0] = _mm_add_pi8(V0[0], V1[0]); - VD[1] = _mm_add_pi8(V0[1], V1[1]); -} - -void op_mmx_vadduhm(void) -{ - VD[0] = _mm_add_pi16(V0[0], V1[0]); - VD[1] = _mm_add_pi16(V0[1], V1[1]); -} - -void op_mmx_vadduwm(void) -{ - VD[0] = _mm_add_pi32(V0[0], V1[0]); - VD[1] = _mm_add_pi32(V0[1], V1[1]); -} - -void op_mmx_vsububm(void) -{ - VD[0] = _mm_sub_pi8(V0[0], V1[0]); - VD[1] = _mm_sub_pi8(V0[1], V1[1]); -} - -void op_mmx_vsubuhm(void) -{ - VD[0] = _mm_sub_pi16(V0[0], V1[0]); - VD[1] = _mm_sub_pi16(V0[1], V1[1]); -} - -void op_mmx_vsubuwm(void) -{ - VD[0] = _mm_sub_pi32(V0[0], V1[0]); - VD[1] = 
_mm_sub_pi32(V0[1], V1[1]); -} - -void op_mmx_vand(void) -{ - VD[0] = _mm_and_si64(V0[0], V1[0]); - VD[1] = _mm_and_si64(V0[1], V1[1]); -} - -void op_mmx_vandc(void) -{ - VD[0] = _mm_andnot_si64(V1[0], V0[0]); - VD[1] = _mm_andnot_si64(V1[1], V0[1]); -} - -void op_mmx_vor(void) -{ - VD[0] = _mm_or_si64(V0[0], V1[0]); - VD[1] = _mm_or_si64(V0[1], V1[1]); -} - -void op_mmx_vxor(void) -{ - VD[0] = _mm_xor_si64(V0[0], V1[0]); - VD[1] = _mm_xor_si64(V0[1], V1[1]); -} - -#if defined(__SSE__) -void op_mmx_vmaxub(void) -{ - VD[0] = _mm_max_pu8(V0[0], V1[0]); - VD[1] = _mm_max_pu8(V0[1], V1[1]); -} - -void op_mmx_vminub(void) -{ - VD[0] = _mm_min_pu8(V0[0], V1[0]); - VD[1] = _mm_min_pu8(V0[1], V1[1]); -} - -void op_mmx_vmaxsh(void) -{ - VD[0] = _mm_max_pi16(V0[0], V1[0]); - VD[1] = _mm_max_pi16(V0[1], V1[1]); -} - -void op_mmx_vminsh(void) -{ - VD[0] = _mm_min_pi16(V0[0], V1[0]); - VD[1] = _mm_min_pi16(V0[1], V1[1]); -} -#endif +#undef DEFINE_OP #endif diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp index 512a9e61..9ff6d048 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp @@ -396,7 +396,6 @@ powerpc_dyngen::vector_codegen(int insn) powerpc_dyngen::gen_handler_t powerpc_dyngen::vector_codegen_mmx(int insn) { -#ifdef HAVE_gen_op_mmx_nop if (!(cpu_features & HWCAP_I386_MMX)) return 0; @@ -423,14 +422,14 @@ powerpc_dyngen::vector_codegen_mmx(int insn) #undef GEN_OP } -#ifdef HAVE_gen_op_sse_nop if (gen_op.ptr()) return gen_op; + // new MMX instructions brought in SSE capable CPUs + // XXX: also available as AMD MMX extensions if (!(cpu_features & HWCAP_I386_SSE)) return 0; - /* XXX: is the MMX unit really used for those? 
*/ switch (insn) { #define GEN_OP(NAME) nv_mem_fun(&powerpc_dyngen::gen_op_mmx_##NAME) case PPC_I(VMAXSH): gen_op = GEN_OP(vmaxsh); break; @@ -439,17 +438,12 @@ powerpc_dyngen::vector_codegen_mmx(int insn) case PPC_I(VMINUB): gen_op = GEN_OP(vminub); break; #undef GEN_OP } -#endif return gen_op; -#endif - - return 0; } powerpc_dyngen::gen_handler_t powerpc_dyngen::vector_codegen_sse(int insn) { -#ifdef HAVE_gen_op_sse_nop if (!(cpu_features & HWCAP_I386_SSE)) return 0; @@ -470,25 +464,84 @@ powerpc_dyngen::vector_codegen_sse(int insn) case PPC_I(VOR): gen_op = GEN_OP(vor); break; case PPC_I(VSUBFP): gen_op = GEN_OP(vsubfp); break; case PPC_I(VXOR): gen_op = GEN_OP(vxor); break; + case PPC_I(VMINUB): gen_op = GEN_OP(vminub); break; + case PPC_I(VMAXUB): gen_op = GEN_OP(vmaxub); break; + case PPC_I(VMINSH): gen_op = GEN_OP(vminsh); break; + case PPC_I(VMAXSH): gen_op = GEN_OP(vmaxsh); break; #undef GEN_OP } return gen_op; -#endif +} - return 0; +bool powerpc_dyngen::gen_vector_shift_octet(int vD, int vA, int vB, int SH) +{ + if (!(cpu_features & HWCAP_I386_SSE2)) + return false; + + gen_load_ad_VD_VR(vD); + gen_load_ad_V0_VR(vA); + if (SH == 0) + gen_op_sse_mov_VD_V0(); + else { + gen_load_ad_V1_VR(vB); + switch (SH) { +#define GEN_OP(SH) case SH: gen_op_sse2_vsldoi_##SH(); break + GEN_OP(1); + GEN_OP(2); + GEN_OP(3); + GEN_OP(4); + GEN_OP(5); + GEN_OP(6); + GEN_OP(7); + GEN_OP(8); + GEN_OP(9); + GEN_OP(10); + GEN_OP(11); + GEN_OP(12); + GEN_OP(13); + GEN_OP(14); + GEN_OP(15); +#undef GEN_OP + default: abort(); + } + } + return true; } powerpc_dyngen::gen_handler_t powerpc_dyngen::vector_codegen_sse2(int insn) { - return 0; + if (!(cpu_features & HWCAP_I386_SSE2)) + return 0; + + /* XXX: auto-generate the table with individual handlers */ + gen_handler_t gen_op = 0; + switch (insn) { +#define GEN_OP(NAME) nv_mem_fun(&powerpc_dyngen::gen_op_sse2_##NAME) + case PPC_I(VADDUBM): gen_op = GEN_OP(vaddubm); break; + case PPC_I(VADDUHM): gen_op = GEN_OP(vadduhm); break; + 
case PPC_I(VADDUWM): gen_op = GEN_OP(vadduwm); break; + case PPC_I(VSUBUBM): gen_op = GEN_OP(vsububm); break; + case PPC_I(VSUBUHM): gen_op = GEN_OP(vsubuhm); break; + case PPC_I(VSUBUWM): gen_op = GEN_OP(vsubuwm); break; + case PPC_I(VAND): gen_op = GEN_OP(vand); break; + case PPC_I(VANDC): gen_op = GEN_OP(vandc); break; + case PPC_I(VOR): gen_op = GEN_OP(vor); break; + case PPC_I(VXOR): gen_op = GEN_OP(vxor); break; + case PPC_I(VCMPEQUB): gen_op = GEN_OP(vcmpequb); break; + case PPC_I(VCMPEQUH): gen_op = GEN_OP(vcmpequh); break; + case PPC_I(VCMPEQUW): gen_op = GEN_OP(vcmpequw); break; + case PPC_I(VCMPGTSB): gen_op = GEN_OP(vcmpgtsb); break; + case PPC_I(VCMPGTSH): gen_op = GEN_OP(vcmpgtsh); break; + case PPC_I(VCMPGTSW): gen_op = GEN_OP(vcmpgtsw); break; +#undef GEN_OP + } + return gen_op; } void powerpc_dyngen::gen_mmx_clear(void) { -#ifdef HAVE_gen_op_mmx_nop if (cpu_features & HWCAP_I386_MMX) gen_op_emms(); -#endif } #endif diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp index da181f23..14c1df51 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp @@ -233,6 +233,8 @@ public: void gen_store_word_VS_T0(int vS); void gen_store_vect_VS_T0(int vS); DEFINE_ALIAS(record_cr6_VD,0); + DEFINE_ALIAS(mfvscr_VD,0); + DEFINE_ALIAS(mtvscr_V0,0); // Code generators for AltiVec instructions gen_handler_t vector_codegen(int insn); @@ -241,6 +243,7 @@ public: gen_handler_t vector_codegen_sse(int insn); gen_handler_t vector_codegen_sse2(int insn); void gen_mmx_clear(void); + bool gen_vector_shift_octet(int vD, int vA, int vB, int SH); #endif #undef DEFINE_ALIAS diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp index 68c5d6ba..fc45d713 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp @@ -1377,12 
+1377,38 @@ powerpc_cpu::compile_block(uint32 entry_point) } break; } + case PPC_I(MFVSCR): + { + dg.gen_load_ad_VD_VR(vD_field::extract(opcode)); + dg.gen_mfvscr_VD(); + break; + } + case PPC_I(MTVSCR): + { + dg.gen_load_ad_V0_VR(vB_field::extract(opcode)); + dg.gen_mtvscr_V0(); + break; + } +#if defined(__i386__) || defined(__x86_64__) + case PPC_I(VSLDOI): + { + const int vD = vD_field::extract(opcode); + const int vA = vA_field::extract(opcode); + const int vB = vB_field::extract(opcode); + const int SH = vSH_field::extract(opcode); + if (dg.gen_vector_shift_octet(vD, vA, vB, SH)) + break; + // fall through + } +#endif case PPC_I(VADDFP): case PPC_I(VADDUBM): case PPC_I(VADDUHM): case PPC_I(VADDUWM): case PPC_I(VAND): case PPC_I(VANDC): + case PPC_I(VAVGUB): + case PPC_I(VAVGUH): case PPC_I(VCMPEQFP): case PPC_I(VCMPEQUB): case PPC_I(VCMPEQUH): @@ -1413,7 +1439,10 @@ powerpc_cpu::compile_block(uint32 entry_point) /* XXX: analyze the block function */ bool mmx_used = false; - if ((gen_op = dg.vector_codegen_sse(ii->mnemo)).ptr()) { + if ((gen_op = dg.vector_codegen_sse2(ii->mnemo)).ptr()) { + /* SSE2 code generator available */ + } + else if ((gen_op = dg.vector_codegen_sse(ii->mnemo)).ptr()) { /* SSE code generator available */ } else if ((gen_op = dg.vector_codegen_mmx(ii->mnemo)).ptr()) {