From e07e2196e3627a00361f02bdd8558c4525c24464 Mon Sep 17 00:00:00 2001 From: gbeauche <> Date: Mon, 17 Jul 2006 06:56:38 +0000 Subject: [PATCH] Use new code generator. The gain is only 10%, bottlenecks are elsewhere. Optimize Altivec vector splat instructions after Agner's guide. --- .../src/kpx_cpu/src/cpu/jit/basic-dyngen.cpp | 2 +- .../src/kpx_cpu/src/cpu/jit/basic-dyngen.hpp | 4 +- .../src/kpx_cpu/src/cpu/jit/jit-cache.hpp | 3 +- .../src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp | 1 + .../kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp | 134 ---- .../src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp | 26 - .../src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp | 1 - .../src/kpx_cpu/src/cpu/ppc/ppc-jit.cpp | 692 ++++++++++++++++-- .../src/kpx_cpu/src/cpu/ppc/ppc-jit.hpp | 59 +- .../src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp | 106 ++- 10 files changed, 721 insertions(+), 307 deletions(-) diff --git a/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.cpp b/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.cpp index 7c8d863e..fec37f70 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.cpp @@ -29,7 +29,7 @@ int __op_jmp0, __op_jmp1; #include "basic-dyngen-ops.hpp" basic_dyngen::basic_dyngen(dyngen_cpu_base cpu, int cache_size) - : parent_cpu(cpu), basic_jit_cache(cache_size) + : parent_cpu(cpu), jit_codegen(cache_size) { execute_func = gen_start(); gen_op_execute(); diff --git a/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.hpp b/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.hpp index b0d5831c..f46e0fcc 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/jit/basic-dyngen.hpp @@ -22,7 +22,7 @@ #define BASIC_DYNGEN_H #include "cpu/jit/jit-config.hpp" -#include "cpu/jit/jit-cache.hpp" +#include "cpu/jit/jit-codegen.hpp" // Set jump target address static inline void dg_set_jmp_target(uint8 *jmp_addr, uint8 *addr) @@ -57,7 +57,7 @@ typedef basic_cpu *dyngen_cpu_base; #endif class basic_dyngen - : public basic_jit_cache + : public jit_codegen { uint8 *execute_func; uint8 *gen_code_start; diff --git a/SheepShaver/src/kpx_cpu/src/cpu/jit/jit-cache.hpp b/SheepShaver/src/kpx_cpu/src/cpu/jit/jit-cache.hpp index 5ee6db87..9fd265c2 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/jit/jit-cache.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/jit/jit-cache.hpp @@ -54,8 +54,9 @@ protected: // Initialize user code start void set_code_start(uint8 *ptr); - // Get & increase current position + // Increase/set/get current position void inc_code_ptr(int offset) { code_p += offset; } + void set_code_ptr(uint8 *ptr) { code_p = ptr; } public: uint8 *code_ptr() const { return code_p; } diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp index c7958ff1..f1f9ac3a 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-cpu.hpp @@ -374,6 +374,7 @@ private: // Dynamic translation engine friend class powerpc_dyngen_helper; friend class powerpc_dyngen; + friend class powerpc_jit; powerpc_jit codegen; block_info *compile_block(uint32 entry); #if DYNGEN_DIRECT_BLOCK_CHAINING diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp index 9d22a0e1..4a573541 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp @@ -1709,140 +1709,6 @@ void op_mtvscr_V0(void) #define __sse_clobbers(reglist...) 
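Note on the ppc-dyngen-ops.cpp removal that follows: each deleted op_sse2_*/op_sse_* helper wrapped a single packed SSE instruction behind the reg_VD/reg_V0/reg_V1 vector pointers, and the new code generator now emits the equivalent instruction directly into the translation cache (see the gen_sse_arith_*/gen_sse2_arith_* handlers added in ppc-jit.cpp further down). For reference, a minimal intrinsics sketch of what one removed helper, op_sse2_vcmpequb, computed — the function name and signature here are illustrative, not part of the patch:

    #include <emmintrin.h>   // SSE2 intrinsics

    // Reference semantics of the removed op_sse2_vcmpequb helper: load two
    // 16-byte vectors, compare bytes for equality (0xFF per matching byte,
    // 0x00 otherwise), store the 16-byte result through the VD pointer.
    static inline void sse2_vcmpequb_ref(__m128i *vD, const __m128i *vA, const __m128i *vB)
    {
        __m128i a = _mm_load_si128(vA);                      // movdqa (vA),%xmm0
        __m128i r = _mm_cmpeq_epi8(a, _mm_load_si128(vB));   // pcmpeqb (vB),%xmm0
        _mm_store_si128(vD, r);                              // movdqa %xmm0,(vD)
    }
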
#endif -// SSE2 instructions -#define DEFINE_OP(NAME, OP, VA, VB) \ -void op_sse2_##NAME(void) \ -{ \ - asm volatile ("movdqa (%1),%%xmm0\n" \ - #OP " (%2),%%xmm0\n" \ - "movdqa %%xmm0,(%0)\n" \ - : : "r" (reg_VD), "r" (reg_##VA), "r" (reg_##VB) \ - : __sse_clobbers("xmm0")); \ -} - -DEFINE_OP(vcmpequb, pcmpeqb, V0, V1); -DEFINE_OP(vcmpequh, pcmpeqw, V0, V1); -DEFINE_OP(vcmpequw, pcmpeqd, V0, V1); -DEFINE_OP(vcmpgtsb, pcmpgtb, V0, V1); -DEFINE_OP(vcmpgtsh, pcmpgtw, V0, V1); -DEFINE_OP(vcmpgtsw, pcmpgtd, V0, V1); -DEFINE_OP(vaddubm, paddb, V0, V1); -DEFINE_OP(vadduhm, paddw, V0, V1); -DEFINE_OP(vadduwm, paddd, V0, V1); -DEFINE_OP(vsububm, psubb, V0, V1); -DEFINE_OP(vsubuhm, psubw, V0, V1); -DEFINE_OP(vsubuwm, psubd, V0, V1); -DEFINE_OP(vand, pand, V0, V1); -DEFINE_OP(vandc, pandn, V1, V0); -DEFINE_OP(vor, por, V0, V1); -DEFINE_OP(vxor, pxor, V0, V1); -DEFINE_OP(vavgub, pavgb, V0, V1); -DEFINE_OP(vavguh, pavgw, V0, V1); - -#undef DEFINE_OP - -#define DEFINE_OP(SH) \ -void op_sse2_vsldoi_##SH(void) \ -{ \ - asm volatile ("movdqa (%1),%%xmm0\n" \ - "movdqa (%2),%%xmm1\n" \ - "pshufd %3,%%xmm0,%%xmm0\n" \ - "pshufd %3,%%xmm1,%%xmm1\n" \ - "pslldq %4,%%xmm0\n" \ - "psrldq %5,%%xmm1\n" \ - "por %%xmm1,%%xmm0\n" \ - "pshufd %3,%%xmm0,%%xmm0\n" \ - "movdqa %%xmm0,(%0)\n" \ - : : \ - "r" (reg_VD), "r" (reg_V0), "r" (reg_V1), \ - "i" (0x1b), "i" (SH), "i" (16 - SH) \ - : __sse_clobbers("xmm0", "xmm1")); \ -} - -DEFINE_OP(1); -DEFINE_OP(2); -DEFINE_OP(3); -DEFINE_OP(4); -DEFINE_OP(5); -DEFINE_OP(6); -DEFINE_OP(7); -DEFINE_OP(8); -DEFINE_OP(9); -DEFINE_OP(10); -DEFINE_OP(11); -DEFINE_OP(12); -DEFINE_OP(13); -DEFINE_OP(14); -DEFINE_OP(15); - -#undef DEFINE_OP - -// SSE instructions -#define DEFINE_OP(NAME, OP, VA, VB) \ -void op_sse_##NAME(void) \ -{ \ - asm volatile ("movaps (%1),%%xmm0\n" \ - #OP " (%2),%%xmm0\n" \ - "movaps %%xmm0,(%0)\n" \ - : : "r" (reg_VD), "r" (reg_##VA), "r" (reg_##VB) \ - : __sse_clobbers("xmm0")); \ -} - -DEFINE_OP(vcmpeqfp, cmpeqps, V0, V1); -DEFINE_OP(vcmpgefp, cmpleps, V1, V0); -DEFINE_OP(vcmpgtfp, cmpltps, V1, V0); -DEFINE_OP(vaddfp, addps, V0, V1); -DEFINE_OP(vsubfp, subps, V0, V1); -DEFINE_OP(vmaxfp, maxps, V0, V1); -DEFINE_OP(vminfp, minps, V0, V1); -DEFINE_OP(vand, andps, V0, V1); -DEFINE_OP(vandc, andnps, V1, V0); -DEFINE_OP(vor, orps, V0, V1); -DEFINE_OP(vxor, xorps, V0, V1); -DEFINE_OP(vminub, pminub, V0, V1); -DEFINE_OP(vmaxub, pmaxub, V0, V1); -DEFINE_OP(vminsh, pminsw, V0, V1); -DEFINE_OP(vmaxsh, pmaxsw, V0, V1); - -#undef DEFINE_OP - -void op_sse_vmaddfp(void) -{ - asm volatile ("movaps (%1),%%xmm0\n" - "mulps (%3),%%xmm0\n" - "addps (%2),%%xmm0\n" - "movaps %%xmm0,(%0)\n" - : : "r" (reg_VD), "r" (reg_V0), "r" (reg_V1), "r" (reg_V2) - : __sse_clobbers("xmm0")); -} - -void op_sse_vnmsubfp(void) -{ - asm volatile ("movaps (%1),%%xmm0\n" - "xorps %%xmm1,%%xmm1\n" - "mulps (%3),%%xmm0\n" - "subps (%2),%%xmm0\n" - "subps %%xmm0,%%xmm1\n" - "movaps %%xmm1,(%0)\n" - : : "r" (reg_VD), "r" (reg_V0), "r" (reg_V1), "r" (reg_V2) - : __sse_clobbers("xmm0", "xmm1")); -} - -#define DEFINE_OP(VD, VS) \ -void op_sse_mov_##VD##_##VS(void) \ -{ \ - asm volatile ("movaps (%1),%%xmm0\n" \ - "movaps %%xmm0,(%0)\n" \ - : : "r" (reg_##VD), "r" (reg_##VS) \ - : __sse_clobbers("xmm0")); \ -} - -DEFINE_OP(VD, V0); -DEFINE_OP(VD, V1); -DEFINE_OP(VD, V2); - -#undef DEFINE_OP - // MMX instructions void op_emms(void) { diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp index 87defdad..e7f35048 100644 --- 
a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp @@ -307,29 +307,3 @@ void powerpc_dyngen::gen_store_vect_VS_T0(int vS) gen_load_ad_VD_VR(vS); gen_op_store_vect_VD_T0(); } - -void powerpc_dyngen::gen_sse2_vsldoi_VD_V0_V1(int SH) -{ -#if defined(__i386__) || defined(__x86_64__) - switch (SH) { -#define GEN_OP(SH) case SH: gen_op_sse2_vsldoi_##SH(); break - GEN_OP(1); - GEN_OP(2); - GEN_OP(3); - GEN_OP(4); - GEN_OP(5); - GEN_OP(6); - GEN_OP(7); - GEN_OP(8); - GEN_OP(9); - GEN_OP(10); - GEN_OP(11); - GEN_OP(12); - GEN_OP(13); - GEN_OP(14); - GEN_OP(15); -#undef GEN_OP - default: abort(); - } -#endif -} diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp index 65d3c0f3..a4c5d852 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp @@ -234,7 +234,6 @@ public: DEFINE_ALIAS(record_cr6_VD,0); DEFINE_ALIAS(mfvscr_VD,0); DEFINE_ALIAS(mtvscr_V0,0); - void gen_sse2_vsldoi_VD_V0_V1(int SH); #undef DEFINE_ALIAS #undef DEFINE_ALIAS_0 diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-jit.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-jit.cpp index d1cdb06b..8521df7c 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-jit.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-jit.cpp @@ -19,8 +19,11 @@ */ #include "sysdeps.h" +#include "cpu/jit/dyngen-exec.h" #include "cpu/ppc/ppc-jit.hpp" +#include "cpu/ppc/ppc-cpu.hpp" #include "cpu/ppc/ppc-instructions.hpp" +#include "cpu/ppc/ppc-operands.hpp" #include "utils/utils-cpuinfo.hpp" #include "utils/utils-sentinel.hpp" @@ -40,7 +43,6 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size) static const jit_info_t jit_not_available = { -1, (gen_handler_t)&powerpc_jit::gen_not_available, - 0 }; for (int i = 0; i < PPC_I(MAX); i++) jit_info[i] = &jit_not_available; @@ -57,28 +59,54 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size) DEFINE_OP(VANDC, 2, vandc_VD_V0_V1), DEFINE_OP(VNOR, 2, vnor_VD_V0_V1), DEFINE_OP(VOR, 2, vor_VD_V0_V1), - DEFINE_OP(VXOR, 2, vxor_VD_V0_V1) + DEFINE_OP(VXOR, 2, vxor_VD_V0_V1), + DEFINE_OP(MFVSCR, 1, mfvscr_VD), + DEFINE_OP(MTVSCR, 1, mtvscr_V0), +#undef DEFINE_OP +#define DEFINE_OP(MNEMO, GEN_OP) \ + { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_generic_##GEN_OP, } + DEFINE_OP(LVX, load), + DEFINE_OP(LVXL, load), + DEFINE_OP(LVEWX, load_word), + DEFINE_OP(STVX, store), + DEFINE_OP(STVXL, store), + DEFINE_OP(STVEWX, store_word), #undef DEFINE_OP }; for (int i = 0; i < sizeof(gen_vector) / sizeof(gen_vector[0]); i++) jit_info[gen_vector[i].mnemo] = &gen_vector[i]; #if defined(__i386__) || defined(__x86_64__) + // x86 optimized handlers + static const jit_info_t x86_vector[] = { +#define DEFINE_OP(MNEMO, GEN_OP) \ + { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_x86_##GEN_OP, } + DEFINE_OP(MTVSCR, mtvscr), + DEFINE_OP(MFVSCR, mfvscr), + DEFINE_OP(LVX, lvx), + DEFINE_OP(LVXL, lvx), + DEFINE_OP(STVX, stvx), + DEFINE_OP(STVXL, stvx) +#undef DEFINE_OP + }; + for (int i = 0; i < sizeof(x86_vector) / sizeof(x86_vector[0]); i++) + jit_info[x86_vector[i].mnemo] = &x86_vector[i]; + // MMX optimized handlers static const jit_info_t mmx_vector[] = { #define DEFINE_OP(MNEMO, GEN_OP, DYNGEN_OP) \ - { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_mmx_##GEN_OP, &powerpc_dyngen::gen_op_mmx_##DYNGEN_OP } + { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_mmx_arith_##GEN_OP, &powerpc_dyngen::gen_op_mmx_##DYNGEN_OP } 
DEFINE_OP(VADDUBM, 2, vaddubm), DEFINE_OP(VADDUHM, 2, vadduhm), DEFINE_OP(VADDUWM, 2, vadduwm), DEFINE_OP(VAND, 2, vand), DEFINE_OP(VANDC, 2, vandc), - DEFINE_OP(VCMPEQUB, 2, vcmpequb), - DEFINE_OP(VCMPEQUH, 2, vcmpequh), - DEFINE_OP(VCMPEQUW, 2, vcmpequw), - DEFINE_OP(VCMPGTSB, 2, vcmpgtsb), - DEFINE_OP(VCMPGTSH, 2, vcmpgtsh), - DEFINE_OP(VCMPGTSW, 2, vcmpgtsw), + DEFINE_OP(VCMPEQUB, c, vcmpequb), + DEFINE_OP(VCMPEQUH, c, vcmpequh), + DEFINE_OP(VCMPEQUW, c, vcmpequw), + DEFINE_OP(VCMPGTSB, c, vcmpgtsb), + DEFINE_OP(VCMPGTSH, c, vcmpgtsh), + DEFINE_OP(VCMPGTSW, c, vcmpgtsw), DEFINE_OP(VOR, 2, vor), DEFINE_OP(VSUBUBM, 2, vsububm), DEFINE_OP(VSUBUHM, 2, vsubuhm), @@ -95,32 +123,38 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size) static const jit_info_t sse_vector[] = { // new MMX instructions brought into SSE capable CPUs #define DEFINE_OP(MNEMO, GEN_OP, DYNGEN_OP) \ - { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_mmx_##GEN_OP, &powerpc_dyngen::gen_op_mmx_##DYNGEN_OP } + { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_mmx_arith_##GEN_OP, &powerpc_dyngen::gen_op_mmx_##DYNGEN_OP } DEFINE_OP(VMAXSH, 2, vmaxsh), DEFINE_OP(VMAXUB, 2, vmaxub), DEFINE_OP(VMINSH, 2, vminsh), DEFINE_OP(VMINUB, 2, vminub), #undef DEFINE_OP // full SSE instructions -#define DEFINE_OP(MNEMO, GEN_OP, DYNGEN_OP) \ - { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_generic_##GEN_OP, &powerpc_dyngen::gen_op_sse_##DYNGEN_OP } - DEFINE_OP(VADDFP, 2, vaddfp), - DEFINE_OP(VAND, 2, vand), - DEFINE_OP(VANDC, 2, vandc), - DEFINE_OP(VCMPEQFP, 2, vcmpeqfp), - DEFINE_OP(VCMPGEFP, 2, vcmpgefp), - DEFINE_OP(VCMPGTFP, 2, vcmpgtfp), - DEFINE_OP(VMADDFP, 3, vmaddfp), - DEFINE_OP(VMAXFP, 2, vmaxfp), - DEFINE_OP(VMINFP, 2, vminfp), - DEFINE_OP(VNMSUBFP, 3, vnmsubfp), - DEFINE_OP(VOR, 2, vor), - DEFINE_OP(VSUBFP, 2, vsubfp), - DEFINE_OP(VXOR, 2, vxor), - DEFINE_OP(VMINUB, 2, vminub), - DEFINE_OP(VMAXUB, 2, vmaxub), - DEFINE_OP(VMINSH, 2, vminsh), - DEFINE_OP(VMAXSH, 2, vmaxsh) +#define DEFINE_OP(MNEMO, GEN_OP, TYPE_OP, SSE_OP) \ + { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse_arith_##GEN_OP, (X86_INSN_SSE_##TYPE_OP << 8) | X86_SSE_##SSE_OP } + DEFINE_OP(VADDFP, 2, PS,ADD), + DEFINE_OP(VAND, 2, PS,AND), + DEFINE_OP(VANDC, s, PS,ANDN), + DEFINE_OP(VMAXFP, 2, PS,MAX), + DEFINE_OP(VMINFP, 2, PS,MIN), + DEFINE_OP(VOR, 2, PS,OR), + DEFINE_OP(VSUBFP, 2, PS,SUB), + DEFINE_OP(VXOR, 2, PS,XOR), + DEFINE_OP(VMINUB, 2, PI,PMINUB), + DEFINE_OP(VMAXUB, 2, PI,PMAXUB), + DEFINE_OP(VMINSH, 2, PI,PMINSW), + DEFINE_OP(VMAXSH, 2, PI,PMAXSW), +#undef DEFINE_OP +#define DEFINE_OP(MNEMO, COND) \ + { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse_arith_c, X86_SSE_CC_##COND } + DEFINE_OP(VCMPEQFP, EQ), + DEFINE_OP(VCMPGEFP, GE), + DEFINE_OP(VCMPGTFP, GT), +#undef DEFINE_OP +#define DEFINE_OP(MNEMO, GEN_OP) \ + { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse_##GEN_OP } + DEFINE_OP(VMADDFP, vmaddfp), + DEFINE_OP(VNMSUBFP, vnmsubfp) #undef DEFINE_OP }; @@ -129,29 +163,39 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size) jit_info[sse_vector[i].mnemo] = &sse_vector[i]; } - // generic altivec handlers + // SSE2 optimized handlers static const jit_info_t sse2_vector[] = { -#define DEFINE_OP(MNEMO, GEN_OP, DYNGEN_OP) \ - { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_generic_##GEN_OP, &powerpc_dyngen::gen_op_sse2_##DYNGEN_OP } - DEFINE_OP(VADDUBM, 2, vaddubm), - DEFINE_OP(VADDUHM, 2, vadduhm), - DEFINE_OP(VADDUWM, 2, vadduwm), - DEFINE_OP(VSUBUBM, 2, vsububm), - DEFINE_OP(VSUBUHM, 2, vsubuhm), - 
DEFINE_OP(VSUBUWM, 2, vsubuwm), - DEFINE_OP(VAND, 2, vand), - DEFINE_OP(VANDC, 2, vandc), - DEFINE_OP(VOR, 2, vor), - DEFINE_OP(VXOR, 2, vxor), - DEFINE_OP(VCMPEQUB, 2, vcmpequb), - DEFINE_OP(VCMPEQUH, 2, vcmpequh), - DEFINE_OP(VCMPEQUW, 2, vcmpequw), - DEFINE_OP(VCMPGTSB, 2, vcmpgtsb), - DEFINE_OP(VCMPGTSH, 2, vcmpgtsh), - DEFINE_OP(VCMPGTSW, 2, vcmpgtsw), +#define DEFINE_OP(MNEMO, GEN_OP, TYPE_OP, SSE_OP) \ + { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse2_arith_##GEN_OP, (X86_INSN_SSE_##TYPE_OP << 8) | X86_SSE_##SSE_OP } + DEFINE_OP(VADDUBM, 2, PI,PADDB), + DEFINE_OP(VADDUHM, 2, PI,PADDW), + DEFINE_OP(VADDUWM, 2, PI,PADDD), + DEFINE_OP(VSUBUBM, 2, PI,PSUBB), + DEFINE_OP(VSUBUHM, 2, PI,PSUBW), + DEFINE_OP(VSUBUWM, 2, PI,PSUBD), + DEFINE_OP(VAND, 2, PI,PAND), + DEFINE_OP(VANDC, s, PI,PANDN), + DEFINE_OP(VOR, 2, PI,POR), + DEFINE_OP(VXOR, 2, PI,PXOR), + DEFINE_OP(VCMPEQUB, c, PI,PCMPEQB), + DEFINE_OP(VCMPEQUH, c, PI,PCMPEQW), + DEFINE_OP(VCMPEQUW, c, PI,PCMPEQD), + DEFINE_OP(VCMPGTSB, c, PI,PCMPGTB), + DEFINE_OP(VCMPGTSH, c, PI,PCMPGTW), + DEFINE_OP(VCMPGTSW, c, PI,PCMPGTD), + DEFINE_OP(VREFP, 2, PS,RCP), + DEFINE_OP(VRSQRTEFP,2, PS,RSQRT), +#undef DEFINE_OP +#define DEFINE_OP(MNEMO, GEN_OP) \ + { PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse2_##GEN_OP, } + DEFINE_OP(VSLDOI, vsldoi), + DEFINE_OP(VSPLTB, vspltb), + DEFINE_OP(VSPLTH, vsplth), + DEFINE_OP(VSPLTW, vspltw), + DEFINE_OP(VSPLTISB, vspltisb), + DEFINE_OP(VSPLTISH, vspltish), + DEFINE_OP(VSPLTISW, vspltisw) #undef DEFINE_OP - { PPC_I(VSLDOI), - (gen_handler_t)&powerpc_jit::gen_sse2_vsldoi, 0 } }; if (cpuinfo_check_sse2()) { @@ -163,72 +207,564 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size) } // Dispatch mid-level code generators -bool powerpc_jit::gen_vector_2(int mnemo, int vD, int vA, int vB, bool Rc) +bool powerpc_jit::gen_vector_1(int mnemo, int vD) { - return (this->*((bool (powerpc_jit::*)(int, bool, int, int, int))jit_info[mnemo]->handler))(mnemo, Rc, vD, vA, vB); + return (this->*((bool (powerpc_jit::*)(int, int))jit_info[mnemo]->handler))(mnemo, vD); } -bool powerpc_jit::gen_vector_3(int mnemo, int vD, int vA, int vB, int vC, bool Rc) +bool powerpc_jit::gen_vector_2(int mnemo, int vD, int vA, int vB) { - return (this->*((bool (powerpc_jit::*)(int, bool, int, int, int, int))jit_info[mnemo]->handler))(mnemo, Rc, vD, vA, vB, vC); + return (this->*((bool (powerpc_jit::*)(int, int, int, int))jit_info[mnemo]->handler))(mnemo, vD, vA, vB); +} + +bool powerpc_jit::gen_vector_3(int mnemo, int vD, int vA, int vB, int vC) +{ + return (this->*((bool (powerpc_jit::*)(int, int, int, int, int))jit_info[mnemo]->handler))(mnemo, vD, vA, vB, vC); +} + +bool powerpc_jit::gen_vector_compare(int mnemo, int vD, int vA, int vB, bool Rc) +{ + return (this->*((bool (powerpc_jit::*)(int, int, int, int, bool))jit_info[mnemo]->handler))(mnemo, vD, vA, vB, Rc); } -bool powerpc_jit::gen_not_available(int mnemo, bool Rc) +bool powerpc_jit::gen_not_available(int mnemo) { return false; } -bool powerpc_jit::gen_vector_generic_2(int mnemo, bool Rc, int vD, int vA, int vB) +bool powerpc_jit::gen_vector_generic_1(int mnemo, int vD) +{ + gen_load_ad_VD_VR(vD); + (this->*(jit_info[mnemo]->o.dyngen_handler))(); + return true; +} + +bool powerpc_jit::gen_vector_generic_2(int mnemo, int vD, int vA, int vB) { gen_load_ad_VD_VR(vD); gen_load_ad_V0_VR(vA); gen_load_ad_V1_VR(vB); - jit_info[mnemo]->dyngen_handler(this); - if (Rc) - gen_record_cr6_VD(); + (this->*(jit_info[mnemo]->o.dyngen_handler))(); return true; } -bool 
powerpc_jit::gen_vector_generic_3(int mnemo, bool Rc, int vD, int vA, int vB, int vC) +bool powerpc_jit::gen_vector_generic_3(int mnemo, int vD, int vA, int vB, int vC) { gen_load_ad_VD_VR(vD); gen_load_ad_V0_VR(vA); gen_load_ad_V1_VR(vB); gen_load_ad_V2_VR(vC); - jit_info[mnemo]->dyngen_handler(this); + (this->*(jit_info[mnemo]->o.dyngen_handler))(); + return true; +} + +bool powerpc_jit::gen_vector_generic_c(int mnemo, int vD, int vA, int vB, bool Rc) +{ + gen_vector_generic_2(mnemo, vD, vA, vB); if (Rc) gen_record_cr6_VD(); return true; } -bool powerpc_jit::gen_vector_mmx_2(int mnemo, bool Rc, int vD, int vA, int vB) +bool powerpc_jit::gen_vector_generic_load(int mnemo, int vD, int rA, int rB) { + // NOTE: T0/VD are clobbered in the following instructions! + gen_load_T0_GPR(rB); + if (rA != 0) { + gen_load_T1_GPR(rA); + gen_add_32_T0_T1(); + } + gen_load_vect_VD_T0(vD); + return true; +} + +bool powerpc_jit::gen_vector_generic_store(int mnemo, int vS, int rA, int rB) +{ + // NOTE: T0/VS are clobbered in the following instructions! + gen_load_T0_GPR(rB); + if (rA != 0) { + gen_load_T1_GPR(rA); + gen_add_32_T0_T1(); + } + gen_store_vect_VS_T0(vS); + return true; +} + +bool powerpc_jit::gen_vector_generic_load_word(int mnemo, int vD, int rA, int rB) +{ + // NOTE: T0/VD are clobbered in the following instructions! + gen_load_T0_GPR(rB); + if (rA != 0) { + gen_load_T1_GPR(rA); + gen_add_32_T0_T1(); + } + gen_load_word_VD_T0(vD); + return true; +} + +bool powerpc_jit::gen_vector_generic_store_word(int mnemo, int vS, int rA, int rB) +{ + // NOTE: T0/VS are clobbered in the following instructions! + gen_load_T0_GPR(rB); + if (rA != 0) { + gen_load_T1_GPR(rA); + gen_add_32_T0_T1(); + } + gen_store_word_VS_T0(vS); + return true; +} + +#define xPPC_FIELD(M) ((uintptr)&((powerpc_cpu *)0)->M) +#define xPPC_GPR(N) xPPC_FIELD(gpr(N)) +#define xPPC_VR(N) xPPC_FIELD(vr(N)) +#define xPPC_CR xPPC_FIELD(cr()) +#define xPPC_VSCR xPPC_FIELD(vscr()) + #if defined(__i386__) || defined(__x86_64__) +/* + * X86 optimizations + */ + +// mtvscr +bool powerpc_jit::gen_x86_mtvscr(int mnemo, int vD) +{ + gen_mov_32(x86_memory_operand(xPPC_VR(vD) + 3*4, REG_CPU_ID), REG_T0_ID); + gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_VSCR, REG_CPU_ID)); + return true; +} + +// mfvscr +bool powerpc_jit::gen_x86_mfvscr(int mnemo, int vB) +{ + gen_xor_32(REG_T0_ID, REG_T0_ID); + gen_mov_32(x86_memory_operand(xPPC_VSCR, REG_CPU_ID), REG_T1_ID); +#if SIZEOF_VOID_P == 8 + gen_mov_64(REG_T0_ID, x86_memory_operand(xPPC_VR(vB) + 0*4, REG_CPU_ID)); +#else + gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_VR(vB) + 0*4, REG_CPU_ID)); + gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_VR(vB) + 1*4, REG_CPU_ID)); +#endif + gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_VR(vB) + 2*4, REG_CPU_ID)); + gen_mov_32(REG_T1_ID, x86_memory_operand(xPPC_VR(vB) + 3*4, REG_CPU_ID)); + return true; +} + +// lvx, lvxl +bool powerpc_jit::gen_x86_lvx(int mnemo, int vD, int rA, int rB) +{ + gen_mov_32(x86_memory_operand(xPPC_GPR(rB), REG_CPU_ID), REG_T0_ID); + if (rA != 0) + gen_add_32(x86_memory_operand(xPPC_GPR(rA), REG_CPU_ID), REG_T0_ID); + gen_and_32(x86_immediate_operand(-16), REG_T0_ID); +#if SIZEOF_VOID_P == 8 + gen_mov_64(x86_memory_operand(0, REG_T0_ID), REG_T1_ID); + gen_mov_64(x86_memory_operand(8, REG_T0_ID), REG_T2_ID); + gen_bswap_64(REG_T1_ID); + gen_bswap_64(REG_T2_ID); + gen_rol_64(x86_immediate_operand(32), REG_T1_ID); + gen_rol_64(x86_immediate_operand(32), REG_T2_ID); + gen_mov_64(REG_T1_ID, x86_memory_operand(xPPC_VR(vD) + 0, REG_CPU_ID)); + 
gen_mov_64(REG_T2_ID, x86_memory_operand(xPPC_VR(vD) + 8, REG_CPU_ID)); +#else + gen_mov_32(x86_memory_operand(0*4, REG_T0_ID), REG_T1_ID); + gen_mov_32(x86_memory_operand(1*4, REG_T0_ID), REG_T2_ID); + gen_bswap_32(REG_T1_ID); + gen_bswap_32(REG_T2_ID); + gen_mov_32(REG_T1_ID, x86_memory_operand(xPPC_VR(vD) + 0*4, REG_CPU_ID)); + gen_mov_32(REG_T2_ID, x86_memory_operand(xPPC_VR(vD) + 1*4, REG_CPU_ID)); + gen_mov_32(x86_memory_operand(2*4, REG_T0_ID), REG_T1_ID); + gen_mov_32(x86_memory_operand(3*4, REG_T0_ID), REG_T2_ID); + gen_bswap_32(REG_T1_ID); + gen_bswap_32(REG_T2_ID); + gen_mov_32(REG_T1_ID, x86_memory_operand(xPPC_VR(vD) + 2*4, REG_CPU_ID)); + gen_mov_32(REG_T2_ID, x86_memory_operand(xPPC_VR(vD) + 3*4, REG_CPU_ID)); +#endif + return true; +} + +// stvx, stvxl +bool powerpc_jit::gen_x86_stvx(int mnemo, int vS, int rA, int rB) +{ + // NOTE: primitive scheduling + gen_mov_32(x86_memory_operand(xPPC_GPR(rB), REG_CPU_ID), REG_T0_ID); +#if SIZEOF_VOID_P == 8 + gen_mov_64(x86_memory_operand(xPPC_VR(vS) + 0, REG_CPU_ID), REG_T1_ID); + gen_mov_64(x86_memory_operand(xPPC_VR(vS) + 8, REG_CPU_ID), REG_T2_ID); + if (rA != 0) + gen_add_32(x86_memory_operand(xPPC_GPR(rA), REG_CPU_ID), REG_T0_ID); + gen_bswap_64(REG_T1_ID); + gen_bswap_64(REG_T2_ID); + gen_and_32(x86_immediate_operand(-16), REG_T0_ID); + gen_rol_64(x86_immediate_operand(32), REG_T1_ID); + gen_rol_64(x86_immediate_operand(32), REG_T2_ID); + gen_mov_64(REG_T1_ID, x86_memory_operand(0, REG_T0_ID)); + gen_mov_64(REG_T2_ID, x86_memory_operand(8, REG_T0_ID)); +#else + gen_mov_32(x86_memory_operand(xPPC_VR(vS) + 0*4, REG_CPU_ID), REG_T1_ID); + gen_mov_32(x86_memory_operand(xPPC_VR(vS) + 1*4, REG_CPU_ID), REG_T2_ID); + if (rA != 0) + gen_add_32(x86_memory_operand(xPPC_GPR(rA), REG_CPU_ID), REG_T0_ID); + gen_bswap_32(REG_T1_ID); + gen_bswap_32(REG_T2_ID); + gen_and_32(x86_immediate_operand(-16), REG_T0_ID); + gen_mov_32(REG_T1_ID, x86_memory_operand(0*4, REG_T0_ID)); + gen_mov_32(REG_T2_ID, x86_memory_operand(1*4, REG_T0_ID)); + gen_mov_32(x86_memory_operand(xPPC_VR(vS) + 2*4, REG_CPU_ID), REG_T1_ID); + gen_mov_32(x86_memory_operand(xPPC_VR(vS) + 3*4, REG_CPU_ID), REG_T2_ID); + gen_bswap_32(REG_T1_ID); + gen_bswap_32(REG_T2_ID); + gen_mov_32(REG_T1_ID, x86_memory_operand(2*4, REG_T0_ID)); + gen_mov_32(REG_T2_ID, x86_memory_operand(3*4, REG_T0_ID)); +#endif + return true; +} + +/* + * MMX optimizations + */ + +// Generic MMX arith +bool powerpc_jit::gen_mmx_arith_2(int mnemo, int vD, int vA, int vB) +{ gen_load_ad_VD_VR(vD); gen_load_ad_V0_VR(vA); gen_load_ad_V1_VR(vB); - jit_info[mnemo]->dyngen_handler(this); - if (Rc) - gen_record_cr6_VD(); + (this->*(jit_info[mnemo]->o.dyngen_handler))(); gen_op_emms(); return true; -#endif - return false; } -bool powerpc_jit::gen_sse2_vsldoi(int mnemo, bool Rc, int vD, int vA, int vB, int SH) +// MMX comparison +bool powerpc_jit::gen_mmx_arith_c(int mnemo, int vD, int vA, int vB, bool Rc) { -#if defined(__i386__) || defined(__x86_64__) - gen_load_ad_VD_VR(vD); - gen_load_ad_V0_VR(vA); - if (SH == 0) - gen_op_sse_mov_VD_V0(); - else { - gen_load_ad_V1_VR(vB); - powerpc_dyngen::gen_sse2_vsldoi_VD_V0_V1(SH); + gen_mmx_arith_2(mnemo, vD, vA, vB); + if (Rc) + gen_record_cr6_VD(); + return true; +} + +/* + * SSE optimizations + */ + +// Record CR6 (vD contains the result of the CMP instruction) +void powerpc_jit::gen_sse_record_cr6(int vD) +{ + gen_xor_32(REG_T0_ID, REG_T0_ID); // xor %t0,%t0 + gen_xor_32(REG_T1_ID, REG_T1_ID); // xor %t1,%t1 + gen_insn(X86_INSN_SSE_PS, X86_SSE_MOVMSK, vD, REG_T2_ID); // 
movmskps %v0,%t2 + gen_cmp_32(x86_immediate_operand(0), REG_T2_ID); // cmp $0,%t2 + gen_setcc(X86_CC_Z, REG_T0_ID); // sete %t0 + gen_cmp_32(x86_immediate_operand(0xf), REG_T2_ID); // cmp $0xf,%t1 + gen_setcc(X86_CC_E, REG_T1_ID); // sete %t1 + gen_lea_32(x86_memory_operand(0, REG_T0_ID, REG_T1_ID, 4), REG_T2_ID); // %t2 = %t0 + %t1*4 + gen_mov_32(x86_memory_operand(xPPC_CR, REG_CPU_ID), REG_T0_ID); // mov $xPPC_CR(%cpu),%t0 + gen_shl_32(x86_immediate_operand(5), REG_T2_ID); // %t2 holds new cr6 + gen_and_32(x86_immediate_operand(0xffffff0f), REG_T0_ID); // and $0xffffff0f,%t0 + gen_or_32(REG_T2_ID, REG_T0_ID); // or %t2,%t0 + gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_CR, REG_CPU_ID)); // mov %t0,$xPPC_CR(%cpu) +} + +// Generic SSE arith +bool powerpc_jit::gen_sse_arith_2(int mnemo, int vD, int vA, int vB) +{ + gen_movaps(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID); + const uint16 insn = jit_info[mnemo]->o.value; + gen_insn(insn >> 8, insn & 0xff, x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID); + gen_movaps(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID)); + return true; +} + +// Generic SSE arith with swapped operands (ANDPS) +bool powerpc_jit::gen_sse_arith_s(int mnemo, int vD, int vA, int vB) +{ + return gen_sse_arith_2(mnemo, vD, vB, vA); +} + +// SSE comparison (CMPPS) +bool powerpc_jit::gen_sse_arith_c(int mnemo, int vD, int vA, int vB, bool Rc) +{ + // NOTE: this uses swapped operands for GT, GE (no change for EQ) + gen_movaps(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID); + gen_cmpps(jit_info[mnemo]->o.value, x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID); + gen_movaps(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID)); + if (Rc) + gen_sse_record_cr6(REG_V0_ID); + return true; +} + +// vmaddfp +bool powerpc_jit::gen_sse_vmaddfp(int mnemo, int vD, int vA, int vB, int vC) +{ + gen_movaps(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID); + gen_mulps(x86_memory_operand(xPPC_VR(vC), REG_CPU_ID), REG_V0_ID); + gen_addps(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID); + gen_movaps(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID)); + return true; +} + +// vnmsubfp +bool powerpc_jit::gen_sse_vnmsubfp(int mnemo, int vD, int vA, int vB, int vC) +{ + gen_movaps(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID); + gen_xorps(REG_V1_ID, REG_V1_ID); + gen_mulps(x86_memory_operand(xPPC_VR(vC), REG_CPU_ID), REG_V0_ID); + gen_subps(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID); + gen_subps(REG_V0_ID, REG_V1_ID); + gen_movaps(REG_V1_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID)); + return true; +} + +/* + * SSE2 optimizations + */ + +// Record CR6 (vD contains the result of the CMP instruction) +void powerpc_jit::gen_sse2_record_cr6(int vD) +{ + gen_xor_32(REG_T0_ID, REG_T0_ID); // xor %t0,%t0 + gen_xor_32(REG_T1_ID, REG_T1_ID); // xor %t1,%t1 + gen_pmovmskb(vD, REG_T2_ID); // pmovmskb %v0,%t2 + gen_cmp_32(x86_immediate_operand(0), REG_T2_ID); // cmp $0,%t2 + gen_setcc(X86_CC_Z, REG_T0_ID); // sete %t0 + gen_cmp_32(x86_immediate_operand(0xffff), REG_T2_ID); // cmp $0xffff,%t1 + gen_setcc(X86_CC_E, REG_T1_ID); // sete %t1 + gen_lea_32(x86_memory_operand(0, REG_T0_ID, REG_T1_ID, 4), REG_T2_ID); // %t2 = %t0 + %t1*4 + gen_mov_32(x86_memory_operand(xPPC_CR, REG_CPU_ID), REG_T0_ID); // mov $xPPC_CR(%cpu),%t0 + gen_shl_32(x86_immediate_operand(5), REG_T2_ID); // %t2 holds new cr6 + gen_and_32(x86_immediate_operand(0xffffff0f), REG_T0_ID); // and $0xffffff0f,%t0 + gen_or_32(REG_T2_ID, REG_T0_ID); // or %t2,%t0 + 
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_CR, REG_CPU_ID)); // mov %t0,$xPPC_CR(%cpu) +} + +// Generic SSE2 arith +bool powerpc_jit::gen_sse2_arith_2(int mnemo, int vD, int vA, int vB) +{ + gen_movdqa(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID); + const uint16 insn = jit_info[mnemo]->o.value; + gen_insn(insn >> 8, insn & 0xff, x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID); + gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID)); + return true; +} + +// Generic SSE2 arith with swapped operands (PANDN) +bool powerpc_jit::gen_sse2_arith_s(int mnemo, int vD, int vA, int vB) +{ + return gen_sse2_arith_2(mnemo, vD, vB, vA); +} + +// SSE2 comparison (PCMPEQ, PCMPGT) +bool powerpc_jit::gen_sse2_arith_c(int mnemo, int vD, int vA, int vB, bool Rc) +{ + gen_sse2_arith_2(mnemo, vD, vA, vB); + if (Rc) + gen_sse2_record_cr6(REG_V0_ID); + return true; +} + +// vsldoi +bool powerpc_jit::gen_sse2_vsldoi(int mnemo, int vD, int vA, int vB, int SH) +{ + // Optimize out vsldoi vX,vX,vB,0 + if (SH == 0 && vA == vD) + return true; + + gen_movdqa(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID); + if (SH) { + gen_movdqa(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V1_ID); + gen_pshufd(x86_immediate_operand(0x1b), REG_V0_ID, REG_V0_ID); + gen_pshufd(x86_immediate_operand(0x1b), REG_V1_ID, REG_V1_ID); + gen_pslldq(x86_immediate_operand(SH), REG_V0_ID); + gen_psrldq(x86_immediate_operand(16 - SH), REG_V1_ID); + gen_por(REG_V1_ID, REG_V0_ID); + gen_pshufd(x86_immediate_operand(0x1b), REG_V0_ID, REG_V0_ID); + } + gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID)); + return true; +} + +/* + * Vector splat instructions + * + * Reference: "Optimizing subroutines in assembly language", Agner, table 13.6 + */ + +void powerpc_jit::gen_sse2_vsplat(int vD, int rValue) +{ + gen_movd_lx(rValue, REG_V0_ID); + gen_pshufd(x86_immediate_operand(0), REG_V0_ID, REG_V0_ID); + gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID)); +} + +// vspltisb +bool powerpc_jit::gen_sse2_vspltisb(int mnemo, int vD, int SIMM) +{ + switch (SIMM) { + case 0: + gen_pxor(REG_V0_ID, REG_V0_ID); + goto commit; + case 1: + gen_pcmpeqw(REG_V0_ID, REG_V0_ID); + gen_psrlw(x86_immediate_operand(15), REG_V0_ID); + gen_packuswb(REG_V0_ID, REG_V0_ID); + goto commit; + case 2: + gen_pcmpeqw(REG_V0_ID, REG_V0_ID); + gen_psrlw(x86_immediate_operand(15), REG_V0_ID); + gen_psllw(x86_immediate_operand(1), REG_V0_ID); + gen_packuswb(REG_V0_ID, REG_V0_ID); + goto commit; + case 3: + gen_pcmpeqw(REG_V0_ID, REG_V0_ID); + gen_psrlw(x86_immediate_operand(14), REG_V0_ID); + gen_packuswb(REG_V0_ID, REG_V0_ID); + goto commit; + case 4: + gen_pcmpeqw(REG_V0_ID, REG_V0_ID); + gen_psrlw(x86_immediate_operand(15), REG_V0_ID); + gen_psllw(x86_immediate_operand(2), REG_V0_ID); + gen_packuswb(REG_V0_ID, REG_V0_ID); + goto commit; + case -1: + gen_pcmpeqw(REG_V0_ID, REG_V0_ID); + goto commit; + case -2: + gen_pcmpeqw(REG_V0_ID, REG_V0_ID); + gen_psllw(x86_immediate_operand(1), REG_V0_ID); + gen_packsswb(REG_V0_ID, REG_V0_ID); + goto commit; + { + commit: + gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID)); + break; + } + default: + const uint32 value = ((uint8)SIMM) * 0x01010101; + gen_mov_32(x86_immediate_operand(value), REG_T0_ID); + gen_sse2_vsplat(vD, REG_T0_ID); + break; } return true; -#endif - return false; } + +// vspltish +bool powerpc_jit::gen_sse2_vspltish(int mnemo, int vD, int SIMM) +{ + switch (SIMM) { + case 0: + gen_pxor(REG_V0_ID, REG_V0_ID); + goto commit; + case 1: + 
gen_pcmpeqw(REG_V0_ID, REG_V0_ID); + gen_psrlw(x86_immediate_operand(15), REG_V0_ID); + goto commit; + case 2: + gen_pcmpeqw(REG_V0_ID, REG_V0_ID); + gen_psrlw(x86_immediate_operand(15), REG_V0_ID); + gen_psllw(x86_immediate_operand(1), REG_V0_ID); + goto commit; + case 3: + gen_pcmpeqw(REG_V0_ID, REG_V0_ID); + gen_psrlw(x86_immediate_operand(14), REG_V0_ID); + goto commit; + case 4: + gen_pcmpeqw(REG_V0_ID, REG_V0_ID); + gen_psrlw(x86_immediate_operand(15), REG_V0_ID); + gen_psllw(x86_immediate_operand(2), REG_V0_ID); + goto commit; + case -1: + gen_pcmpeqw(REG_V0_ID, REG_V0_ID); + goto commit; + case -2: + gen_pcmpeqw(REG_V0_ID, REG_V0_ID); + gen_psllw(x86_immediate_operand(1), REG_V0_ID); + goto commit; + { + commit: + gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID)); + break; + } + default: + const uint32 value = ((uint16)SIMM) * 0x10001; + gen_mov_32(x86_immediate_operand(value), REG_T0_ID); + gen_sse2_vsplat(vD, REG_T0_ID); + break; + } + return true; +} + +// vspltisw +bool powerpc_jit::gen_sse2_vspltisw(int mnemo, int vD, int SIMM) +{ + switch (SIMM) { + case 0: + gen_pxor(REG_V0_ID, REG_V0_ID); + goto commit; + case 1: + gen_pcmpeqd(REG_V0_ID, REG_V0_ID); + gen_psrld(x86_immediate_operand(31), REG_V0_ID); + goto commit; + case 2: + gen_pcmpeqd(REG_V0_ID, REG_V0_ID); + gen_psrld(x86_immediate_operand(31), REG_V0_ID); + gen_pslld(x86_immediate_operand(1), REG_V0_ID); + goto commit; + case 3: + gen_pcmpeqd(REG_V0_ID, REG_V0_ID); + gen_psrld(x86_immediate_operand(30), REG_V0_ID); + goto commit; + case 4: + gen_pcmpeqd(REG_V0_ID, REG_V0_ID); + gen_psrld(x86_immediate_operand(31), REG_V0_ID); + gen_pslld(x86_immediate_operand(2), REG_V0_ID); + goto commit; + case -1: + gen_pcmpeqd(REG_V0_ID, REG_V0_ID); + goto commit; + case -2: + gen_pcmpeqd(REG_V0_ID, REG_V0_ID); + gen_pslld(x86_immediate_operand(1), REG_V0_ID); + goto commit; + { + commit: + gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID)); + break; + } + default: + const uint32 value = SIMM; + gen_mov_32(x86_immediate_operand(value), REG_T0_ID); + gen_sse2_vsplat(vD, REG_T0_ID); + } + return true; +} + +// vspltb +bool powerpc_jit::gen_sse2_vspltb(int mnemo, int vD, int UIMM, int vB) +{ + const int N = ev_mixed::byte_element(UIMM & 15); + gen_mov_zx_8_32(x86_memory_operand(xPPC_VR(vB) + N * 1, REG_CPU_ID), REG_T0_ID); + gen_imul_32(x86_immediate_operand(0x01010101), REG_T0_ID, REG_T0_ID); + gen_sse2_vsplat(vD, REG_T0_ID); + return true; +} + +// vsplth +bool powerpc_jit::gen_sse2_vsplth(int mnemo, int vD, int UIMM, int vB) +{ + const int N = ev_mixed::half_element(UIMM & 7); + gen_mov_zx_16_32(x86_memory_operand(xPPC_VR(vB) + N * 2, REG_CPU_ID), REG_T0_ID); + gen_imul_32(x86_immediate_operand(0x10001), REG_T0_ID, REG_T0_ID); + gen_sse2_vsplat(vD, REG_T0_ID); + return true; +} + +// vspltw +bool powerpc_jit::gen_sse2_vspltw(int mnemo, int vD, int UIMM, int vB) +{ + const int N = UIMM & 3; + gen_mov_32(x86_memory_operand(xPPC_VR(vB) + N * 4, REG_CPU_ID), REG_T0_ID); + gen_sse2_vsplat(vD, REG_T0_ID); + return true; +} +#endif diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-jit.hpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-jit.hpp index b779b2dd..26df5d70 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-jit.hpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-jit.hpp @@ -30,25 +30,68 @@ struct powerpc_jit // Default constructor powerpc_jit(dyngen_cpu_base cpu, int cache_size = -1); - bool gen_vector_2(int mnemo, int vD, int vA, int vB, bool Rc = false); - bool gen_vector_3(int mnemo, int vD, int 
vA, int vB, int vC, bool Rc = false); + bool gen_vector_1(int mnemo, int vD); + bool gen_vector_2(int mnemo, int vD, int vA, int vB); + bool gen_vector_3(int mnemo, int vD, int vA, int vB, int vC); + bool gen_vector_compare(int mnemo, int vD, int vA, int vB, bool Rc); private: // Mid-level code generator info typedef bool (powerpc_jit::*gen_handler_t)(int, bool); + typedef void (powerpc_dyngen::*dyngen_handler_t)(void); + union jit_option_t { + jit_option_t() { } + uintptr value; + jit_option_t(uintptr v) : value(v) { } + dyngen_handler_t dyngen_handler; + jit_option_t(dyngen_handler_t const & h) : dyngen_handler(h) { } + }; struct jit_info_t { int mnemo; gen_handler_t handler; - powerpc_dyngen::gen_handler_t dyngen_handler; + jit_option_t o; }; static const jit_info_t *jit_info[]; private: - bool gen_not_available(int mnemo, bool Rc); - bool gen_vector_generic_2(int mnemo, bool Rc, int vD, int vA, int vB); - bool gen_vector_generic_3(int mnemo, bool Rc, int vD, int vA, int vB, int vC); - bool gen_vector_mmx_2(int mnemo, bool Rc, int vD, int vA, int vB); - bool gen_sse2_vsldoi(int mnemo, bool Rc, int vD, int vA, int vB, int SH); + bool gen_not_available(int mnemo); + bool gen_vector_generic_1(int mnemo, int vD); + bool gen_vector_generic_2(int mnemo, int vD, int vA, int vB); + bool gen_vector_generic_3(int mnemo, int vD, int vA, int vB, int vC); + bool gen_vector_generic_c(int mnemo, int vD, int vA, int vB, bool Rc); + bool gen_vector_generic_load(int mnemo, int vD, int rA, int rB); + bool gen_vector_generic_store(int mnemo, int vS, int rA, int rB); + bool gen_vector_generic_load_word(int mnemo, int vD, int rA, int rB); + bool gen_vector_generic_store_word(int mnemo, int vS, int rA, int rB); + +#if defined(__i386__) || defined(__x86_64__) + bool gen_x86_lvx(int mnemo, int vD, int rA, int rB); + bool gen_x86_lvewx(int mnemo, int vD, int rA, int rB); + bool gen_x86_stvx(int mnemo, int vS, int rA, int rB); + bool gen_x86_stvewx(int mnemo, int vS, int rA, int rB); + bool gen_x86_mtvscr(int mnemo, int vD); + bool gen_x86_mfvscr(int mnemo, int vB); + bool gen_mmx_arith_2(int mnemo, int vD, int vA, int vB); + bool gen_mmx_arith_c(int mnemo, int vD, int vA, int vB, bool Rc); + void gen_sse_record_cr6(int vD); + bool gen_sse_arith_2(int mnemo, int vD, int vA, int vB); + bool gen_sse_arith_s(int mnemo, int vD, int vA, int vB); + bool gen_sse_arith_c(int mnemo, int vD, int vA, int vB, bool Rc); + bool gen_sse_vmaddfp(int mnemo, int vD, int vA, int vB, int vC); + bool gen_sse_vnmsubfp(int mnemo, int vD, int vA, int vB, int vC); + void gen_sse2_record_cr6(int vD); + bool gen_sse2_arith_2(int mnemo, int vD, int vA, int vB); + bool gen_sse2_arith_s(int mnemo, int vD, int vA, int vB); + bool gen_sse2_arith_c(int mnemo, int vD, int vA, int vB, bool Rc); + bool gen_sse2_vsldoi(int mnemo, int vD, int vA, int vB, int SH); + void gen_sse2_vsplat(int vD, int rValue); + bool gen_sse2_vspltisb(int mnemo, int vD, int SIMM); + bool gen_sse2_vspltish(int mnemo, int vD, int SIMM); + bool gen_sse2_vspltisw(int mnemo, int vD, int SIMM); + bool gen_sse2_vspltb(int mnemo, int vD, int UIMM, int vB); + bool gen_sse2_vsplth(int mnemo, int vD, int UIMM, int vB); + bool gen_sse2_vspltw(int mnemo, int vD, int UIMM, int vB); +#endif }; #endif /* PPC_JIT_H */ diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp index 7826a904..32abf76e 100644 --- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp +++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp 
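Note: the gen_sse2_vsplat/vspltis*/vsplt* generators declared above implement the splat recipes from table 13.6 of Agner Fog's "Optimizing subroutines in assembly language": small signed immediates are synthesized in-register with pcmpeq plus shifts instead of a memory load, and arbitrary element values are broadcast with movd + pshufd after the element has been widened to 32 bits (an imul by 0x01010101 for bytes). A rough intrinsics sketch of the two patterns, with hypothetical helper names and covering only a subset of the SIMM special cases handled by the patch:

    #include <emmintrin.h>
    #include <cstdint>

    // vspltisw-style constants without touching memory.
    static inline __m128i splat_word(int32_t simm)
    {
        __m128i ones = _mm_setzero_si128();
        ones = _mm_cmpeq_epi32(ones, ones);      // pcmpeqd v,v: all bits set (splat of -1)
        if (simm == -1)
            return ones;
        if (simm == 1)
            return _mm_srli_epi32(ones, 31);     // psrld $31: 0x00000001 in each word
        __m128i v = _mm_cvtsi32_si128(simm);     // movd: value into the low word
        return _mm_shuffle_epi32(v, 0);          // pshufd $0: broadcast to all four words
    }

    // vspltb-style broadcast of one byte value.
    static inline __m128i splat_byte(uint8_t b)
    {
        uint32_t w = b * 0x01010101u;            // imul widens the byte to a 32-bit pattern
        return _mm_shuffle_epi32(_mm_cvtsi32_si128((int32_t)w), 0);
    }
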
@@ -1346,67 +1346,16 @@ powerpc_cpu::compile_block(uint32 entry_point) break; } #endif - // NOTE: A0/VD are clobbered in the following instructions! case PPC_I(LVEWX): case PPC_I(LVX): case PPC_I(LVXL): - { - const int rA = rA_field::extract(opcode); - const int rB = rB_field::extract(opcode); - const int vD = vD_field::extract(opcode); - dg.gen_load_T0_GPR(rB); - if (rA != 0) { - dg.gen_load_T1_GPR(rA); - dg.gen_add_32_T0_T1(); - } - switch (ii->mnemo) { - case PPC_I(LVEWX): dg.gen_load_word_VD_T0(vD); break; - case PPC_I(LVX): dg.gen_load_vect_VD_T0(vD); break; - case PPC_I(LVXL): dg.gen_load_vect_VD_T0(vD); break; - } - break; - } case PPC_I(STVEWX): case PPC_I(STVX): case PPC_I(STVXL): - { - const int rA = rA_field::extract(opcode); - const int rB = rB_field::extract(opcode); - const int vS = vS_field::extract(opcode); - dg.gen_load_T0_GPR(rB); - if (rA != 0) { - dg.gen_load_T1_GPR(rA); - dg.gen_add_32_T0_T1(); - } - switch (ii->mnemo) { - case PPC_I(STVEWX): dg.gen_store_word_VS_T0(vS); break; - case PPC_I(STVX): dg.gen_store_vect_VS_T0(vS); break; - case PPC_I(STVXL): dg.gen_store_vect_VS_T0(vS); break; - } - break; - } - case PPC_I(MFVSCR): - { - dg.gen_load_ad_VD_VR(vD_field::extract(opcode)); - dg.gen_mfvscr_VD(); - break; - } - case PPC_I(MTVSCR): - { - dg.gen_load_ad_V0_VR(vB_field::extract(opcode)); - dg.gen_mtvscr_V0(); - break; - } - case PPC_I(VSLDOI): - { - const int vD = vD_field::extract(opcode); - const int vA = vA_field::extract(opcode); - const int vB = vB_field::extract(opcode); - const int SH = vSH_field::extract(opcode); - if (!dg.gen_vector_3(ii->mnemo, vD, vA, vB, SH)) - goto do_generic; - break; - } + assert(vD_field::mask() == vS_field::mask()); + assert(vA_field::mask() == rA_field::mask()); + assert(vB_field::mask() == rB_field::mask()); + // fall-through case PPC_I(VCMPEQFP): case PPC_I(VCMPEQUB): case PPC_I(VCMPEQUH): @@ -1420,7 +1369,7 @@ powerpc_cpu::compile_block(uint32 entry_point) const int vD = vD_field::extract(opcode); const int vA = vA_field::extract(opcode); const int vB = vB_field::extract(opcode); - if (!dg.gen_vector_2(ii->mnemo, vD, vA, vB, vRc_field::test(opcode))) + if (!dg.gen_vector_compare(ii->mnemo, vD, vA, vB, vRc_field::test(opcode))) goto do_generic; break; } @@ -1443,6 +1392,8 @@ powerpc_cpu::compile_block(uint32 entry_point) case PPC_I(VSUBUHM): case PPC_I(VSUBUWM): case PPC_I(VXOR): + case PPC_I(VREFP): + case PPC_I(VRSQRTEFP): { const int vD = vD_field::extract(opcode); const int vA = vA_field::extract(opcode); @@ -1462,6 +1413,49 @@ powerpc_cpu::compile_block(uint32 entry_point) goto do_generic; break; } + case PPC_I(VSLDOI): + { + const int vD = vD_field::extract(opcode); + const int vA = vA_field::extract(opcode); + const int vB = vB_field::extract(opcode); + const int SH = vSH_field::extract(opcode); + if (!dg.gen_vector_3(ii->mnemo, vD, vA, vB, SH)) + goto do_generic; + break; + } + case PPC_I(MFVSCR): + { + if (!dg.gen_vector_1(ii->mnemo, vD_field::extract(opcode))) + goto do_generic; + break; + } + case PPC_I(MTVSCR): + { + if (!dg.gen_vector_1(ii->mnemo, vB_field::extract(opcode))) + goto do_generic; + break; + } + case PPC_I(VSPLTISB): + case PPC_I(VSPLTISH): + case PPC_I(VSPLTISW): + { + const int vD = vD_field::extract(opcode); + const int SIMM = op_sign_extend_5_32::apply(vUIMM_field::extract(opcode)); + if (!dg.gen_vector_2(ii->mnemo, vD, SIMM, 0)) + goto do_generic; + break; + } + case PPC_I(VSPLTB): + case PPC_I(VSPLTH): + case PPC_I(VSPLTW): + { + const int vD = vD_field::extract(opcode); + const int UIMM = 
vUIMM_field::extract(opcode); + const int vB = vB_field::extract(opcode); + if (!dg.gen_vector_2(ii->mnemo, vD, UIMM, vB)) + goto do_generic; + break; + } default: // Direct call to instruction handler { typedef void (*func_t)(dyngen_cpu_base, uint32);
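
Note: for the vcmp*. forms routed through gen_vector_compare above, the Rc=1 update of CR6 is done host-side from the SSE compare result: movmskps (or pmovmskb for the integer compares) collapses the per-element mask into a scalar, and two sete results are combined into the 4-bit cr6 field. A scalar sketch of gen_sse_record_cr6/gen_sse2_record_cr6 — the helper name and signature are hypothetical, not part of the patch:

    #include <cstdint>

    // 'mask' is the movmskps result (float compares, all_set = 0xf) or the
    // pmovmskb result (integer compares, all_set = 0xffff) taken from the
    // vector produced by the compare instruction.
    static inline uint32_t record_cr6(uint32_t cr, uint32_t mask, uint32_t all_set)
    {
        uint32_t none = (mask == 0);            // sete after cmp $0,%t2
        uint32_t all  = (mask == all_set);      // sete after cmp $0xf / $0xffff,%t2
        uint32_t cr6  = (none + all * 4) << 5;  // lea + shl: all-true -> CR bit 7, none-true -> CR bit 5
        return (cr & 0xffffff0f) | cr6;         // splice the new cr6 field into CR bits 7..4
    }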