Use the new code generator. The gain is only 10%; the bottlenecks are elsewhere.

Optimize AltiVec vector splat instructions following Agner Fog's guide.
gbeauche 2006-07-17 06:56:38 +00:00
parent ceb43ce19a
commit e07e2196e3
10 changed files with 721 additions and 307 deletions


@@ -29,7 +29,7 @@ int __op_jmp0, __op_jmp1;
#include "basic-dyngen-ops.hpp"
basic_dyngen::basic_dyngen(dyngen_cpu_base cpu, int cache_size)
: parent_cpu(cpu), basic_jit_cache(cache_size)
: parent_cpu(cpu), jit_codegen(cache_size)
{
execute_func = gen_start();
gen_op_execute();


@@ -22,7 +22,7 @@
#define BASIC_DYNGEN_H
#include "cpu/jit/jit-config.hpp"
#include "cpu/jit/jit-cache.hpp"
#include "cpu/jit/jit-codegen.hpp"
// Set jump target address
static inline void dg_set_jmp_target(uint8 *jmp_addr, uint8 *addr)
@@ -57,7 +57,7 @@ typedef basic_cpu *dyngen_cpu_base;
#endif
class basic_dyngen
: public basic_jit_cache
: public jit_codegen
{
uint8 *execute_func;
uint8 *gen_code_start;


@@ -54,8 +54,9 @@ protected:
// Initialize user code start
void set_code_start(uint8 *ptr);
// Get & increase current position
// Increase/set/get current position
void inc_code_ptr(int offset) { code_p += offset; }
void set_code_ptr(uint8 *ptr) { code_p = ptr; }
public:
uint8 *code_ptr() const { return code_p; }


@@ -374,6 +374,7 @@ private:
// Dynamic translation engine
friend class powerpc_dyngen_helper;
friend class powerpc_dyngen;
friend class powerpc_jit;
powerpc_jit codegen;
block_info *compile_block(uint32 entry);
#if DYNGEN_DIRECT_BLOCK_CHAINING


@@ -1709,140 +1709,6 @@ void op_mtvscr_V0(void)
#define __sse_clobbers(reglist...)
#endif
// SSE2 instructions
#define DEFINE_OP(NAME, OP, VA, VB) \
void op_sse2_##NAME(void) \
{ \
asm volatile ("movdqa (%1),%%xmm0\n" \
#OP " (%2),%%xmm0\n" \
"movdqa %%xmm0,(%0)\n" \
: : "r" (reg_VD), "r" (reg_##VA), "r" (reg_##VB) \
: __sse_clobbers("xmm0")); \
}
DEFINE_OP(vcmpequb, pcmpeqb, V0, V1);
DEFINE_OP(vcmpequh, pcmpeqw, V0, V1);
DEFINE_OP(vcmpequw, pcmpeqd, V0, V1);
DEFINE_OP(vcmpgtsb, pcmpgtb, V0, V1);
DEFINE_OP(vcmpgtsh, pcmpgtw, V0, V1);
DEFINE_OP(vcmpgtsw, pcmpgtd, V0, V1);
DEFINE_OP(vaddubm, paddb, V0, V1);
DEFINE_OP(vadduhm, paddw, V0, V1);
DEFINE_OP(vadduwm, paddd, V0, V1);
DEFINE_OP(vsububm, psubb, V0, V1);
DEFINE_OP(vsubuhm, psubw, V0, V1);
DEFINE_OP(vsubuwm, psubd, V0, V1);
DEFINE_OP(vand, pand, V0, V1);
DEFINE_OP(vandc, pandn, V1, V0);
DEFINE_OP(vor, por, V0, V1);
DEFINE_OP(vxor, pxor, V0, V1);
DEFINE_OP(vavgub, pavgb, V0, V1);
DEFINE_OP(vavguh, pavgw, V0, V1);
#undef DEFINE_OP
#define DEFINE_OP(SH) \
void op_sse2_vsldoi_##SH(void) \
{ \
asm volatile ("movdqa (%1),%%xmm0\n" \
"movdqa (%2),%%xmm1\n" \
"pshufd %3,%%xmm0,%%xmm0\n" \
"pshufd %3,%%xmm1,%%xmm1\n" \
"pslldq %4,%%xmm0\n" \
"psrldq %5,%%xmm1\n" \
"por %%xmm1,%%xmm0\n" \
"pshufd %3,%%xmm0,%%xmm0\n" \
"movdqa %%xmm0,(%0)\n" \
: : \
"r" (reg_VD), "r" (reg_V0), "r" (reg_V1), \
"i" (0x1b), "i" (SH), "i" (16 - SH) \
: __sse_clobbers("xmm0", "xmm1")); \
}
DEFINE_OP(1);
DEFINE_OP(2);
DEFINE_OP(3);
DEFINE_OP(4);
DEFINE_OP(5);
DEFINE_OP(6);
DEFINE_OP(7);
DEFINE_OP(8);
DEFINE_OP(9);
DEFINE_OP(10);
DEFINE_OP(11);
DEFINE_OP(12);
DEFINE_OP(13);
DEFINE_OP(14);
DEFINE_OP(15);
#undef DEFINE_OP
// SSE instructions
#define DEFINE_OP(NAME, OP, VA, VB) \
void op_sse_##NAME(void) \
{ \
asm volatile ("movaps (%1),%%xmm0\n" \
#OP " (%2),%%xmm0\n" \
"movaps %%xmm0,(%0)\n" \
: : "r" (reg_VD), "r" (reg_##VA), "r" (reg_##VB) \
: __sse_clobbers("xmm0")); \
}
DEFINE_OP(vcmpeqfp, cmpeqps, V0, V1);
DEFINE_OP(vcmpgefp, cmpleps, V1, V0);
DEFINE_OP(vcmpgtfp, cmpltps, V1, V0);
DEFINE_OP(vaddfp, addps, V0, V1);
DEFINE_OP(vsubfp, subps, V0, V1);
DEFINE_OP(vmaxfp, maxps, V0, V1);
DEFINE_OP(vminfp, minps, V0, V1);
DEFINE_OP(vand, andps, V0, V1);
DEFINE_OP(vandc, andnps, V1, V0);
DEFINE_OP(vor, orps, V0, V1);
DEFINE_OP(vxor, xorps, V0, V1);
DEFINE_OP(vminub, pminub, V0, V1);
DEFINE_OP(vmaxub, pmaxub, V0, V1);
DEFINE_OP(vminsh, pminsw, V0, V1);
DEFINE_OP(vmaxsh, pmaxsw, V0, V1);
#undef DEFINE_OP
void op_sse_vmaddfp(void)
{
asm volatile ("movaps (%1),%%xmm0\n"
"mulps (%3),%%xmm0\n"
"addps (%2),%%xmm0\n"
"movaps %%xmm0,(%0)\n"
: : "r" (reg_VD), "r" (reg_V0), "r" (reg_V1), "r" (reg_V2)
: __sse_clobbers("xmm0"));
}
void op_sse_vnmsubfp(void)
{
asm volatile ("movaps (%1),%%xmm0\n"
"xorps %%xmm1,%%xmm1\n"
"mulps (%3),%%xmm0\n"
"subps (%2),%%xmm0\n"
"subps %%xmm0,%%xmm1\n"
"movaps %%xmm1,(%0)\n"
: : "r" (reg_VD), "r" (reg_V0), "r" (reg_V1), "r" (reg_V2)
: __sse_clobbers("xmm0", "xmm1"));
}
#define DEFINE_OP(VD, VS) \
void op_sse_mov_##VD##_##VS(void) \
{ \
asm volatile ("movaps (%1),%%xmm0\n" \
"movaps %%xmm0,(%0)\n" \
: : "r" (reg_##VD), "r" (reg_##VS) \
: __sse_clobbers("xmm0")); \
}
DEFINE_OP(VD, V0);
DEFINE_OP(VD, V1);
DEFINE_OP(VD, V2);
#undef DEFINE_OP
// MMX instructions
void op_emms(void)
{


@@ -307,29 +307,3 @@ void powerpc_dyngen::gen_store_vect_VS_T0(int vS)
gen_load_ad_VD_VR(vS);
gen_op_store_vect_VD_T0();
}
void powerpc_dyngen::gen_sse2_vsldoi_VD_V0_V1(int SH)
{
#if defined(__i386__) || defined(__x86_64__)
switch (SH) {
#define GEN_OP(SH) case SH: gen_op_sse2_vsldoi_##SH(); break
GEN_OP(1);
GEN_OP(2);
GEN_OP(3);
GEN_OP(4);
GEN_OP(5);
GEN_OP(6);
GEN_OP(7);
GEN_OP(8);
GEN_OP(9);
GEN_OP(10);
GEN_OP(11);
GEN_OP(12);
GEN_OP(13);
GEN_OP(14);
GEN_OP(15);
#undef GEN_OP
default: abort();
}
#endif
}


@@ -234,7 +234,6 @@ public:
DEFINE_ALIAS(record_cr6_VD,0);
DEFINE_ALIAS(mfvscr_VD,0);
DEFINE_ALIAS(mtvscr_V0,0);
void gen_sse2_vsldoi_VD_V0_V1(int SH);
#undef DEFINE_ALIAS
#undef DEFINE_ALIAS_0


@@ -19,8 +19,11 @@
*/
#include "sysdeps.h"
#include "cpu/jit/dyngen-exec.h"
#include "cpu/ppc/ppc-jit.hpp"
#include "cpu/ppc/ppc-cpu.hpp"
#include "cpu/ppc/ppc-instructions.hpp"
#include "cpu/ppc/ppc-operands.hpp"
#include "utils/utils-cpuinfo.hpp"
#include "utils/utils-sentinel.hpp"
@@ -40,7 +43,6 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size)
static const jit_info_t jit_not_available = {
-1,
(gen_handler_t)&powerpc_jit::gen_not_available,
0
};
for (int i = 0; i < PPC_I(MAX); i++)
jit_info[i] = &jit_not_available;
@@ -57,28 +59,54 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size)
DEFINE_OP(VANDC, 2, vandc_VD_V0_V1),
DEFINE_OP(VNOR, 2, vnor_VD_V0_V1),
DEFINE_OP(VOR, 2, vor_VD_V0_V1),
DEFINE_OP(VXOR, 2, vxor_VD_V0_V1)
DEFINE_OP(VXOR, 2, vxor_VD_V0_V1),
DEFINE_OP(MFVSCR, 1, mfvscr_VD),
DEFINE_OP(MTVSCR, 1, mtvscr_V0),
#undef DEFINE_OP
#define DEFINE_OP(MNEMO, GEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_generic_##GEN_OP, }
DEFINE_OP(LVX, load),
DEFINE_OP(LVXL, load),
DEFINE_OP(LVEWX, load_word),
DEFINE_OP(STVX, store),
DEFINE_OP(STVXL, store),
DEFINE_OP(STVEWX, store_word),
#undef DEFINE_OP
};
for (int i = 0; i < sizeof(gen_vector) / sizeof(gen_vector[0]); i++)
jit_info[gen_vector[i].mnemo] = &gen_vector[i];
#if defined(__i386__) || defined(__x86_64__)
// x86 optimized handlers
static const jit_info_t x86_vector[] = {
#define DEFINE_OP(MNEMO, GEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_x86_##GEN_OP, }
DEFINE_OP(MTVSCR, mtvscr),
DEFINE_OP(MFVSCR, mfvscr),
DEFINE_OP(LVX, lvx),
DEFINE_OP(LVXL, lvx),
DEFINE_OP(STVX, stvx),
DEFINE_OP(STVXL, stvx)
#undef DEFINE_OP
};
for (int i = 0; i < sizeof(x86_vector) / sizeof(x86_vector[0]); i++)
jit_info[x86_vector[i].mnemo] = &x86_vector[i];
// MMX optimized handlers
static const jit_info_t mmx_vector[] = {
#define DEFINE_OP(MNEMO, GEN_OP, DYNGEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_mmx_##GEN_OP, &powerpc_dyngen::gen_op_mmx_##DYNGEN_OP }
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_mmx_arith_##GEN_OP, &powerpc_dyngen::gen_op_mmx_##DYNGEN_OP }
DEFINE_OP(VADDUBM, 2, vaddubm),
DEFINE_OP(VADDUHM, 2, vadduhm),
DEFINE_OP(VADDUWM, 2, vadduwm),
DEFINE_OP(VAND, 2, vand),
DEFINE_OP(VANDC, 2, vandc),
DEFINE_OP(VCMPEQUB, 2, vcmpequb),
DEFINE_OP(VCMPEQUH, 2, vcmpequh),
DEFINE_OP(VCMPEQUW, 2, vcmpequw),
DEFINE_OP(VCMPGTSB, 2, vcmpgtsb),
DEFINE_OP(VCMPGTSH, 2, vcmpgtsh),
DEFINE_OP(VCMPGTSW, 2, vcmpgtsw),
DEFINE_OP(VCMPEQUB, c, vcmpequb),
DEFINE_OP(VCMPEQUH, c, vcmpequh),
DEFINE_OP(VCMPEQUW, c, vcmpequw),
DEFINE_OP(VCMPGTSB, c, vcmpgtsb),
DEFINE_OP(VCMPGTSH, c, vcmpgtsh),
DEFINE_OP(VCMPGTSW, c, vcmpgtsw),
DEFINE_OP(VOR, 2, vor),
DEFINE_OP(VSUBUBM, 2, vsububm),
DEFINE_OP(VSUBUHM, 2, vsubuhm),
@@ -95,32 +123,38 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size)
static const jit_info_t sse_vector[] = {
// new MMX instructions brought into SSE capable CPUs
#define DEFINE_OP(MNEMO, GEN_OP, DYNGEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_mmx_##GEN_OP, &powerpc_dyngen::gen_op_mmx_##DYNGEN_OP }
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_mmx_arith_##GEN_OP, &powerpc_dyngen::gen_op_mmx_##DYNGEN_OP }
DEFINE_OP(VMAXSH, 2, vmaxsh),
DEFINE_OP(VMAXUB, 2, vmaxub),
DEFINE_OP(VMINSH, 2, vminsh),
DEFINE_OP(VMINUB, 2, vminub),
#undef DEFINE_OP
// full SSE instructions
#define DEFINE_OP(MNEMO, GEN_OP, DYNGEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_generic_##GEN_OP, &powerpc_dyngen::gen_op_sse_##DYNGEN_OP }
DEFINE_OP(VADDFP, 2, vaddfp),
DEFINE_OP(VAND, 2, vand),
DEFINE_OP(VANDC, 2, vandc),
DEFINE_OP(VCMPEQFP, 2, vcmpeqfp),
DEFINE_OP(VCMPGEFP, 2, vcmpgefp),
DEFINE_OP(VCMPGTFP, 2, vcmpgtfp),
DEFINE_OP(VMADDFP, 3, vmaddfp),
DEFINE_OP(VMAXFP, 2, vmaxfp),
DEFINE_OP(VMINFP, 2, vminfp),
DEFINE_OP(VNMSUBFP, 3, vnmsubfp),
DEFINE_OP(VOR, 2, vor),
DEFINE_OP(VSUBFP, 2, vsubfp),
DEFINE_OP(VXOR, 2, vxor),
DEFINE_OP(VMINUB, 2, vminub),
DEFINE_OP(VMAXUB, 2, vmaxub),
DEFINE_OP(VMINSH, 2, vminsh),
DEFINE_OP(VMAXSH, 2, vmaxsh)
#define DEFINE_OP(MNEMO, GEN_OP, TYPE_OP, SSE_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse_arith_##GEN_OP, (X86_INSN_SSE_##TYPE_OP << 8) | X86_SSE_##SSE_OP }
DEFINE_OP(VADDFP, 2, PS,ADD),
DEFINE_OP(VAND, 2, PS,AND),
DEFINE_OP(VANDC, s, PS,ANDN),
DEFINE_OP(VMAXFP, 2, PS,MAX),
DEFINE_OP(VMINFP, 2, PS,MIN),
DEFINE_OP(VOR, 2, PS,OR),
DEFINE_OP(VSUBFP, 2, PS,SUB),
DEFINE_OP(VXOR, 2, PS,XOR),
DEFINE_OP(VMINUB, 2, PI,PMINUB),
DEFINE_OP(VMAXUB, 2, PI,PMAXUB),
DEFINE_OP(VMINSH, 2, PI,PMINSW),
DEFINE_OP(VMAXSH, 2, PI,PMAXSW),
#undef DEFINE_OP
#define DEFINE_OP(MNEMO, COND) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse_arith_c, X86_SSE_CC_##COND }
DEFINE_OP(VCMPEQFP, EQ),
DEFINE_OP(VCMPGEFP, GE),
DEFINE_OP(VCMPGTFP, GT),
#undef DEFINE_OP
#define DEFINE_OP(MNEMO, GEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse_##GEN_OP }
DEFINE_OP(VMADDFP, vmaddfp),
DEFINE_OP(VNMSUBFP, vnmsubfp)
#undef DEFINE_OP
};
@@ -129,29 +163,39 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size)
jit_info[sse_vector[i].mnemo] = &sse_vector[i];
}
// generic altivec handlers
// SSE2 optimized handlers
static const jit_info_t sse2_vector[] = {
#define DEFINE_OP(MNEMO, GEN_OP, DYNGEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_vector_generic_##GEN_OP, &powerpc_dyngen::gen_op_sse2_##DYNGEN_OP }
DEFINE_OP(VADDUBM, 2, vaddubm),
DEFINE_OP(VADDUHM, 2, vadduhm),
DEFINE_OP(VADDUWM, 2, vadduwm),
DEFINE_OP(VSUBUBM, 2, vsububm),
DEFINE_OP(VSUBUHM, 2, vsubuhm),
DEFINE_OP(VSUBUWM, 2, vsubuwm),
DEFINE_OP(VAND, 2, vand),
DEFINE_OP(VANDC, 2, vandc),
DEFINE_OP(VOR, 2, vor),
DEFINE_OP(VXOR, 2, vxor),
DEFINE_OP(VCMPEQUB, 2, vcmpequb),
DEFINE_OP(VCMPEQUH, 2, vcmpequh),
DEFINE_OP(VCMPEQUW, 2, vcmpequw),
DEFINE_OP(VCMPGTSB, 2, vcmpgtsb),
DEFINE_OP(VCMPGTSH, 2, vcmpgtsh),
DEFINE_OP(VCMPGTSW, 2, vcmpgtsw),
#define DEFINE_OP(MNEMO, GEN_OP, TYPE_OP, SSE_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse2_arith_##GEN_OP, (X86_INSN_SSE_##TYPE_OP << 8) | X86_SSE_##SSE_OP }
DEFINE_OP(VADDUBM, 2, PI,PADDB),
DEFINE_OP(VADDUHM, 2, PI,PADDW),
DEFINE_OP(VADDUWM, 2, PI,PADDD),
DEFINE_OP(VSUBUBM, 2, PI,PSUBB),
DEFINE_OP(VSUBUHM, 2, PI,PSUBW),
DEFINE_OP(VSUBUWM, 2, PI,PSUBD),
DEFINE_OP(VAND, 2, PI,PAND),
DEFINE_OP(VANDC, s, PI,PANDN),
DEFINE_OP(VOR, 2, PI,POR),
DEFINE_OP(VXOR, 2, PI,PXOR),
DEFINE_OP(VCMPEQUB, c, PI,PCMPEQB),
DEFINE_OP(VCMPEQUH, c, PI,PCMPEQW),
DEFINE_OP(VCMPEQUW, c, PI,PCMPEQD),
DEFINE_OP(VCMPGTSB, c, PI,PCMPGTB),
DEFINE_OP(VCMPGTSH, c, PI,PCMPGTW),
DEFINE_OP(VCMPGTSW, c, PI,PCMPGTD),
DEFINE_OP(VREFP, 2, PS,RCP),
DEFINE_OP(VRSQRTEFP,2, PS,RSQRT),
#undef DEFINE_OP
#define DEFINE_OP(MNEMO, GEN_OP) \
{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_sse2_##GEN_OP, }
DEFINE_OP(VSLDOI, vsldoi),
DEFINE_OP(VSPLTB, vspltb),
DEFINE_OP(VSPLTH, vsplth),
DEFINE_OP(VSPLTW, vspltw),
DEFINE_OP(VSPLTISB, vspltisb),
DEFINE_OP(VSPLTISH, vspltish),
DEFINE_OP(VSPLTISW, vspltisw)
#undef DEFINE_OP
{ PPC_I(VSLDOI),
(gen_handler_t)&powerpc_jit::gen_sse2_vsldoi, 0 }
};
if (cpuinfo_check_sse2()) {
@@ -163,72 +207,564 @@ powerpc_jit::powerpc_jit(dyngen_cpu_base cpu, int cache_size)
}
// Dispatch mid-level code generators
bool powerpc_jit::gen_vector_2(int mnemo, int vD, int vA, int vB, bool Rc)
bool powerpc_jit::gen_vector_1(int mnemo, int vD)
{
return (this->*((bool (powerpc_jit::*)(int, bool, int, int, int))jit_info[mnemo]->handler))(mnemo, Rc, vD, vA, vB);
return (this->*((bool (powerpc_jit::*)(int, int))jit_info[mnemo]->handler))(mnemo, vD);
}
bool powerpc_jit::gen_vector_3(int mnemo, int vD, int vA, int vB, int vC, bool Rc)
bool powerpc_jit::gen_vector_2(int mnemo, int vD, int vA, int vB)
{
return (this->*((bool (powerpc_jit::*)(int, bool, int, int, int, int))jit_info[mnemo]->handler))(mnemo, Rc, vD, vA, vB, vC);
return (this->*((bool (powerpc_jit::*)(int, int, int, int))jit_info[mnemo]->handler))(mnemo, vD, vA, vB);
}
bool powerpc_jit::gen_vector_3(int mnemo, int vD, int vA, int vB, int vC)
{
return (this->*((bool (powerpc_jit::*)(int, int, int, int, int))jit_info[mnemo]->handler))(mnemo, vD, vA, vB, vC);
}
bool powerpc_jit::gen_vector_compare(int mnemo, int vD, int vA, int vB, bool Rc)
{
return (this->*((bool (powerpc_jit::*)(int, int, int, int, bool))jit_info[mnemo]->handler))(mnemo, vD, vA, vB, Rc);
}
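// Illustrative sketch (not part of this commit): the dispatchers above store
// a single generic member-function pointer per mnemonic and cast it back to
// the arity it was registered with before the ->* call. Calling through the
// re-cast pointer is formally undefined behavior, but it is exactly the
// contract the jit_info[] table relies on. A minimal standalone model:
struct demo_jit {
	typedef bool (demo_jit::*gen_handler_t)(int);
	gen_handler_t handler;
	bool gen_demo_2(int mnemo, int a, int b) { return a == b; }
	bool dispatch_2(int mnemo, int a, int b) {
		// cast back to the real signature, then invoke
		return (this->*((bool (demo_jit::*)(int, int, int))handler))(mnemo, a, b);
	}
};
// usage: j.handler = (demo_jit::gen_handler_t)&demo_jit::gen_demo_2;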
bool powerpc_jit::gen_not_available(int mnemo, bool Rc)
bool powerpc_jit::gen_not_available(int mnemo)
{
return false;
}
bool powerpc_jit::gen_vector_generic_2(int mnemo, bool Rc, int vD, int vA, int vB)
bool powerpc_jit::gen_vector_generic_1(int mnemo, int vD)
{
gen_load_ad_VD_VR(vD);
(this->*(jit_info[mnemo]->o.dyngen_handler))();
return true;
}
bool powerpc_jit::gen_vector_generic_2(int mnemo, int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
jit_info[mnemo]->dyngen_handler(this);
if (Rc)
gen_record_cr6_VD();
(this->*(jit_info[mnemo]->o.dyngen_handler))();
return true;
}
bool powerpc_jit::gen_vector_generic_3(int mnemo, bool Rc, int vD, int vA, int vB, int vC)
bool powerpc_jit::gen_vector_generic_3(int mnemo, int vD, int vA, int vB, int vC)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_load_ad_V2_VR(vC);
jit_info[mnemo]->dyngen_handler(this);
(this->*(jit_info[mnemo]->o.dyngen_handler))();
return true;
}
bool powerpc_jit::gen_vector_generic_c(int mnemo, int vD, int vA, int vB, bool Rc)
{
gen_vector_generic_2(mnemo, vD, vA, vB);
if (Rc)
gen_record_cr6_VD();
return true;
}
bool powerpc_jit::gen_vector_mmx_2(int mnemo, bool Rc, int vD, int vA, int vB)
bool powerpc_jit::gen_vector_generic_load(int mnemo, int vD, int rA, int rB)
{
// NOTE: T0/VD are clobbered in the following instructions!
gen_load_T0_GPR(rB);
if (rA != 0) {
gen_load_T1_GPR(rA);
gen_add_32_T0_T1();
}
gen_load_vect_VD_T0(vD);
return true;
}
bool powerpc_jit::gen_vector_generic_store(int mnemo, int vS, int rA, int rB)
{
// NOTE: T0/VS are clobbered in the following instructions!
gen_load_T0_GPR(rB);
if (rA != 0) {
gen_load_T1_GPR(rA);
gen_add_32_T0_T1();
}
gen_store_vect_VS_T0(vS);
return true;
}
bool powerpc_jit::gen_vector_generic_load_word(int mnemo, int vD, int rA, int rB)
{
// NOTE: T0/VD are clobbered in the following instructions!
gen_load_T0_GPR(rB);
if (rA != 0) {
gen_load_T1_GPR(rA);
gen_add_32_T0_T1();
}
gen_load_word_VD_T0(vD);
return true;
}
bool powerpc_jit::gen_vector_generic_store_word(int mnemo, int vS, int rA, int rB)
{
// NOTE: T0/VS are clobbered in the following instructions!
gen_load_T0_GPR(rB);
if (rA != 0) {
gen_load_T1_GPR(rA);
gen_add_32_T0_T1();
}
gen_store_word_VS_T0(vS);
return true;
}
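// Illustrative sketch (not part of this commit; uint32_t stands in for the
// emulator's uint32): the effective address all four generic helpers above
// compute before handing control to the dyngen op. rA == 0 selects a literal
// zero, not GPR0, per the PowerPC ISA.
#include <stdint.h>
static inline uint32_t vector_ea(const uint32_t *gpr, int rA, int rB)
{
	uint32_t ea = gpr[rB];
	if (rA != 0)
		ea += gpr[rA];
	return ea;	// lvx/stvx additionally mask to 16-byte alignment
}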
#define xPPC_FIELD(M) ((uintptr)&((powerpc_cpu *)0)->M)
#define xPPC_GPR(N) xPPC_FIELD(gpr(N))
#define xPPC_VR(N) xPPC_FIELD(vr(N))
#define xPPC_CR xPPC_FIELD(cr())
#define xPPC_VSCR xPPC_FIELD(vscr())
#if defined(__i386__) || defined(__x86_64__)
/*
* X86 optimizations
*/
// mtvscr
bool powerpc_jit::gen_x86_mtvscr(int mnemo, int vD)
{
gen_mov_32(x86_memory_operand(xPPC_VR(vD) + 3*4, REG_CPU_ID), REG_T0_ID);
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_VSCR, REG_CPU_ID));
return true;
}
// mfvscr
bool powerpc_jit::gen_x86_mfvscr(int mnemo, int vB)
{
gen_xor_32(REG_T0_ID, REG_T0_ID);
gen_mov_32(x86_memory_operand(xPPC_VSCR, REG_CPU_ID), REG_T1_ID);
#if SIZEOF_VOID_P == 8
gen_mov_64(REG_T0_ID, x86_memory_operand(xPPC_VR(vB) + 0*4, REG_CPU_ID));
#else
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_VR(vB) + 0*4, REG_CPU_ID));
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_VR(vB) + 1*4, REG_CPU_ID));
#endif
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_VR(vB) + 2*4, REG_CPU_ID));
gen_mov_32(REG_T1_ID, x86_memory_operand(xPPC_VR(vB) + 3*4, REG_CPU_ID));
return true;
}
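// Illustrative sketch (not part of this commit): memory-level model of the
// mfvscr sequence above. Words 0..2 of the destination vector are zeroed and
// VSCR lands in word 3, the least significant word in AltiVec element order.
#include <stdint.h>
static inline void model_mfvscr(uint32_t vd[4], uint32_t vscr)
{
	vd[0] = vd[1] = vd[2] = 0;
	vd[3] = vscr;
}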
// lvx, lvxl
bool powerpc_jit::gen_x86_lvx(int mnemo, int vD, int rA, int rB)
{
gen_mov_32(x86_memory_operand(xPPC_GPR(rB), REG_CPU_ID), REG_T0_ID);
if (rA != 0)
gen_add_32(x86_memory_operand(xPPC_GPR(rA), REG_CPU_ID), REG_T0_ID);
gen_and_32(x86_immediate_operand(-16), REG_T0_ID);
#if SIZEOF_VOID_P == 8
gen_mov_64(x86_memory_operand(0, REG_T0_ID), REG_T1_ID);
gen_mov_64(x86_memory_operand(8, REG_T0_ID), REG_T2_ID);
gen_bswap_64(REG_T1_ID);
gen_bswap_64(REG_T2_ID);
gen_rol_64(x86_immediate_operand(32), REG_T1_ID);
gen_rol_64(x86_immediate_operand(32), REG_T2_ID);
gen_mov_64(REG_T1_ID, x86_memory_operand(xPPC_VR(vD) + 0, REG_CPU_ID));
gen_mov_64(REG_T2_ID, x86_memory_operand(xPPC_VR(vD) + 8, REG_CPU_ID));
#else
gen_mov_32(x86_memory_operand(0*4, REG_T0_ID), REG_T1_ID);
gen_mov_32(x86_memory_operand(1*4, REG_T0_ID), REG_T2_ID);
gen_bswap_32(REG_T1_ID);
gen_bswap_32(REG_T2_ID);
gen_mov_32(REG_T1_ID, x86_memory_operand(xPPC_VR(vD) + 0*4, REG_CPU_ID));
gen_mov_32(REG_T2_ID, x86_memory_operand(xPPC_VR(vD) + 1*4, REG_CPU_ID));
gen_mov_32(x86_memory_operand(2*4, REG_T0_ID), REG_T1_ID);
gen_mov_32(x86_memory_operand(3*4, REG_T0_ID), REG_T2_ID);
gen_bswap_32(REG_T1_ID);
gen_bswap_32(REG_T2_ID);
gen_mov_32(REG_T1_ID, x86_memory_operand(xPPC_VR(vD) + 2*4, REG_CPU_ID));
gen_mov_32(REG_T2_ID, x86_memory_operand(xPPC_VR(vD) + 3*4, REG_CPU_ID));
#endif
return true;
}
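// Illustrative sketch (not part of this commit, assuming GCC's
// __builtin_bswap64): why the 64-bit path above pairs bswap with a 32-bit
// rotate. bswap reverses all eight bytes, which byte-swaps each 32-bit lane
// but also exchanges the two lanes; the rotate puts the lanes back in place.
#include <stdint.h>
static inline uint64_t bswap_two_words(uint64_t x)
{
	x = __builtin_bswap64(x);	// reverse bytes; lanes now swapped
	return (x << 32) | (x >> 32);	// rol $32: restore lane order
}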
// stvx, stvxl
bool powerpc_jit::gen_x86_stvx(int mnemo, int vS, int rA, int rB)
{
// NOTE: primitive scheduling
gen_mov_32(x86_memory_operand(xPPC_GPR(rB), REG_CPU_ID), REG_T0_ID);
#if SIZEOF_VOID_P == 8
gen_mov_64(x86_memory_operand(xPPC_VR(vS) + 0, REG_CPU_ID), REG_T1_ID);
gen_mov_64(x86_memory_operand(xPPC_VR(vS) + 8, REG_CPU_ID), REG_T2_ID);
if (rA != 0)
gen_add_32(x86_memory_operand(xPPC_GPR(rA), REG_CPU_ID), REG_T0_ID);
gen_bswap_64(REG_T1_ID);
gen_bswap_64(REG_T2_ID);
gen_and_32(x86_immediate_operand(-16), REG_T0_ID);
gen_rol_64(x86_immediate_operand(32), REG_T1_ID);
gen_rol_64(x86_immediate_operand(32), REG_T2_ID);
gen_mov_64(REG_T1_ID, x86_memory_operand(0, REG_T0_ID));
gen_mov_64(REG_T2_ID, x86_memory_operand(8, REG_T0_ID));
#else
gen_mov_32(x86_memory_operand(xPPC_VR(vS) + 0*4, REG_CPU_ID), REG_T1_ID);
gen_mov_32(x86_memory_operand(xPPC_VR(vS) + 1*4, REG_CPU_ID), REG_T2_ID);
if (rA != 0)
gen_add_32(x86_memory_operand(xPPC_GPR(rA), REG_CPU_ID), REG_T0_ID);
gen_bswap_32(REG_T1_ID);
gen_bswap_32(REG_T2_ID);
gen_and_32(x86_immediate_operand(-16), REG_T0_ID);
gen_mov_32(REG_T1_ID, x86_memory_operand(0*4, REG_T0_ID));
gen_mov_32(REG_T2_ID, x86_memory_operand(1*4, REG_T0_ID));
gen_mov_32(x86_memory_operand(xPPC_VR(vS) + 2*4, REG_CPU_ID), REG_T1_ID);
gen_mov_32(x86_memory_operand(xPPC_VR(vS) + 3*4, REG_CPU_ID), REG_T2_ID);
gen_bswap_32(REG_T1_ID);
gen_bswap_32(REG_T2_ID);
gen_mov_32(REG_T1_ID, x86_memory_operand(2*4, REG_T0_ID));
gen_mov_32(REG_T2_ID, x86_memory_operand(3*4, REG_T0_ID));
#endif
return true;
}
/*
* MMX optimizations
*/
// Generic MMX arith
bool powerpc_jit::gen_mmx_arith_2(int mnemo, int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
jit_info[mnemo]->dyngen_handler(this);
if (Rc)
gen_record_cr6_VD();
(this->*(jit_info[mnemo]->o.dyngen_handler))();
gen_op_emms();
return true;
#endif
return false;
}
bool powerpc_jit::gen_sse2_vsldoi(int mnemo, bool Rc, int vD, int vA, int vB, int SH)
// MMX comparison
bool powerpc_jit::gen_mmx_arith_c(int mnemo, int vD, int vA, int vB, bool Rc)
{
#if defined(__i386__) || defined(__x86_64__)
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
if (SH == 0)
gen_op_sse_mov_VD_V0();
else {
gen_load_ad_V1_VR(vB);
powerpc_dyngen::gen_sse2_vsldoi_VD_V0_V1(SH);
gen_mmx_arith_2(mnemo, vD, vA, vB);
if (Rc)
gen_record_cr6_VD();
return true;
}
/*
* SSE optimizations
*/
// Record CR6 (vD contains the result of the CMP instruction)
void powerpc_jit::gen_sse_record_cr6(int vD)
{
gen_xor_32(REG_T0_ID, REG_T0_ID); // xor %t0,%t0
gen_xor_32(REG_T1_ID, REG_T1_ID); // xor %t1,%t1
gen_insn(X86_INSN_SSE_PS, X86_SSE_MOVMSK, vD, REG_T2_ID); // movmskps %v0,%t2
gen_cmp_32(x86_immediate_operand(0), REG_T2_ID); // cmp $0,%t2
gen_setcc(X86_CC_Z, REG_T0_ID); // sete %t0
gen_cmp_32(x86_immediate_operand(0xf), REG_T2_ID); // cmp $0xf,%t2
gen_setcc(X86_CC_E, REG_T1_ID); // sete %t1
gen_lea_32(x86_memory_operand(0, REG_T0_ID, REG_T1_ID, 4), REG_T2_ID); // %t2 = %t0 + %t1*4
gen_mov_32(x86_memory_operand(xPPC_CR, REG_CPU_ID), REG_T0_ID); // mov xPPC_CR(%cpu),%t0
gen_shl_32(x86_immediate_operand(5), REG_T2_ID); // %t2 holds new cr6
gen_and_32(x86_immediate_operand(0xffffff0f), REG_T0_ID); // and $0xffffff0f,%t0
gen_or_32(REG_T2_ID, REG_T0_ID); // or %t2,%t0
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_CR, REG_CPU_ID)); // mov %t0,xPPC_CR(%cpu)
}
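// Illustrative sketch (not part of this commit): host-side model of the CR6
// update emitted above. movmskps gathers one sign bit per float lane; CR6
// occupies bits 7..4 of the CR image, with "all elements true" in bit 7 and
// "all elements false" in bit 5, matching the lea/shl arithmetic.
#include <stdint.h>
static inline uint32_t model_record_cr6(uint32_t cr, uint32_t mask4)
{
	const uint32_t all_false = (mask4 == 0x0);	// sete after cmp $0
	const uint32_t all_true  = (mask4 == 0xf);	// sete after cmp $0xf
	const uint32_t cr6 = (all_false + all_true * 4) << 5;	// 0x20, 0x80 or 0
	return (cr & 0xffffff0f) | cr6;
}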
// Generic SSE arith
bool powerpc_jit::gen_sse_arith_2(int mnemo, int vD, int vA, int vB)
{
gen_movaps(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
const uint16 insn = jit_info[mnemo]->o.value;
gen_insn(insn >> 8, insn & 0xff, x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID);
gen_movaps(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
return true;
}
// Generic SSE arith with swapped operands (ANDPS)
bool powerpc_jit::gen_sse_arith_s(int mnemo, int vD, int vA, int vB)
{
return gen_sse_arith_2(mnemo, vD, vB, vA);
}
// SSE comparison (CMPPS)
bool powerpc_jit::gen_sse_arith_c(int mnemo, int vD, int vA, int vB, bool Rc)
{
// NOTE: this uses swapped operands for GT, GE (no change for EQ)
gen_movaps(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID);
gen_cmpps(jit_info[mnemo]->o.value, x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
gen_movaps(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
if (Rc)
gen_sse_record_cr6(REG_V0_ID);
return true;
}
// vmaddfp
bool powerpc_jit::gen_sse_vmaddfp(int mnemo, int vD, int vA, int vB, int vC)
{
gen_movaps(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
gen_mulps(x86_memory_operand(xPPC_VR(vC), REG_CPU_ID), REG_V0_ID);
gen_addps(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID);
gen_movaps(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
return true;
}
// vnmsubfp
bool powerpc_jit::gen_sse_vnmsubfp(int mnemo, int vD, int vA, int vB, int vC)
{
gen_movaps(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
gen_xorps(REG_V1_ID, REG_V1_ID);
gen_mulps(x86_memory_operand(xPPC_VR(vC), REG_CPU_ID), REG_V0_ID);
gen_subps(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID);
gen_subps(REG_V0_ID, REG_V1_ID);
gen_movaps(REG_V1_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
return true;
}
/*
* SSE2 optimizations
*/
// Record CR6 (vD contains the result of the CMP instruction)
void powerpc_jit::gen_sse2_record_cr6(int vD)
{
gen_xor_32(REG_T0_ID, REG_T0_ID); // xor %t0,%t0
gen_xor_32(REG_T1_ID, REG_T1_ID); // xor %t1,%t1
gen_pmovmskb(vD, REG_T2_ID); // pmovmskb %v0,%t2
gen_cmp_32(x86_immediate_operand(0), REG_T2_ID); // cmp $0,%t2
gen_setcc(X86_CC_Z, REG_T0_ID); // sete %t0
gen_cmp_32(x86_immediate_operand(0xffff), REG_T2_ID); // cmp $0xffff,%t2
gen_setcc(X86_CC_E, REG_T1_ID); // sete %t1
gen_lea_32(x86_memory_operand(0, REG_T0_ID, REG_T1_ID, 4), REG_T2_ID); // %t2 = %t0 + %t1*4
gen_mov_32(x86_memory_operand(xPPC_CR, REG_CPU_ID), REG_T0_ID); // mov xPPC_CR(%cpu),%t0
gen_shl_32(x86_immediate_operand(5), REG_T2_ID); // %t2 holds new cr6
gen_and_32(x86_immediate_operand(0xffffff0f), REG_T0_ID); // and $0xffffff0f,%t0
gen_or_32(REG_T2_ID, REG_T0_ID); // or %t2,%t0
gen_mov_32(REG_T0_ID, x86_memory_operand(xPPC_CR, REG_CPU_ID)); // mov %t0,xPPC_CR(%cpu)
}
// Generic SSE2 arith
bool powerpc_jit::gen_sse2_arith_2(int mnemo, int vD, int vA, int vB)
{
gen_movdqa(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
const uint16 insn = jit_info[mnemo]->o.value;
gen_insn(insn >> 8, insn & 0xff, x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V0_ID);
gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
return true;
}
// Generic SSE2 arith with swapped operands (PANDN)
bool powerpc_jit::gen_sse2_arith_s(int mnemo, int vD, int vA, int vB)
{
return gen_sse2_arith_2(mnemo, vD, vB, vA);
}
// SSE2 comparison (PCMPEQ, PCMPGT)
bool powerpc_jit::gen_sse2_arith_c(int mnemo, int vD, int vA, int vB, bool Rc)
{
gen_sse2_arith_2(mnemo, vD, vA, vB);
if (Rc)
gen_sse2_record_cr6(REG_V0_ID);
return true;
}
// vsldoi
bool powerpc_jit::gen_sse2_vsldoi(int mnemo, int vD, int vA, int vB, int SH)
{
// Optimize out vsldoi vX,vX,vB,0
if (SH == 0 && vA == vD)
return true;
gen_movdqa(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
if (SH) {
gen_movdqa(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V1_ID);
gen_pshufd(x86_immediate_operand(0x1b), REG_V0_ID, REG_V0_ID);
gen_pshufd(x86_immediate_operand(0x1b), REG_V1_ID, REG_V1_ID);
gen_pslldq(x86_immediate_operand(SH), REG_V0_ID);
gen_psrldq(x86_immediate_operand(16 - SH), REG_V1_ID);
gen_por(REG_V1_ID, REG_V0_ID);
gen_pshufd(x86_immediate_operand(0x1b), REG_V0_ID, REG_V0_ID);
}
gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
return true;
}
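// Illustrative sketch (not part of this commit): the emitted vsldoi sequence
// for a fixed SH of 4, as SSE2 intrinsics. The dword reversal (pshufd $0x1b)
// maps the big-endian AltiVec byte order onto x86's little-endian register
// layout so the byte shifts move the right way, then reverses back.
#include <emmintrin.h>
static __m128i vsldoi_sh4(__m128i va, __m128i vb)
{
	va = _mm_shuffle_epi32(va, 0x1b);	// reverse dwords
	vb = _mm_shuffle_epi32(vb, 0x1b);
	va = _mm_slli_si128(va, 4);		// pslldq $SH
	vb = _mm_srli_si128(vb, 12);		// psrldq $(16 - SH)
	va = _mm_or_si128(va, vb);		// splice the two halves
	return _mm_shuffle_epi32(va, 0x1b);	// back to host order
}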
/*
* Vector splat instructions
*
* Reference: "Optimizing subroutines in assembly language", Agner Fog, table 13.6
*/
void powerpc_jit::gen_sse2_vsplat(int vD, int rValue)
{
gen_movd_lx(rValue, REG_V0_ID);
gen_pshufd(x86_immediate_operand(0), REG_V0_ID, REG_V0_ID);
gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
}
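// Illustrative sketch (not part of this commit): the splat primitive above as
// intrinsics. movd places the GPR value in lane 0; pshufd $0 broadcasts it.
#include <emmintrin.h>
#include <stdint.h>
static __m128i splat32(uint32_t x)
{
	__m128i v = _mm_cvtsi32_si128((int)x);	// movd
	return _mm_shuffle_epi32(v, 0);		// all four lanes = lane 0
}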
// vspltisb
bool powerpc_jit::gen_sse2_vspltisb(int mnemo, int vD, int SIMM)
{
switch (SIMM) {
case 0:
gen_pxor(REG_V0_ID, REG_V0_ID);
goto commit;
case 1:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(15), REG_V0_ID);
gen_packuswb(REG_V0_ID, REG_V0_ID);
goto commit;
case 2:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(15), REG_V0_ID);
gen_psllw(x86_immediate_operand(1), REG_V0_ID);
gen_packuswb(REG_V0_ID, REG_V0_ID);
goto commit;
case 3:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(14), REG_V0_ID);
gen_packuswb(REG_V0_ID, REG_V0_ID);
goto commit;
case 4:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(15), REG_V0_ID);
gen_psllw(x86_immediate_operand(2), REG_V0_ID);
gen_packuswb(REG_V0_ID, REG_V0_ID);
goto commit;
case -1:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
goto commit;
case -2:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psllw(x86_immediate_operand(1), REG_V0_ID);
gen_packsswb(REG_V0_ID, REG_V0_ID);
goto commit;
{
commit:
gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
break;
}
default:
const uint32 value = ((uint8)SIMM) * 0x01010101;
gen_mov_32(x86_immediate_operand(value), REG_T0_ID);
gen_sse2_vsplat(vD, REG_T0_ID);
break;
}
return true;
#endif
return false;
}
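// Illustrative sketch (not part of this commit): the register-only constant
// generation the cases above encode, per Agner Fog's table. pcmpeq of a
// register with itself yields all ones; shifts and a saturating pack then
// carve out the wanted constant without any memory load.
#include <emmintrin.h>
static __m128i splat8_const_1(void)		// vspltisb v,1
{
	__m128i v = _mm_setzero_si128();
	v = _mm_cmpeq_epi16(v, v);		// all bits set
	v = _mm_srli_epi16(v, 15);		// halfwords = 0x0001
	return _mm_packus_epi16(v, v);		// bytes = 0x01
}
static __m128i splat16_const_m2(void)		// vspltish v,-2
{
	__m128i v = _mm_setzero_si128();
	v = _mm_cmpeq_epi16(v, v);		// halfwords = 0xFFFF
	return _mm_slli_epi16(v, 1);		// halfwords = 0xFFFE == -2
}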
// vspltish
bool powerpc_jit::gen_sse2_vspltish(int mnemo, int vD, int SIMM)
{
switch (SIMM) {
case 0:
gen_pxor(REG_V0_ID, REG_V0_ID);
goto commit;
case 1:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(15), REG_V0_ID);
goto commit;
case 2:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(15), REG_V0_ID);
gen_psllw(x86_immediate_operand(1), REG_V0_ID);
goto commit;
case 3:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(14), REG_V0_ID);
goto commit;
case 4:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psrlw(x86_immediate_operand(15), REG_V0_ID);
gen_psllw(x86_immediate_operand(2), REG_V0_ID);
goto commit;
case -1:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
goto commit;
case -2:
gen_pcmpeqw(REG_V0_ID, REG_V0_ID);
gen_psllw(x86_immediate_operand(1), REG_V0_ID);
goto commit;
{
commit:
gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
break;
}
default:
const uint32 value = ((uint16)SIMM) * 0x10001;
gen_mov_32(x86_immediate_operand(value), REG_T0_ID);
gen_sse2_vsplat(vD, REG_T0_ID);
break;
}
return true;
}
// vspltisw
bool powerpc_jit::gen_sse2_vspltisw(int mnemo, int vD, int SIMM)
{
switch (SIMM) {
case 0:
gen_pxor(REG_V0_ID, REG_V0_ID);
goto commit;
case 1:
gen_pcmpeqd(REG_V0_ID, REG_V0_ID);
gen_psrld(x86_immediate_operand(31), REG_V0_ID);
goto commit;
case 2:
gen_pcmpeqd(REG_V0_ID, REG_V0_ID);
gen_psrld(x86_immediate_operand(31), REG_V0_ID);
gen_pslld(x86_immediate_operand(1), REG_V0_ID);
goto commit;
case 3:
gen_pcmpeqd(REG_V0_ID, REG_V0_ID);
gen_psrld(x86_immediate_operand(30), REG_V0_ID);
goto commit;
case 4:
gen_pcmpeqd(REG_V0_ID, REG_V0_ID);
gen_psrld(x86_immediate_operand(31), REG_V0_ID);
gen_pslld(x86_immediate_operand(2), REG_V0_ID);
goto commit;
case -1:
gen_pcmpeqd(REG_V0_ID, REG_V0_ID);
goto commit;
case -2:
gen_pcmpeqd(REG_V0_ID, REG_V0_ID);
gen_pslld(x86_immediate_operand(1), REG_V0_ID);
goto commit;
{
commit:
gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
break;
}
default:
const uint32 value = SIMM;
gen_mov_32(x86_immediate_operand(value), REG_T0_ID);
gen_sse2_vsplat(vD, REG_T0_ID);
}
return true;
}
// vspltb
bool powerpc_jit::gen_sse2_vspltb(int mnemo, int vD, int UIMM, int vB)
{
const int N = ev_mixed::byte_element(UIMM & 15);
gen_mov_zx_8_32(x86_memory_operand(xPPC_VR(vB) + N * 1, REG_CPU_ID), REG_T0_ID);
gen_imul_32(x86_immediate_operand(0x01010101), REG_T0_ID, REG_T0_ID);
gen_sse2_vsplat(vD, REG_T0_ID);
return true;
}
// vsplth
bool powerpc_jit::gen_sse2_vsplth(int mnemo, int vD, int UIMM, int vB)
{
const int N = ev_mixed::half_element(UIMM & 7);
gen_mov_zx_16_32(x86_memory_operand(xPPC_VR(vB) + N * 2, REG_CPU_ID), REG_T0_ID);
gen_imul_32(x86_immediate_operand(0x10001), REG_T0_ID, REG_T0_ID);
gen_sse2_vsplat(vD, REG_T0_ID);
return true;
}
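// Illustrative sketch (not part of this commit): vspltb/vsplth above widen
// the selected element to 32 bits by multiplication before the four-lane
// splat; vspltw below can feed its word through unchanged.
#include <stdint.h>
static inline uint32_t replicate_byte(uint32_t b) { return (b & 0xff) * 0x01010101u; }
static inline uint32_t replicate_half(uint32_t h) { return (h & 0xffff) * 0x00010001u; }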
// vspltw
bool powerpc_jit::gen_sse2_vspltw(int mnemo, int vD, int UIMM, int vB)
{
const int N = UIMM & 3;
gen_mov_32(x86_memory_operand(xPPC_VR(vB) + N * 4, REG_CPU_ID), REG_T0_ID);
gen_sse2_vsplat(vD, REG_T0_ID);
return true;
}
#endif


@@ -30,25 +30,68 @@ struct powerpc_jit
// Default constructor
powerpc_jit(dyngen_cpu_base cpu, int cache_size = -1);
bool gen_vector_2(int mnemo, int vD, int vA, int vB, bool Rc = false);
bool gen_vector_3(int mnemo, int vD, int vA, int vB, int vC, bool Rc = false);
bool gen_vector_1(int mnemo, int vD);
bool gen_vector_2(int mnemo, int vD, int vA, int vB);
bool gen_vector_3(int mnemo, int vD, int vA, int vB, int vC);
bool gen_vector_compare(int mnemo, int vD, int vA, int vB, bool Rc);
private:
// Mid-level code generator info
typedef bool (powerpc_jit::*gen_handler_t)(int, bool);
typedef void (powerpc_dyngen::*dyngen_handler_t)(void);
union jit_option_t {
jit_option_t() { }
uintptr value;
jit_option_t(uintptr v) : value(v) { }
dyngen_handler_t dyngen_handler;
jit_option_t(dyngen_handler_t const & h) : dyngen_handler(h) { }
};
struct jit_info_t {
int mnemo;
gen_handler_t handler;
powerpc_dyngen::gen_handler_t dyngen_handler;
jit_option_t o;
};
static const jit_info_t *jit_info[];
private:
bool gen_not_available(int mnemo, bool Rc);
bool gen_vector_generic_2(int mnemo, bool Rc, int vD, int vA, int vB);
bool gen_vector_generic_3(int mnemo, bool Rc, int vD, int vA, int vB, int vC);
bool gen_vector_mmx_2(int mnemo, bool Rc, int vD, int vA, int vB);
bool gen_sse2_vsldoi(int mnemo, bool Rc, int vD, int vA, int vB, int SH);
bool gen_not_available(int mnemo);
bool gen_vector_generic_1(int mnemo, int vD);
bool gen_vector_generic_2(int mnemo, int vD, int vA, int vB);
bool gen_vector_generic_3(int mnemo, int vD, int vA, int vB, int vC);
bool gen_vector_generic_c(int mnemo, int vD, int vA, int vB, bool Rc);
bool gen_vector_generic_load(int mnemo, int vD, int rA, int rB);
bool gen_vector_generic_store(int mnemo, int vS, int rA, int rB);
bool gen_vector_generic_load_word(int mnemo, int vD, int rA, int rB);
bool gen_vector_generic_store_word(int mnemo, int vS, int rA, int rB);
#if defined(__i386__) || defined(__x86_64__)
bool gen_x86_lvx(int mnemo, int vD, int rA, int rB);
bool gen_x86_lvewx(int mnemo, int vD, int rA, int rB);
bool gen_x86_stvx(int mnemo, int vS, int rA, int rB);
bool gen_x86_stvewx(int mnemo, int vS, int rA, int rB);
bool gen_x86_mtvscr(int mnemo, int vD);
bool gen_x86_mfvscr(int mnemo, int vB);
bool gen_mmx_arith_2(int mnemo, int vD, int vA, int vB);
bool gen_mmx_arith_c(int mnemo, int vD, int vA, int vB, bool Rc);
void gen_sse_record_cr6(int vD);
bool gen_sse_arith_2(int mnemo, int vD, int vA, int vB);
bool gen_sse_arith_s(int mnemo, int vD, int vA, int vB);
bool gen_sse_arith_c(int mnemo, int vD, int vA, int vB, bool Rc);
bool gen_sse_vmaddfp(int mnemo, int vD, int vA, int vB, int vC);
bool gen_sse_vnmsubfp(int mnemo, int vD, int vA, int vB, int vC);
void gen_sse2_record_cr6(int vD);
bool gen_sse2_arith_2(int mnemo, int vD, int vA, int vB);
bool gen_sse2_arith_s(int mnemo, int vD, int vA, int vB);
bool gen_sse2_arith_c(int mnemo, int vD, int vA, int vB, bool Rc);
bool gen_sse2_vsldoi(int mnemo, int vD, int vA, int vB, int SH);
void gen_sse2_vsplat(int vD, int rValue);
bool gen_sse2_vspltisb(int mnemo, int vD, int SIMM);
bool gen_sse2_vspltish(int mnemo, int vD, int SIMM);
bool gen_sse2_vspltisw(int mnemo, int vD, int SIMM);
bool gen_sse2_vspltb(int mnemo, int vD, int UIMM, int vB);
bool gen_sse2_vsplth(int mnemo, int vD, int UIMM, int vB);
bool gen_sse2_vspltw(int mnemo, int vD, int UIMM, int vB);
#endif
};
#endif /* PPC_JIT_H */


@@ -1346,67 +1346,16 @@ powerpc_cpu::compile_block(uint32 entry_point)
break;
}
#endif
// NOTE: A0/VD are clobbered in the following instructions!
case PPC_I(LVEWX):
case PPC_I(LVX):
case PPC_I(LVXL):
{
const int rA = rA_field::extract(opcode);
const int rB = rB_field::extract(opcode);
const int vD = vD_field::extract(opcode);
dg.gen_load_T0_GPR(rB);
if (rA != 0) {
dg.gen_load_T1_GPR(rA);
dg.gen_add_32_T0_T1();
}
switch (ii->mnemo) {
case PPC_I(LVEWX): dg.gen_load_word_VD_T0(vD); break;
case PPC_I(LVX): dg.gen_load_vect_VD_T0(vD); break;
case PPC_I(LVXL): dg.gen_load_vect_VD_T0(vD); break;
}
break;
}
case PPC_I(STVEWX):
case PPC_I(STVX):
case PPC_I(STVXL):
{
const int rA = rA_field::extract(opcode);
const int rB = rB_field::extract(opcode);
const int vS = vS_field::extract(opcode);
dg.gen_load_T0_GPR(rB);
if (rA != 0) {
dg.gen_load_T1_GPR(rA);
dg.gen_add_32_T0_T1();
}
switch (ii->mnemo) {
case PPC_I(STVEWX): dg.gen_store_word_VS_T0(vS); break;
case PPC_I(STVX): dg.gen_store_vect_VS_T0(vS); break;
case PPC_I(STVXL): dg.gen_store_vect_VS_T0(vS); break;
}
break;
}
case PPC_I(MFVSCR):
{
dg.gen_load_ad_VD_VR(vD_field::extract(opcode));
dg.gen_mfvscr_VD();
break;
}
case PPC_I(MTVSCR):
{
dg.gen_load_ad_V0_VR(vB_field::extract(opcode));
dg.gen_mtvscr_V0();
break;
}
case PPC_I(VSLDOI):
{
const int vD = vD_field::extract(opcode);
const int vA = vA_field::extract(opcode);
const int vB = vB_field::extract(opcode);
const int SH = vSH_field::extract(opcode);
if (!dg.gen_vector_3(ii->mnemo, vD, vA, vB, SH))
goto do_generic;
break;
}
assert(vD_field::mask() == vS_field::mask());
assert(vA_field::mask() == rA_field::mask());
assert(vB_field::mask() == rB_field::mask());
// fall-through
case PPC_I(VCMPEQFP):
case PPC_I(VCMPEQUB):
case PPC_I(VCMPEQUH):
@@ -1420,7 +1369,7 @@ powerpc_cpu::compile_block(uint32 entry_point)
const int vD = vD_field::extract(opcode);
const int vA = vA_field::extract(opcode);
const int vB = vB_field::extract(opcode);
if (!dg.gen_vector_2(ii->mnemo, vD, vA, vB, vRc_field::test(opcode)))
if (!dg.gen_vector_compare(ii->mnemo, vD, vA, vB, vRc_field::test(opcode)))
goto do_generic;
break;
}
@@ -1443,6 +1392,8 @@ powerpc_cpu::compile_block(uint32 entry_point)
case PPC_I(VSUBUHM):
case PPC_I(VSUBUWM):
case PPC_I(VXOR):
case PPC_I(VREFP):
case PPC_I(VRSQRTEFP):
{
const int vD = vD_field::extract(opcode);
const int vA = vA_field::extract(opcode);
@@ -1462,6 +1413,49 @@ powerpc_cpu::compile_block(uint32 entry_point)
goto do_generic;
break;
}
case PPC_I(VSLDOI):
{
const int vD = vD_field::extract(opcode);
const int vA = vA_field::extract(opcode);
const int vB = vB_field::extract(opcode);
const int SH = vSH_field::extract(opcode);
if (!dg.gen_vector_3(ii->mnemo, vD, vA, vB, SH))
goto do_generic;
break;
}
case PPC_I(MFVSCR):
{
if (!dg.gen_vector_1(ii->mnemo, vD_field::extract(opcode)))
goto do_generic;
break;
}
case PPC_I(MTVSCR):
{
if (!dg.gen_vector_1(ii->mnemo, vB_field::extract(opcode)))
goto do_generic;
break;
}
case PPC_I(VSPLTISB):
case PPC_I(VSPLTISH):
case PPC_I(VSPLTISW):
{
const int vD = vD_field::extract(opcode);
const int SIMM = op_sign_extend_5_32::apply(vUIMM_field::extract(opcode));
if (!dg.gen_vector_2(ii->mnemo, vD, SIMM, 0))
goto do_generic;
break;
}
case PPC_I(VSPLTB):
case PPC_I(VSPLTH):
case PPC_I(VSPLTW):
{
const int vD = vD_field::extract(opcode);
const int UIMM = vUIMM_field::extract(opcode);
const int vB = vB_field::extract(opcode);
if (!dg.gen_vector_2(ii->mnemo, vD, UIMM, vB))
goto do_generic;
break;
}
default: // Direct call to instruction handler
{
typedef void (*func_t)(dyngen_cpu_base, uint32);