diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp
index 8e81c575..4d425071 100644
--- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp
+++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen-ops.cpp
@@ -39,28 +39,33 @@ register struct powerpc_cpu *CPU asm(REG_CPU);
 #define REG32(X) X
 #endif
 #define FPREG(X) ((powerpc_fpr *)(X))
-#define VREG(X) ((powerpc_vr *)(X))[0]
 #define A0 REG32(reg_A0)
-#define VD VREG(reg_A0)
 register uintptr reg_A0 asm(REG_A0);
 #define T0 REG32(reg_T0)
 #define F0 FPREG(reg_T0)->d
 #define F0_dw FPREG(reg_T0)->j
-#define V0 VREG(reg_T0)
 register uintptr reg_T0 asm(REG_T0);
 #define T1 REG32(reg_T1)
 #define F1 FPREG(reg_T1)->d
 #define F1_dw FPREG(reg_T1)->j
-#define V1 VREG(reg_T1)
 register uintptr reg_T1 asm(REG_T1);
 #define T2 REG32(reg_T2)
 #define F2 FPREG(reg_T2)->d
 #define F2_dw FPREG(reg_T2)->j
-#define V2 VREG(reg_T2)
 register uintptr reg_T2 asm(REG_T2);
 #define FD powerpc_dyngen_helper::fp_result()
 #define FD_dw powerpc_dyngen_helper::fp_result_dw()
+// Vector registers
+#define VREG(X) ((powerpc_vr *)(X))[0]
+#define VD VREG(reg_VD)
+#define reg_VD reg_A0
+#define V0 VREG(reg_V0)
+#define reg_V0 reg_T0
+#define V1 VREG(reg_V1)
+#define reg_V1 reg_T1
+#define V2 VREG(reg_V2)
+#define reg_V2 reg_T2
 
 /**
  *	Helper class to access protected CPU context
@@ -1416,9 +1421,21 @@ void op_vmaddfp_VD_V0_V1_V2(void)
 	vector_execute<op_vmaddfp, V4SF, V4SF, V4SF, V4SF>::apply();
 }
 
+#if defined(__i386__) && defined(__SSE__)
+// Workaround gcc 3.2.2 miscompilation that inserts SSE instructions
+struct op_do_vnmsubfp {
+	static inline float apply(float x, float y, float z) {
+//		return 0. - ((x * z) - y);
+		return y - (x * z);
+	}
+};
+#else
+typedef op_vnmsubfp op_do_vnmsubfp;
+#endif
+
 void op_vnmsubfp_VD_V0_V1_V2(void)
 {
-	vector_execute<op_vnmsubfp, V4SF, V4SF, V4SF, V4SF>::apply();
+	vector_execute<op_do_vnmsubfp, V4SF, V4SF, V4SF, V4SF>::apply();
 }
 
 void op_vmaxfp_VD_V0_V1(void)
@@ -1456,14 +1473,248 @@ void op_vxor_VD_V0_V1(void)
 	vector_execute<op_vxor, V2DI, V2DI, V2DI>::apply();
 }
 
-#ifdef LONG_OPERATIONS
-void op_vcmpeqfp_VD_V0_V1(void)
+void op_record_cr6_VD(void)
 {
-	vector_execute<op_cmp_eq<float>, V4SF, V4SF, V4SF>::apply();
+	if (VD.j[0] == UVAL64(0xffffffffffffffff) &&
+		VD.j[1] == UVAL64(0xffffffffffffffff))
+		powerpc_dyngen_helper::cr().set(6, 8);
+	else if (VD.j[0] == UVAL64(0) && VD.j[1] == UVAL64(0))
+		powerpc_dyngen_helper::cr().set(6, 2);
+	else
+		powerpc_dyngen_helper::cr().set(6, 0);
+	dyngen_barrier();
 }
 
-void op_vaddubm_VD_V0_V1(void)
+/**
+ *	SSE optimizations
+ **/
+
+#if defined(__SSE__)
+#include <xmmintrin.h>
+#undef VD
+#define VD *((__m128 *)reg_VD)
+#undef V0
+#define V0 *((__m128 *)reg_V0)
+#undef V1
+#define V1 *((__m128 *)reg_V1)
+#undef V2
+#define V2 *((__m128 *)reg_V2)
+
+void op_sse_nop(void)
 {
-	vector_execute<op_add<uint8>, V16QI, V16QI, V16QI>::apply();
+	asm volatile ("nop");
+}
+
+void op_sse_vcmpeqfp(void)
+{
+	VD = _mm_cmpeq_ps(V0, V1);
+}
+
+void op_sse_vcmpgefp(void)
+{
+	VD = _mm_cmpge_ps(V0, V1);
+}
+
+void op_sse_vcmpgtfp(void)
+{
+	VD = _mm_cmpgt_ps(V0, V1);
+}
+
+void op_sse_vaddfp(void)
+{
+	VD = _mm_add_ps(V0, V1);
+}
+
+void op_sse_vsubfp(void)
+{
+	VD = _mm_sub_ps(V0, V1);
+}
+
+void op_sse_vmaddfp(void)
+{
+	VD = _mm_add_ps(_mm_mul_ps(V0, V2), V1);
+}
+
+void op_sse_vnmsubfp(void)
+{
+	VD = _mm_sub_ps(_mm_setzero_ps(), _mm_sub_ps(_mm_mul_ps(V0, V2), V1));
+}
+
+void op_sse_vmaxfp(void)
+{
+	VD = _mm_max_ps(V0, V1);
+}
+
+void op_sse_vminfp(void)
+{
+	VD = _mm_min_ps(V0, V1);
+}
+
+void op_sse_vand(void)
+{
+	VD = _mm_and_ps(V0, V1);
+}
+
+void op_sse_vandc(void)
+{
+	VD = _mm_andnot_ps(V1, V0);
+}
+
+void op_sse_vor(void)
+{
+	VD = _mm_or_ps(V0, V1);
+}
+
+void op_sse_vxor(void)
+{
+	VD = _mm_xor_ps(V0, V1);
 }
 #endif
+
+/**
+ *	MMX optimizations
+ **/
+
+#if defined(__MMX__)
+#include <mmintrin.h>
+#undef VD
+#define VD ((__m64 *)reg_VD)
+#undef V0
+#define V0 ((__m64 *)reg_V0)
+#undef V1
+#define V1 ((__m64 *)reg_V1)
+#undef V2
+#define V2 ((__m64 *)reg_V2)
+
+void op_mmx_nop(void)
+{
+	asm volatile ("nop");
+}
+
+void op_emms(void)
+{
+	_mm_empty();
+}
+
+void op_mmx_vcmpequb(void)
+{
+	VD[0] = _mm_cmpeq_pi8(V0[0], V1[0]);
+	VD[1] = _mm_cmpeq_pi8(V0[1], V1[1]);
+}
+
+void op_mmx_vcmpequh(void)
+{
+	VD[0] = _mm_cmpeq_pi16(V0[0], V1[0]);
+	VD[1] = _mm_cmpeq_pi16(V0[1], V1[1]);
+}
+
+void op_mmx_vcmpequw(void)
+{
+	VD[0] = _mm_cmpeq_pi32(V0[0], V1[0]);
+	VD[1] = _mm_cmpeq_pi32(V0[1], V1[1]);
+}
+
+void op_mmx_vcmpgtsb(void)
+{
+	VD[0] = _mm_cmpgt_pi8(V0[0], V1[0]);
+	VD[1] = _mm_cmpgt_pi8(V0[1], V1[1]);
+}
+
+void op_mmx_vcmpgtsh(void)
+{
+	VD[0] = _mm_cmpgt_pi16(V0[0], V1[0]);
+	VD[1] = _mm_cmpgt_pi16(V0[1], V1[1]);
+}
+
+void op_mmx_vcmpgtsw(void)
+{
+	VD[0] = _mm_cmpgt_pi32(V0[0], V1[0]);
+	VD[1] = _mm_cmpgt_pi32(V0[1], V1[1]);
+}
+
+void op_mmx_vaddubm(void)
+{
+	VD[0] = _mm_add_pi8(V0[0], V1[0]);
+	VD[1] = _mm_add_pi8(V0[1], V1[1]);
+}
+
+void op_mmx_vadduhm(void)
+{
+	VD[0] = _mm_add_pi16(V0[0], V1[0]);
+	VD[1] = _mm_add_pi16(V0[1], V1[1]);
+}
+
+void op_mmx_vadduwm(void)
+{
+	VD[0] = _mm_add_pi32(V0[0], V1[0]);
+	VD[1] = _mm_add_pi32(V0[1], V1[1]);
+}
+
+void op_mmx_vsububm(void)
+{
+	VD[0] = _mm_sub_pi8(V0[0], V1[0]);
+	VD[1] = _mm_sub_pi8(V0[1], V1[1]);
+}
+
+void op_mmx_vsubuhm(void)
+{
+	VD[0] = _mm_sub_pi16(V0[0], V1[0]);
+	VD[1] = _mm_sub_pi16(V0[1], V1[1]);
+}
+
+void op_mmx_vsubuwm(void)
+{
+	VD[0] = _mm_sub_pi32(V0[0], V1[0]);
+	VD[1] = _mm_sub_pi32(V0[1], V1[1]);
+}
+
+void op_mmx_vand(void)
+{
+	VD[0] = _mm_and_si64(V0[0], V1[0]);
+	VD[1] = _mm_and_si64(V0[1], V1[1]);
+}
+
+void op_mmx_vandc(void)
+{
+	VD[0] = _mm_andnot_si64(V1[0], V0[0]);
+	VD[1] = _mm_andnot_si64(V1[1], V0[1]);
+}
+
+void op_mmx_vor(void)
+{
+	VD[0] = _mm_or_si64(V0[0], V1[0]);
+	VD[1] = _mm_or_si64(V0[1], V1[1]);
+}
+
+void op_mmx_vxor(void)
+{
+	VD[0] = _mm_xor_si64(V0[0], V1[0]);
+	VD[1] = _mm_xor_si64(V0[1], V1[1]);
+}
+
+#if defined(__SSE__)
+void op_mmx_vmaxub(void)
+{
+	VD[0] = _mm_max_pu8(V0[0], V1[0]);
+	VD[1] = _mm_max_pu8(V0[1], V1[1]);
+}
+
+void op_mmx_vminub(void)
+{
+	VD[0] = _mm_min_pu8(V0[0], V1[0]);
+	VD[1] = _mm_min_pu8(V0[1], V1[1]);
+}
+
+void op_mmx_vmaxsh(void)
+{
+	VD[0] = _mm_max_pi16(V0[0], V1[0]);
+	VD[1] = _mm_max_pi16(V0[1], V1[1]);
+}
+
+void op_mmx_vminsh(void)
+{
+	VD[0] = _mm_min_pi16(V0[0], V1[0]);
+	VD[1] = _mm_min_pi16(V0[1], V1[1]);
+}
+#endif
+#endif
diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp
index 63e53338..1088afc6 100644
--- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp
+++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.cpp
@@ -21,6 +21,7 @@
 #include "sysdeps.h"
 #include "cpu/ppc/ppc-dyngen.hpp"
 #include "cpu/ppc/ppc-bitfields.hpp"
+#include "cpu/ppc/ppc-instructions.hpp"
 
 #include <assert.h>
 #include <stdio.h>
@@ -29,6 +30,65 @@
 #define DEFINE_GEN(NAME,ARGS) void powerpc_dyngen::NAME ARGS
 #include "ppc-dyngen-ops.hpp"
 
+
+/**
+ *	Determine x86 CPU features
+ **/
+
+/* XXX: move that in CPU dependent bits */
+#if defined(__i386__) || defined(__x86_64__)
+static uint32 cpu_features = 0;
+
+enum {
+	HWCAP_I386_CMOV	= 1 << 15,
+	HWCAP_I386_MMX	= 1 << 23,
+	HWCAP_I386_SSE	= 1 << 25,
+	HWCAP_I386_SSE2	= 1 << 26,
+};
+
+static unsigned int
+x86_cpuid(void)
+{
+	int fl1, fl2;
+
+#ifndef __x86_64__
+	/* See if we can use cpuid. On AMD64 we always can. */
+	__asm__ ("pushfl; pushfl; popl %0; movl %0,%1; xorl %2,%0;"
+			 "pushl %0; popfl; pushfl; popl %0; popfl"
+			 : "=&r" (fl1), "=&r" (fl2)
+			 : "i" (0x00200000));
+	if (((fl1 ^ fl2) & 0x00200000) == 0)
+		return (0);
+#endif
+
+	/* Host supports cpuid. See if cpuid gives capabilities, try
+	   CPUID(0). Preserve %ebx and %ecx; cpuid insn clobbers these, we
+	   don't need their CPUID values here, and %ebx may be the PIC
+	   register. */
+	__asm__ ("push %%ecx ; push %%ebx ; cpuid ; pop %%ebx ; pop %%ecx"
+			 : "=a" (fl1) : "0" (0) : "edx", "cc");
+	if (fl1 == 0)
+		return (0);
+
+	/* Invoke CPUID(1), return %edx; caller can examine bits to
+	   determine what's supported. */
+#ifdef __x86_64__
+	__asm__ ("push %%rcx ; push %%rbx ; cpuid ; pop %%rbx ; pop %%rcx" : "=d" (fl2) : "a" (1) : "cc");
+#else
+	__asm__ ("push %%ecx ; push %%ebx ; cpuid ; pop %%ebx ; pop %%ecx" : "=d" (fl2) : "a" (1) : "cc");
+#endif
+
+	return fl2;
+}
+#endif
+
+powerpc_dyngen::powerpc_dyngen(dyngen_cpu_base cpu, int cache_size)
+	: basic_dyngen(cpu, cache_size)
+{
+#if defined(__i386__) || defined(__x86_64__)
+	cpu_features = x86_cpuid();
+#endif
+}
+
 void powerpc_dyngen::gen_compare_T0_T1(int crf)
 {
 	gen_op_compare_T0_T1();
@@ -267,92 +327,129 @@ void powerpc_dyngen::gen_store_vect_VS_T0(int vS)
 	gen_op_store_vect_VD_T0();
 }
 
-void powerpc_dyngen::gen_vaddfp(int vD, int vA, int vB)
+/**
+ *	Code generators for AltiVec instructions
+ **/
+
+powerpc_dyngen::gen_handler_t
+powerpc_dyngen::vector_codegen(int insn)
 {
-	gen_load_ad_VD_VR(vD);
-	gen_load_ad_V0_VR(vA);
-	gen_load_ad_V1_VR(vB);
-	gen_op_vaddfp_VD_V0_V1();
+	gen_handler_t gen_op = 0;
+	switch (insn) {
+#define GEN_OP(NAME) nv_mem_fun(&powerpc_dyngen::gen_op_##NAME)
+	case PPC_I(VADDFP):		gen_op = GEN_OP(vaddfp_VD_V0_V1);		break;
+	case PPC_I(VSUBFP):		gen_op = GEN_OP(vsubfp_VD_V0_V1);		break;
+	case PPC_I(VMADDFP):	gen_op = GEN_OP(vmaddfp_VD_V0_V1_V2);	break;
+	case PPC_I(VNMSUBFP):	gen_op = GEN_OP(vnmsubfp_VD_V0_V1_V2);	break;
+	case PPC_I(VMAXFP):		gen_op = GEN_OP(vmaxfp_VD_V0_V1);		break;
+	case PPC_I(VMINFP):		gen_op = GEN_OP(vminfp_VD_V0_V1);		break;
+	case PPC_I(VAND):		gen_op = GEN_OP(vand_VD_V0_V1);			break;
+	case PPC_I(VANDC):		gen_op = GEN_OP(vandc_VD_V0_V1);		break;
+	case PPC_I(VNOR):		gen_op = GEN_OP(vnor_VD_V0_V1);			break;
+	case PPC_I(VOR):		gen_op = GEN_OP(vor_VD_V0_V1);			break;
+	case PPC_I(VXOR):		gen_op = GEN_OP(vxor_VD_V0_V1);			break;
+#undef GEN_OP
+	}
+	return gen_op;
 }
 
-void powerpc_dyngen::gen_vsubfp(int vD, int vA, int vB)
+#if defined(__i386__) || defined(__x86_64__)
+powerpc_dyngen::gen_handler_t
+powerpc_dyngen::vector_codegen_mmx(int insn)
 {
-	gen_load_ad_VD_VR(vD);
-	gen_load_ad_V0_VR(vA);
-	gen_load_ad_V1_VR(vB);
-	gen_op_vsubfp_VD_V0_V1();
+#ifdef HAVE_gen_op_mmx_nop
+	if (!(cpu_features & HWCAP_I386_MMX))
+		return 0;
+
+	/* XXX: auto-generate the table with individual handlers */
+	gen_handler_t gen_op = 0;
+	switch (insn) {
+#define GEN_OP(NAME) nv_mem_fun(&powerpc_dyngen::gen_op_mmx_##NAME)
+	case PPC_I(VADDUBM):	gen_op = GEN_OP(vaddubm);	break;
+	case PPC_I(VADDUHM):	gen_op = GEN_OP(vadduhm);	break;
+	case PPC_I(VADDUWM):	gen_op = GEN_OP(vadduwm);	break;
+	case PPC_I(VAND):		gen_op = GEN_OP(vand);		break;
+	case PPC_I(VANDC):		gen_op = GEN_OP(vandc);		break;
+	case PPC_I(VCMPEQUB):	gen_op = GEN_OP(vcmpequb);	break;
+	case PPC_I(VCMPEQUH):	gen_op = GEN_OP(vcmpequh);	break;
+	case PPC_I(VCMPEQUW):	gen_op = GEN_OP(vcmpequw);	break;
+	case PPC_I(VCMPGTSB):	gen_op = GEN_OP(vcmpgtsb);	break;
+	case PPC_I(VCMPGTSH):	gen_op = GEN_OP(vcmpgtsh);	break;
+	case PPC_I(VCMPGTSW):	gen_op = GEN_OP(vcmpgtsw);	break;
+	case PPC_I(VOR):		gen_op = GEN_OP(vor);		break;
+	case PPC_I(VSUBUBM):	gen_op = GEN_OP(vsububm);	break;
+	case PPC_I(VSUBUHM):	gen_op = GEN_OP(vsubuhm);	break;
+	case PPC_I(VSUBUWM):	gen_op = GEN_OP(vsubuwm);	break;
+	case PPC_I(VXOR):		gen_op = GEN_OP(vxor);		break;
+#undef GEN_OP
+	}
+
+#ifdef HAVE_gen_op_sse_nop
+	if (gen_op.ptr())
+		return gen_op;
+
+	if (!(cpu_features & HWCAP_I386_SSE))
+		return 0;
+
+	/* XXX: is the MMX unit really used for those? */
+	switch (insn) {
+#define GEN_OP(NAME) nv_mem_fun(&powerpc_dyngen::gen_op_mmx_##NAME)
+	case PPC_I(VMAXSH):	gen_op = GEN_OP(vmaxsh);	break;
+	case PPC_I(VMAXUB):	gen_op = GEN_OP(vmaxub);	break;
+	case PPC_I(VMINSH):	gen_op = GEN_OP(vminsh);	break;
+	case PPC_I(VMINUB):	gen_op = GEN_OP(vminub);	break;
+#undef GEN_OP
+	}
+#endif
+	return gen_op;
+#endif
+
+	return 0;
 }
 
-void powerpc_dyngen::gen_vmaddfp(int vD, int vA, int vB, int vC)
+powerpc_dyngen::gen_handler_t
+powerpc_dyngen::vector_codegen_sse(int insn)
 {
-	gen_load_ad_VD_VR(vD);
-	gen_load_ad_V0_VR(vA);
-	gen_load_ad_V1_VR(vB);
-	gen_load_ad_V2_VR(vC);
-	gen_op_vmaddfp_VD_V0_V1_V2();
+#ifdef HAVE_gen_op_sse_nop
+	if (!(cpu_features & HWCAP_I386_SSE))
+		return 0;
+
+	/* XXX: auto-generate the table with individual handlers */
+	gen_handler_t gen_op = 0;
+	switch (insn) {
+#define GEN_OP(NAME) nv_mem_fun(&powerpc_dyngen::gen_op_sse_##NAME)
+	case PPC_I(VADDFP):		gen_op = GEN_OP(vaddfp);	break;
+	case PPC_I(VAND):		gen_op = GEN_OP(vand);		break;
+	case PPC_I(VANDC):		gen_op = GEN_OP(vandc);		break;
+	case PPC_I(VCMPEQFP):	gen_op = GEN_OP(vcmpeqfp);	break;
+	case PPC_I(VCMPGEFP):	gen_op = GEN_OP(vcmpgefp);	break;
+	case PPC_I(VCMPGTFP):	gen_op = GEN_OP(vcmpgtfp);	break;
+	case PPC_I(VMADDFP):	gen_op = GEN_OP(vmaddfp);	break;
+	case PPC_I(VMAXFP):		gen_op = GEN_OP(vmaxfp);	break;
+	case PPC_I(VMINFP):		gen_op = GEN_OP(vminfp);	break;
+	case PPC_I(VNMSUBFP):	gen_op = GEN_OP(vnmsubfp);	break;
+	case PPC_I(VOR):		gen_op = GEN_OP(vor);		break;
+	case PPC_I(VSUBFP):		gen_op = GEN_OP(vsubfp);	break;
+	case PPC_I(VXOR):		gen_op = GEN_OP(vxor);		break;
+#undef GEN_OP
+	}
+	return gen_op;
+#endif
+
+	return 0;
 }
 
-void powerpc_dyngen::gen_vnmsubfp(int vD, int vA, int vB, int vC)
+powerpc_dyngen::gen_handler_t
+powerpc_dyngen::vector_codegen_sse2(int insn)
 {
-	gen_load_ad_VD_VR(vD);
-	gen_load_ad_V0_VR(vA);
-	gen_load_ad_V1_VR(vB);
-	gen_load_ad_V2_VR(vC);
-	gen_op_vnmsubfp_VD_V0_V1_V2();
+	return 0;
 }
 
-void powerpc_dyngen::gen_vmaxfp(int vD, int vA, int vB)
+void powerpc_dyngen::gen_mmx_clear(void)
 {
-	gen_load_ad_VD_VR(vD);
-	gen_load_ad_V0_VR(vA);
-	gen_load_ad_V1_VR(vB);
-	gen_op_vmaxfp_VD_V0_V1();
-}
-
-void powerpc_dyngen::gen_vminfp(int vD, int vA, int vB)
-{
-	gen_load_ad_VD_VR(vD);
-	gen_load_ad_V0_VR(vA);
-	gen_load_ad_V1_VR(vB);
-	gen_op_vminfp_VD_V0_V1();
-}
-
-void powerpc_dyngen::gen_vand(int vD, int vA, int vB)
-{
-	gen_load_ad_VD_VR(vD);
-	gen_load_ad_V0_VR(vA);
-	gen_load_ad_V1_VR(vB);
-	gen_op_vand_VD_V0_V1();
-}
-
-void powerpc_dyngen::gen_vandc(int vD, int vA, int vB)
-{
-	gen_load_ad_VD_VR(vD);
-	gen_load_ad_V0_VR(vA);
-	gen_load_ad_V1_VR(vB);
-	gen_op_vandc_VD_V0_V1();
-}
-
-void powerpc_dyngen::gen_vnor(int vD, int vA, int vB)
-{
-	gen_load_ad_VD_VR(vD);
-	gen_load_ad_V0_VR(vA);
-	gen_load_ad_V1_VR(vB);
-	gen_op_vnor_VD_V0_V1();
-}
-
-void powerpc_dyngen::gen_vor(int vD, int vA, int vB)
-{
-	gen_load_ad_VD_VR(vD);
-	gen_load_ad_V0_VR(vA);
-	gen_load_ad_V1_VR(vB);
-	gen_op_vor_VD_V0_V1();
-}
-
-void powerpc_dyngen::gen_vxor(int vD, int vA, int vB)
-{
-	gen_load_ad_VD_VR(vD);
-	gen_load_ad_V0_VR(vA);
-	gen_load_ad_V1_VR(vB);
-	gen_op_vxor_VD_V0_V1();
+#ifdef HAVE_gen_op_mmx_nop
+	if (cpu_features & HWCAP_I386_MMX)
+		gen_op_emms();
+#endif
 }
+#endif
diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp
index 0ff93b1c..333c7552 100644
--- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp
+++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-dyngen.hpp
@@ -22,6 +22,7 @@
 #define PPC_DYNGEN_H
 
 #include "sysdeps.h"
+#include "nvmemfun.hpp"
 #include "cpu/ppc/ppc-config.hpp"
 
 #if PPC_ENABLE_JIT
@@ -42,10 +43,11 @@ public:
 	// Make rc_cache accessible to codegen helper
 	friend class powerpc_dyngen_helper;
 
+	// Code generators
+	typedef nv_mem_fun_t< void, powerpc_dyngen > gen_handler_t;
+
 	// Default constructor
-	powerpc_dyngen(dyngen_cpu_base cpu, int cache_size = -1)
-		: basic_dyngen(cpu, cache_size)
-		{ }
+	powerpc_dyngen(dyngen_cpu_base cpu, int cache_size = -1);
 
 	// Load/store registers
 	void gen_load_A0_GPR(int i);
@@ -228,17 +230,16 @@ public:
 	void gen_load_vect_VD_T0(int vD);
 	void gen_store_word_VS_T0(int vS);
 	void gen_store_vect_VS_T0(int vS);
-	void gen_vaddfp(int vD, int vA, int vB);
-	void gen_vsubfp(int vD, int vA, int vB);
-	void gen_vmaddfp(int vD, int vA, int vB, int vC);
-	void gen_vnmsubfp(int vD, int vA, int vB, int vC);
-	void gen_vmaxfp(int vD, int vA, int vB);
-	void gen_vminfp(int vD, int vA, int vB);
-	void gen_vand(int vD, int vA, int vB);
-	void gen_vandc(int vD, int vA, int vB);
-	void gen_vnor(int vD, int vA, int vB);
-	void gen_vor(int vD, int vA, int vB);
-	void gen_vxor(int vD, int vA, int vB);
+	DEFINE_ALIAS(record_cr6_VD,0);
+
+	// Code generators for AltiVec instructions
+	gen_handler_t vector_codegen(int insn);
+#if defined(__i386__) || defined(__x86_64__)
+	gen_handler_t vector_codegen_mmx(int insn);
+	gen_handler_t vector_codegen_sse(int insn);
+	gen_handler_t vector_codegen_sse2(int insn);
+	void gen_mmx_clear(void);
+#endif
 
 #undef DEFINE_ALIAS
 #undef DEFINE_ALIAS_0
diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-registers.hpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-registers.hpp
index 7302a598..debe5ffe 100644
--- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-registers.hpp
+++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-registers.hpp
@@ -210,8 +210,13 @@ union powerpc_vr
 	uint32 w[4];
 	uint64 j[2];
 	float f[4];
-	double d[2];
-};
+}
+#if defined(__GNUC__)
+// 16-byte alignment is required for SIMD optimizations operating on
+// 128-bit aligned registers (e.g. SSE).
+__attribute__((aligned(16)))
+#endif
+;
 
 
 /**
diff --git a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp
index 2c42733d..5ec54a29 100644
--- a/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp
+++ b/SheepShaver/src/kpx_cpu/src/cpu/ppc/ppc-translate.cpp
@@ -1309,33 +1309,68 @@ powerpc_cpu::compile_block(uint32 entry_point)
 			break;
 		}
 		case PPC_I(VADDFP):
-		case PPC_I(VSUBFP):
-		case PPC_I(VMADDFP):
-		case PPC_I(VNMSUBFP):
-		case PPC_I(VMAXFP):
-		case PPC_I(VMINFP):
+		case PPC_I(VADDUBM):
+		case PPC_I(VADDUHM):
+		case PPC_I(VADDUWM):
 		case PPC_I(VAND):
 		case PPC_I(VANDC):
+		case PPC_I(VCMPEQFP):
+		case PPC_I(VCMPEQUB):
+		case PPC_I(VCMPEQUH):
+		case PPC_I(VCMPEQUW):
+		case PPC_I(VCMPGEFP):
+		case PPC_I(VCMPGTFP):
+		case PPC_I(VCMPGTSB):
+		case PPC_I(VCMPGTSH):
+		case PPC_I(VCMPGTSW):
+		case PPC_I(VMADDFP):
+		case PPC_I(VMAXFP):
+		case PPC_I(VMAXSH):
+		case PPC_I(VMAXUB):
+		case PPC_I(VMINFP):
+		case PPC_I(VMINSH):
+		case PPC_I(VMINUB):
+		case PPC_I(VNMSUBFP):
 		case PPC_I(VNOR):
 		case PPC_I(VOR):
+		case PPC_I(VSUBFP):
+		case PPC_I(VSUBUBM):
+		case PPC_I(VSUBUHM):
+		case PPC_I(VSUBUWM):
 		case PPC_I(VXOR):
 		{
-			const int vD = vD_field::extract(opcode);
-			const int vA = vA_field::extract(opcode);
-			const int vB = vB_field::extract(opcode);
-			switch (ii->mnemo) {
-			case PPC_I(VADDFP):		dg.gen_vaddfp(vD, vA, vB);		break;
-			case PPC_I(VSUBFP):		dg.gen_vsubfp(vD, vA, vB);		break;
-			case PPC_I(VMADDFP):	dg.gen_vmaddfp(vD, vA, vB, vC_field::extract(opcode));		break;
-			case PPC_I(VNMSUBFP):	dg.gen_vnmsubfp(vD, vA, vB, vC_field::extract(opcode));	break;
-			case PPC_I(VMAXFP):		dg.gen_vmaxfp(vD, vA, vB);		break;
-			case PPC_I(VMINFP):		dg.gen_vminfp(vD, vA, vB);		break;
-			case PPC_I(VAND):		dg.gen_vand(vD, vA, vB);		break;
-			case PPC_I(VANDC):		dg.gen_vandc(vD, vA, vB);		break;
-			case PPC_I(VNOR):		dg.gen_vnor(vD, vA, vB);		break;
-			case PPC_I(VOR):		dg.gen_vor(vD, vA, vB);			break;
-			case PPC_I(VXOR):		dg.gen_vxor(vD, vA, vB);		break;
+			powerpc_dyngen::gen_handler_t gen_op = 0;
+#if defined(__i386__) || defined(__x86_64__)
+			/* XXX: analyze the block function */
+			bool mmx_used = false;
+
+			if ((gen_op = dg.vector_codegen_sse(ii->mnemo)).ptr()) {
+				/* SSE code generator available */
 			}
+			else if ((gen_op = dg.vector_codegen_mmx(ii->mnemo)).ptr()) {
+				/* MMX code generator available */
+				mmx_used = true;
+			}
+			else
+#endif
+				gen_op = dg.vector_codegen(ii->mnemo);
+
+			if (!gen_op.ptr())
+				goto do_generic;
+
+			dg.gen_load_ad_VD_VR(vD_field::extract(opcode));
+			dg.gen_load_ad_V0_VR(vA_field::extract(opcode));
+			dg.gen_load_ad_V1_VR(vB_field::extract(opcode));
+			if (ii->format == VA_form)
+				dg.gen_load_ad_V2_VR(vC_field::extract(opcode));
+			gen_op(&dg);
+			if (ii->format == VXR_form && vRc_field::test(opcode))
+				dg.gen_record_cr6_VD();
+
+#if defined(__i386__) || defined(__x86_64__)
+			if (mmx_used)
+				dg.gen_mmx_clear();
+#endif
 			break;
 		}
 		default:	// Direct call to instruction handler