First round of SSE/MMX optimizations and experiments. AltiVec Fractal

Carbon performance increased by a factor of 8 (420 MegaFlops).
gbeauche 2004-02-20 17:18:44 +00:00
parent 2b1f76f343
commit 443231c1da
5 changed files with 511 additions and 122 deletions

View File

@@ -39,28 +39,33 @@ register struct powerpc_cpu *CPU asm(REG_CPU);
#define REG32(X) X
#endif
#define FPREG(X) ((powerpc_fpr *)(X))
#define VREG(X) ((powerpc_vr *)(X))[0]
#define A0 REG32(reg_A0)
#define VD VREG(reg_A0)
register uintptr reg_A0 asm(REG_A0);
#define T0 REG32(reg_T0)
#define F0 FPREG(reg_T0)->d
#define F0_dw FPREG(reg_T0)->j
#define V0 VREG(reg_T0)
register uintptr reg_T0 asm(REG_T0);
#define T1 REG32(reg_T1)
#define F1 FPREG(reg_T1)->d
#define F1_dw FPREG(reg_T1)->j
#define V1 VREG(reg_T1)
register uintptr reg_T1 asm(REG_T1);
#define T2 REG32(reg_T2)
#define F2 FPREG(reg_T2)->d
#define F2_dw FPREG(reg_T2)->j
#define V2 VREG(reg_T2)
register uintptr reg_T2 asm(REG_T2);
#define FD powerpc_dyngen_helper::fp_result()
#define FD_dw powerpc_dyngen_helper::fp_result_dw()
// Vector registers
#define VREG(X) ((powerpc_vr *)(X))[0]
#define VD VREG(reg_VD)
#define reg_VD reg_A0
#define V0 VREG(reg_V0)
#define reg_V0 reg_T0
#define V1 VREG(reg_V1)
#define reg_V1 reg_T1
#define V2 VREG(reg_V2)
#define reg_V2 reg_T2
/**
* Helper class to access protected CPU context
@@ -1416,9 +1421,21 @@ void op_vmaddfp_VD_V0_V1_V2(void)
vector_execute<op_vmaddfp, V4SF, V4SF, V4SF, V4SF>::apply();
}
#if defined(__i386__) && defined(__SSE__)
// Work around a gcc 3.2.2 miscompilation that inserts SSE instructions
struct op_do_vnmsubfp {
static inline float apply(float x, float y, float z) {
// return 0. - ((x * z) - y);
return y - (x * z);
}
};
#else
typedef op_vnmsubfp op_do_vnmsubfp;
#endif
void op_vnmsubfp_VD_V0_V1_V2(void)
{
vector_execute<op_vnmsubfp, V4SF, V4SF, V4SF, V4SF>::apply();
vector_execute<op_do_vnmsubfp, V4SF, V4SF, V4SF, V4SF>::apply();
}
void op_vmaxfp_VD_V0_V1(void)
@@ -1456,14 +1473,248 @@ void op_vxor_VD_V0_V1(void)
vector_execute<op_xor_64, V2DI, V2DI, V2DI>::apply();
}
#ifdef LONG_OPERATIONS
void op_vcmpeqfp_VD_V0_V1(void)
void op_record_cr6_VD(void)
{
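// AltiVec "record" compares set CR6: 0b1000 (the LT bit) when the compare
// was true in every element position, 0b0010 (the EQ bit) when it was true
// in none, and 0 otherwise.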
vector_execute<op_cmp_eq<float>, V4SF, V4SF, V4SF>::apply();
if (VD.j[0] == UVAL64(0xffffffffffffffff) &&
VD.j[1] == UVAL64(0xffffffffffffffff))
powerpc_dyngen_helper::cr().set(6, 8);
else if (VD.j[0] == UVAL64(0) && VD.j[1] == UVAL64(0))
powerpc_dyngen_helper::cr().set(6, 2);
else
powerpc_dyngen_helper::cr().set(6, 0);
dyngen_barrier();
}
void op_vaddubm_VD_V0_V1(void)
/**
* SSE optimizations
**/
#if defined(__SSE__)
#include <xmmintrin.h>
#undef VD
#define VD *((__m128 *)reg_VD)
#undef V0
#define V0 *((__m128 *)reg_V0)
#undef V1
#define V1 *((__m128 *)reg_V1)
#undef V2
#define V2 *((__m128 *)reg_V2)
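With these redefinitions each emulated AltiVec register is accessed directly as an SSE __m128 value. A minimal standalone sketch of the same mapping (illustrative only; the vr_* buffers and main are not part of the commit):
#include <xmmintrin.h>
#include <stdio.h>
int main(void)
{
// 16-byte-aligned buffers standing in for emulated vector registers
float vr_a[4] __attribute__((aligned(16))) = { 1, 2, 3, 4 };
float vr_b[4] __attribute__((aligned(16))) = { 10, 20, 30, 40 };
float vr_d[4] __attribute__((aligned(16)));
// Same pattern as op_sse_vaddfp: reinterpret the register storage
// as __m128 and let the SSE unit do the 4-way float add.
*(__m128 *)vr_d = _mm_add_ps(*(__m128 *)vr_a, *(__m128 *)vr_b);
printf("%g %g %g %g\n", vr_d[0], vr_d[1], vr_d[2], vr_d[3]);
return 0;
}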
void op_sse_nop(void)
{
vector_execute<op_template_add<uint8>, V16QI, V16QI, V16QI>::apply();
asm volatile ("nop");
}
void op_sse_vcmpeqfp(void)
{
VD = _mm_cmpeq_ps(V0, V1);
}
void op_sse_vcmpgefp(void)
{
VD = _mm_cmpge_ps(V0, V1);
}
void op_sse_vcmpgtfp(void)
{
VD = _mm_cmpgt_ps(V0, V1);
}
void op_sse_vaddfp(void)
{
VD = _mm_add_ps(V0, V1);
}
void op_sse_vsubfp(void)
{
VD = _mm_sub_ps(V0, V1);
}
void op_sse_vmaddfp(void)
{
VD = _mm_add_ps(_mm_mul_ps(V0, V2), V1);
}
void op_sse_vnmsubfp(void)
{
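// vnmsubfp computes -((vA * vC) - vB); without a fused multiply-add the
// intermediate product is rounded, which can differ in the last bit from
// the single-rounding AltiVec operation.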
VD = _mm_sub_ps(_mm_setzero_ps(), _mm_sub_ps(_mm_mul_ps(V0, V2), V1));
}
void op_sse_vmaxfp(void)
{
VD = _mm_max_ps(V0, V1);
}
void op_sse_vminfp(void)
{
VD = _mm_min_ps(V0, V1);
}
void op_sse_vand(void)
{
VD = _mm_and_ps(V0, V1);
}
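// Note: _mm_andnot_ps(a, b) computes (~a) & b, so the operands are
// swapped below to get AltiVec vandc semantics, vA & ~vB.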
void op_sse_vandc(void)
{
VD = _mm_andnot_ps(V1, V0);
}
void op_sse_vor(void)
{
VD = _mm_or_ps(V0, V1);
}
void op_sse_vxor(void)
{
VD = _mm_xor_ps(V0, V1);
}
#endif
/**
* MMX optimizations
**/
#if defined(__MMX__)
#include <mmintrin.h>
#undef VD
#define VD ((__m64 *)reg_VD)
#undef V0
#define V0 ((__m64 *)reg_V0)
#undef V1
#define V1 ((__m64 *)reg_V1)
#undef V2
#define V2 ((__m64 *)reg_V2)
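// MMX registers are only 64 bits wide, so each 128-bit AltiVec
// operation below is performed as two independent 64-bit halves.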
void op_mmx_nop(void)
{
asm volatile ("nop");
}
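// The MMX registers alias the x87 floating-point stack, so EMMS must be
// executed after a run of MMX code before FP instructions are used again.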
void op_emms(void)
{
_mm_empty();
}
void op_mmx_vcmpequb(void)
{
VD[0] = _mm_cmpeq_pi8(V0[0], V1[0]);
VD[1] = _mm_cmpeq_pi8(V0[1], V1[1]);
}
void op_mmx_vcmpequh(void)
{
VD[0] = _mm_cmpeq_pi16(V0[0], V1[0]);
VD[1] = _mm_cmpeq_pi16(V0[1], V1[1]);
}
void op_mmx_vcmpequw(void)
{
VD[0] = _mm_cmpeq_pi32(V0[0], V1[0]);
VD[1] = _mm_cmpeq_pi32(V0[1], V1[1]);
}
void op_mmx_vcmpgtsb(void)
{
VD[0] = _mm_cmpgt_pi8(V0[0], V1[0]);
VD[1] = _mm_cmpgt_pi8(V0[1], V1[1]);
}
void op_mmx_vcmpgtsh(void)
{
VD[0] = _mm_cmpgt_pi16(V0[0], V1[0]);
VD[1] = _mm_cmpgt_pi16(V0[1], V1[1]);
}
void op_mmx_vcmpgtsw(void)
{
VD[0] = _mm_cmpgt_pi32(V0[0], V1[0]);
VD[1] = _mm_cmpgt_pi32(V0[1], V1[1]);
}
void op_mmx_vaddubm(void)
{
VD[0] = _mm_add_pi8(V0[0], V1[0]);
VD[1] = _mm_add_pi8(V0[1], V1[1]);
}
void op_mmx_vadduhm(void)
{
VD[0] = _mm_add_pi16(V0[0], V1[0]);
VD[1] = _mm_add_pi16(V0[1], V1[1]);
}
void op_mmx_vadduwm(void)
{
VD[0] = _mm_add_pi32(V0[0], V1[0]);
VD[1] = _mm_add_pi32(V0[1], V1[1]);
}
void op_mmx_vsububm(void)
{
VD[0] = _mm_sub_pi8(V0[0], V1[0]);
VD[1] = _mm_sub_pi8(V0[1], V1[1]);
}
void op_mmx_vsubuhm(void)
{
VD[0] = _mm_sub_pi16(V0[0], V1[0]);
VD[1] = _mm_sub_pi16(V0[1], V1[1]);
}
void op_mmx_vsubuwm(void)
{
VD[0] = _mm_sub_pi32(V0[0], V1[0]);
VD[1] = _mm_sub_pi32(V0[1], V1[1]);
}
void op_mmx_vand(void)
{
VD[0] = _mm_and_si64(V0[0], V1[0]);
VD[1] = _mm_and_si64(V0[1], V1[1]);
}
void op_mmx_vandc(void)
{
VD[0] = _mm_andnot_si64(V1[0], V0[0]);
VD[1] = _mm_andnot_si64(V1[1], V0[1]);
}
void op_mmx_vor(void)
{
VD[0] = _mm_or_si64(V0[0], V1[0]);
VD[1] = _mm_or_si64(V0[1], V1[1]);
}
void op_mmx_vxor(void)
{
VD[0] = _mm_xor_si64(V0[0], V1[0]);
VD[1] = _mm_xor_si64(V0[1], V1[1]);
}
#if defined(__SSE__)
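// Packed min/max on MMX registers (pmaxub, pminub, pmaxsw, pminsw) were
// only introduced with the SSE extensions, hence the extra guard here.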
void op_mmx_vmaxub(void)
{
VD[0] = _mm_max_pu8(V0[0], V1[0]);
VD[1] = _mm_max_pu8(V0[1], V1[1]);
}
void op_mmx_vminub(void)
{
VD[0] = _mm_min_pu8(V0[0], V1[0]);
VD[1] = _mm_min_pu8(V0[1], V1[1]);
}
void op_mmx_vmaxsh(void)
{
VD[0] = _mm_max_pi16(V0[0], V1[0]);
VD[1] = _mm_max_pi16(V0[1], V1[1]);
}
void op_mmx_vminsh(void)
{
VD[0] = _mm_min_pi16(V0[0], V1[0]);
VD[1] = _mm_min_pi16(V0[1], V1[1]);
}
#endif
#endif

View File

@@ -21,6 +21,7 @@
#include "sysdeps.h"
#include "cpu/ppc/ppc-dyngen.hpp"
#include "cpu/ppc/ppc-bitfields.hpp"
#include "cpu/ppc/ppc-instructions.hpp"
#include <assert.h>
#include <stdlib.h>
@@ -29,6 +30,65 @@
#define DEFINE_GEN(NAME,ARGS) void powerpc_dyngen::NAME ARGS
#include "ppc-dyngen-ops.hpp"
/**
* Determine x86 CPU features
**/
/* XXX: move this into CPU-dependent code */
#if defined(__i386__) || defined(__x86_64__)
static uint32 cpu_features = 0;
enum {
HWCAP_I386_CMOV = 1 << 15,
HWCAP_I386_MMX = 1 << 23,
HWCAP_I386_SSE = 1 << 25,
HWCAP_I386_SSE2 = 1 << 26,
};
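/* These values mirror the feature bits returned in EDX by CPUID(1):
   CMOV is bit 15, MMX bit 23, SSE bit 25, SSE2 bit 26. */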
static unsigned int x86_cpuid(void)
{
int fl1, fl2;
#ifndef __x86_64__
/* See if we can use cpuid. On AMD64 we always can. */
__asm__ ("pushfl; pushfl; popl %0; movl %0,%1; xorl %2,%0;"
"pushl %0; popfl; pushfl; popl %0; popfl"
: "=&r" (fl1), "=&r" (fl2)
: "i" (0x00200000));
if (((fl1 ^ fl2) & 0x00200000) == 0)
return (0);
#endif
/* Host supports cpuid. See if cpuid gives capabilities, try
CPUID(0). Preserve %ebx and %ecx; cpuid insn clobbers these, we
don't need their CPUID values here, and %ebx may be the PIC
register. */
__asm__ ("push %%ecx ; push %%ebx ; cpuid ; pop %%ebx ; pop %%ecx"
: "=a" (fl1) : "0" (0) : "edx", "cc");
if (fl1 == 0)
return (0);
/* Invoke CPUID(1), return %edx; caller can examine bits to
determine what's supported. */
#ifdef __x86_64__
__asm__ ("push %%rcx ; push %%rbx ; cpuid ; pop %%rbx ; pop %%rcx" : "=d" (fl2) : "a" (1) : "cc");
#else
__asm__ ("push %%ecx ; push %%ebx ; cpuid ; pop %%ebx ; pop %%ecx" : "=d" (fl2) : "a" (1) : "cc");
#endif
return fl2;
}
#endif
powerpc_dyngen::powerpc_dyngen(dyngen_cpu_base cpu, int cache_size)
: basic_dyngen(cpu, cache_size)
{
#if defined(__i386__) || defined(__x86_64__)
cpu_features = x86_cpuid();
#endif
}
void powerpc_dyngen::gen_compare_T0_T1(int crf)
{
gen_op_compare_T0_T1();
@@ -267,92 +327,129 @@ void powerpc_dyngen::gen_store_vect_VS_T0(int vS)
gen_op_store_vect_VD_T0();
}
void powerpc_dyngen::gen_vaddfp(int vD, int vA, int vB)
/**
* Code generators for AltiVec instructions
**/
powerpc_dyngen::gen_handler_t
powerpc_dyngen::vector_codegen(int insn)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vaddfp_VD_V0_V1();
gen_handler_t gen_op = 0;
switch (insn) {
#define GEN_OP(NAME) nv_mem_fun(&powerpc_dyngen::gen_op_##NAME)
case PPC_I(VADDFP): gen_op = GEN_OP(vaddfp_VD_V0_V1); break;
case PPC_I(VSUBFP): gen_op = GEN_OP(vsubfp_VD_V0_V1); break;
case PPC_I(VMADDFP): gen_op = GEN_OP(vmaddfp_VD_V0_V1_V2); break;
case PPC_I(VNMSUBFP): gen_op = GEN_OP(vnmsubfp_VD_V0_V1_V2); break;
case PPC_I(VMAXFP): gen_op = GEN_OP(vmaxfp_VD_V0_V1); break;
case PPC_I(VMINFP): gen_op = GEN_OP(vminfp_VD_V0_V1); break;
case PPC_I(VAND): gen_op = GEN_OP(vand_VD_V0_V1); break;
case PPC_I(VANDC): gen_op = GEN_OP(vandc_VD_V0_V1); break;
case PPC_I(VNOR): gen_op = GEN_OP(vnor_VD_V0_V1); break;
case PPC_I(VOR): gen_op = GEN_OP(vor_VD_V0_V1); break;
case PPC_I(VXOR): gen_op = GEN_OP(vxor_VD_V0_V1); break;
#undef GEN_OP
}
return gen_op;
}
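The generators are returned as bound member-function pointers (the project's nv_mem_fun wrapper). A simplified standalone sketch of the same dispatch pattern, with hypothetical names in place of the real dyngen types:
#include <cstdio>
struct codegen {
void emit_vaddfp() { std::printf("emit vaddfp\n"); }
void emit_vsubfp() { std::printf("emit vsubfp\n"); }
};
typedef void (codegen::*gen_handler_t)();
// Select a generator for an opcode, or null if unhandled
gen_handler_t select(int insn)
{
switch (insn) {
case 0: return &codegen::emit_vaddfp;
case 1: return &codegen::emit_vsubfp;
}
return 0;
}
int main()
{
codegen cg;
gen_handler_t op = select(0);
if (op)
(cg.*op)();  // invoke through the member-function pointer
return 0;
}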
void powerpc_dyngen::gen_vsubfp(int vD, int vA, int vB)
#if defined(__i386__) || defined(__x86_64__)
powerpc_dyngen::gen_handler_t
powerpc_dyngen::vector_codegen_mmx(int insn)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vsubfp_VD_V0_V1();
#ifdef HAVE_gen_op_mmx_nop
if (!(cpu_features & HWCAP_I386_MMX))
return 0;
/* XXX: auto-generate the table with individual handlers */
gen_handler_t gen_op = 0;
switch (insn) {
#define GEN_OP(NAME) nv_mem_fun(&powerpc_dyngen::gen_op_mmx_##NAME)
case PPC_I(VADDUBM): gen_op = GEN_OP(vaddubm); break;
case PPC_I(VADDUHM): gen_op = GEN_OP(vadduhm); break;
case PPC_I(VADDUWM): gen_op = GEN_OP(vadduwm); break;
case PPC_I(VAND): gen_op = GEN_OP(vand); break;
case PPC_I(VANDC): gen_op = GEN_OP(vandc); break;
case PPC_I(VCMPEQUB): gen_op = GEN_OP(vcmpequb); break;
case PPC_I(VCMPEQUH): gen_op = GEN_OP(vcmpequh); break;
case PPC_I(VCMPEQUW): gen_op = GEN_OP(vcmpequw); break;
case PPC_I(VCMPGTSB): gen_op = GEN_OP(vcmpgtsb); break;
case PPC_I(VCMPGTSH): gen_op = GEN_OP(vcmpgtsh); break;
case PPC_I(VCMPGTSW): gen_op = GEN_OP(vcmpgtsw); break;
case PPC_I(VOR): gen_op = GEN_OP(vor); break;
case PPC_I(VSUBUBM): gen_op = GEN_OP(vsububm); break;
case PPC_I(VSUBUHM): gen_op = GEN_OP(vsubuhm); break;
case PPC_I(VSUBUWM): gen_op = GEN_OP(vsubuwm); break;
case PPC_I(VXOR): gen_op = GEN_OP(vxor); break;
#undef GEN_OP
}
#ifdef HAVE_gen_op_sse_nop
if (gen_op.ptr())
return gen_op;
if (!(cpu_features & HWCAP_I386_SSE))
return 0;
/* XXX: is the MMX unit really used for those? */
switch (insn) {
#define GEN_OP(NAME) nv_mem_fun(&powerpc_dyngen::gen_op_mmx_##NAME)
case PPC_I(VMAXSH): gen_op = GEN_OP(vmaxsh); break;
case PPC_I(VMAXUB): gen_op = GEN_OP(vmaxub); break;
case PPC_I(VMINSH): gen_op = GEN_OP(vminsh); break;
case PPC_I(VMINUB): gen_op = GEN_OP(vminub); break;
#undef GEN_OP
}
#endif
return gen_op;
#endif
return 0;
}
void powerpc_dyngen::gen_vmaddfp(int vD, int vA, int vB, int vC)
powerpc_dyngen::gen_handler_t
powerpc_dyngen::vector_codegen_sse(int insn)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_load_ad_V2_VR(vC);
gen_op_vmaddfp_VD_V0_V1_V2();
#ifdef HAVE_gen_op_sse_nop
if (!(cpu_features & HWCAP_I386_SSE))
return 0;
/* XXX: auto-generate the table with individual handlers */
gen_handler_t gen_op = 0;
switch (insn) {
#define GEN_OP(NAME) nv_mem_fun(&powerpc_dyngen::gen_op_sse_##NAME)
case PPC_I(VADDFP): gen_op = GEN_OP(vaddfp); break;
case PPC_I(VAND): gen_op = GEN_OP(vand); break;
case PPC_I(VANDC): gen_op = GEN_OP(vandc); break;
case PPC_I(VCMPEQFP): gen_op = GEN_OP(vcmpeqfp); break;
case PPC_I(VCMPGEFP): gen_op = GEN_OP(vcmpgefp); break;
case PPC_I(VCMPGTFP): gen_op = GEN_OP(vcmpgtfp); break;
case PPC_I(VMADDFP): gen_op = GEN_OP(vmaddfp); break;
case PPC_I(VMAXFP): gen_op = GEN_OP(vmaxfp); break;
case PPC_I(VMINFP): gen_op = GEN_OP(vminfp); break;
case PPC_I(VNMSUBFP): gen_op = GEN_OP(vnmsubfp); break;
case PPC_I(VOR): gen_op = GEN_OP(vor); break;
case PPC_I(VSUBFP): gen_op = GEN_OP(vsubfp); break;
case PPC_I(VXOR): gen_op = GEN_OP(vxor); break;
#undef GEN_OP
}
return gen_op;
#endif
return 0;
}
void powerpc_dyngen::gen_vnmsubfp(int vD, int vA, int vB, int vC)
powerpc_dyngen::gen_handler_t
powerpc_dyngen::vector_codegen_sse2(int insn)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_load_ad_V2_VR(vC);
gen_op_vnmsubfp_VD_V0_V1_V2();
return 0;
}
void powerpc_dyngen::gen_vmaxfp(int vD, int vA, int vB)
void powerpc_dyngen::gen_mmx_clear(void)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vmaxfp_VD_V0_V1();
}
void powerpc_dyngen::gen_vminfp(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vminfp_VD_V0_V1();
}
void powerpc_dyngen::gen_vand(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vand_VD_V0_V1();
}
void powerpc_dyngen::gen_vandc(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vandc_VD_V0_V1();
}
void powerpc_dyngen::gen_vnor(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vnor_VD_V0_V1();
}
void powerpc_dyngen::gen_vor(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vor_VD_V0_V1();
}
void powerpc_dyngen::gen_vxor(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vxor_VD_V0_V1();
#ifdef HAVE_gen_op_mmx_nop
if (cpu_features & HWCAP_I386_MMX)
gen_op_emms();
#endif
}
#endif

View File

@@ -22,6 +22,7 @@
#define PPC_DYNGEN_H
#include "sysdeps.h"
#include "nvmemfun.hpp"
#include "cpu/ppc/ppc-config.hpp"
#if PPC_ENABLE_JIT
@@ -42,10 +43,11 @@ public:
// Make rc_cache accessible to codegen helper
friend class powerpc_dyngen_helper;
// Code generators
typedef nv_mem_fun_t< void, powerpc_dyngen > gen_handler_t;
// Default constructor
powerpc_dyngen(dyngen_cpu_base cpu, int cache_size = -1)
: basic_dyngen(cpu, cache_size)
{ }
powerpc_dyngen(dyngen_cpu_base cpu, int cache_size = -1);
// Load/store registers
void gen_load_A0_GPR(int i);
@@ -228,17 +230,16 @@ public:
void gen_load_vect_VD_T0(int vD);
void gen_store_word_VS_T0(int vS);
void gen_store_vect_VS_T0(int vS);
void gen_vaddfp(int vD, int vA, int vB);
void gen_vsubfp(int vD, int vA, int vB);
void gen_vmaddfp(int vD, int vA, int vB, int vC);
void gen_vnmsubfp(int vD, int vA, int vB, int vC);
void gen_vmaxfp(int vD, int vA, int vB);
void gen_vminfp(int vD, int vA, int vB);
void gen_vand(int vD, int vA, int vB);
void gen_vandc(int vD, int vA, int vB);
void gen_vnor(int vD, int vA, int vB);
void gen_vor(int vD, int vA, int vB);
void gen_vxor(int vD, int vA, int vB);
DEFINE_ALIAS(record_cr6_VD,0);
// Code generators for AltiVec instructions
gen_handler_t vector_codegen(int insn);
#if defined(__i386__) || defined(__x86_64__)
gen_handler_t vector_codegen_mmx(int insn);
gen_handler_t vector_codegen_sse(int insn);
gen_handler_t vector_codegen_sse2(int insn);
void gen_mmx_clear(void);
#endif
#undef DEFINE_ALIAS
#undef DEFINE_ALIAS_0

View File

@@ -210,8 +210,13 @@ union powerpc_vr
uint32 w[4];
uint64 j[2];
float f[4];
double d[2];
};
}
#if defined(__GNUC__)
// 16-byte alignment is required for SIMD optimizations that access the
// 128-bit registers through aligned loads and stores (e.g. SSE).
__attribute__((aligned(16)))
#endif
;
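For illustration, a minimal check of why the attribute matters (assumes GCC; example_vr and main are not part of the commit): SSE aligned loads such as movaps fault on addresses that are not 16-byte aligned, so the register union must guarantee that alignment before being cast to __m128 *.
#include <xmmintrin.h>
#include <assert.h>
#include <stdint.h>
union example_vr {
float f[4];
uint64_t j[2];
} __attribute__((aligned(16)));
int main(void)
{
union example_vr vr = { { 1, 2, 3, 4 } };
// movaps (generated by _mm_load_ps) requires 16-byte alignment
assert(((uintptr_t)&vr & 15) == 0);
__m128 v = _mm_load_ps(vr.f);
(void)v;
return 0;
}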
/**

View File

@@ -1309,33 +1309,68 @@ powerpc_cpu::compile_block(uint32 entry_point)
break;
}
case PPC_I(VADDFP):
case PPC_I(VSUBFP):
case PPC_I(VMADDFP):
case PPC_I(VNMSUBFP):
case PPC_I(VMAXFP):
case PPC_I(VMINFP):
case PPC_I(VADDUBM):
case PPC_I(VADDUHM):
case PPC_I(VADDUWM):
case PPC_I(VAND):
case PPC_I(VANDC):
case PPC_I(VCMPEQFP):
case PPC_I(VCMPEQUB):
case PPC_I(VCMPEQUH):
case PPC_I(VCMPEQUW):
case PPC_I(VCMPGEFP):
case PPC_I(VCMPGTFP):
case PPC_I(VCMPGTSB):
case PPC_I(VCMPGTSH):
case PPC_I(VCMPGTSW):
case PPC_I(VMADDFP):
case PPC_I(VMAXFP):
case PPC_I(VMAXSH):
case PPC_I(VMAXUB):
case PPC_I(VMINFP):
case PPC_I(VMINSH):
case PPC_I(VMINUB):
case PPC_I(VNMSUBFP):
case PPC_I(VNOR):
case PPC_I(VOR):
case PPC_I(VSUBFP):
case PPC_I(VSUBUBM):
case PPC_I(VSUBUHM):
case PPC_I(VSUBUWM):
case PPC_I(VXOR):
{
const int vD = vD_field::extract(opcode);
const int vA = vA_field::extract(opcode);
const int vB = vB_field::extract(opcode);
switch (ii->mnemo) {
case PPC_I(VADDFP): dg.gen_vaddfp(vD, vA, vB); break;
case PPC_I(VSUBFP): dg.gen_vsubfp(vD, vA, vB); break;
case PPC_I(VMADDFP): dg.gen_vmaddfp(vD, vA, vB, vC_field::extract(opcode)); break;
case PPC_I(VNMSUBFP): dg.gen_vnmsubfp(vD, vA, vB, vC_field::extract(opcode)); break;
case PPC_I(VMAXFP): dg.gen_vmaxfp(vD, vA, vB); break;
case PPC_I(VMINFP): dg.gen_vminfp(vD, vA, vB); break;
case PPC_I(VAND): dg.gen_vand(vD, vA, vB); break;
case PPC_I(VANDC): dg.gen_vandc(vD, vA, vB); break;
case PPC_I(VNOR): dg.gen_vnor(vD, vA, vB); break;
case PPC_I(VOR): dg.gen_vor(vD, vA, vB); break;
case PPC_I(VXOR): dg.gen_vxor(vD, vA, vB); break;
powerpc_dyngen::gen_handler_t gen_op = 0;
#if defined(__i386__) || defined(__x86_64__)
/* XXX: analyze the block function */
bool mmx_used = false;
if ((gen_op = dg.vector_codegen_sse(ii->mnemo)).ptr()) {
/* SSE code generator available */
}
else if ((gen_op = dg.vector_codegen_mmx(ii->mnemo)).ptr()) {
/* MMX code generator available */
mmx_used = true;
}
else
#endif
gen_op = dg.vector_codegen(ii->mnemo);
if (!gen_op.ptr())
goto do_generic;
dg.gen_load_ad_VD_VR(vD_field::extract(opcode));
dg.gen_load_ad_V0_VR(vA_field::extract(opcode));
dg.gen_load_ad_V1_VR(vB_field::extract(opcode));
if (ii->format == VA_form)
dg.gen_load_ad_V2_VR(vC_field::extract(opcode));
gen_op(&dg);
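// gen_op is the bound generator selected above; invoking it emits the
// SSE, MMX, or generic handler call into the translated block.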
if (ii->format == VXR_form && vRc_field::test(opcode))
dg.gen_record_cr6_VD();
#if defined(__i386__) || defined(__x86_64__)
if (mmx_used)
dg.gen_mmx_clear();
#endif
break;
}
default: // Direct call to instruction handler