Experiment with generic AltiVec optimizations for V4SF, V2DI operands (+60%)

This commit is contained in:
gbeauche 2004-02-16 23:17:27 +00:00
parent 680326da55
commit ea3c6801ab
8 changed files with 505 additions and 59 deletions

View File

@ -1711,7 +1711,7 @@ const powerpc_cpu::instr_info_t powerpc_cpu::powerpc_ii_table[] = {
{ "vnmsubfp",
EXECUTE_VECTOR_ARITH(vnmsubfp, V4SF, V4SF, V4SF, V4SF),
NULL,
PPC_I(VNMSUB),
PPC_I(VNMSUBFP),
VA_form, 4, 47, CFLOW_NORMAL
},
{ "vnor",

View File

@ -39,19 +39,24 @@ register struct powerpc_cpu *CPU asm(REG_CPU);
#define REG32(X) X
#endif
// Operand accessors used by the synthetic ops below. Each reg_XX variable is
// declared with an explicit asm() binding to REG_XX (presumably a host
// register name -- confirm against the REG_* definitions); the macros give
// that one variable several typed views.

// X reinterpreted as a pointer to a powerpc_fpr
#define FPREG(X) ((powerpc_fpr *)(X))
// X reinterpreted as a pointer to a powerpc_vr, yielding the pointed-to register
#define VREG(X) ((powerpc_vr *)(X))[0]
// A0 and VD are two views of the same variable (reg_A0): storing a vector
// register address for VD therefore overwrites A0.
#define A0 REG32(reg_A0)
#define VD VREG(reg_A0)
register uintptr reg_A0 asm(REG_A0);
// T0 / F0 / F0_dw / V0 are all views of reg_T0: the REG32 value, members d
// and j of the powerpc_fpr it points to, and the powerpc_vr it points to.
#define T0 REG32(reg_T0)
#define F0 FPREG(reg_T0)->d
#define F0_dw FPREG(reg_T0)->j
#define V0 VREG(reg_T0)
register uintptr reg_T0 asm(REG_T0);
// Same set of views for reg_T1
#define T1 REG32(reg_T1)
#define F1 FPREG(reg_T1)->d
#define F1_dw FPREG(reg_T1)->j
#define V1 VREG(reg_T1)
register uintptr reg_T1 asm(REG_T1);
// Same set of views for reg_T2
#define T2 REG32(reg_T2)
#define F2 FPREG(reg_T2)->d
#define F2_dw FPREG(reg_T2)->j
#define V2 VREG(reg_T2)
register uintptr reg_T2 asm(REG_T2);
// FD / FD_dw are obtained through powerpc_dyngen_helper accessors rather than
// through one of the reg_XX variables.
#define FD powerpc_dyngen_helper::fp_result()
#define FD_dw powerpc_dyngen_helper::fp_result_dw()
@ -1236,3 +1241,229 @@ void OPPROTO op_jump_next_A0(void)
}
dyngen_barrier();
}
/**
* Load/store addresses to vector registers
**/
#define reg_TD reg_A0
#define DEFINE_OP(REG, N) \
void OPPROTO op_load_ad_V##REG##_VR##N(void) \
{ \
reg_T##REG = (uintptr)&CPU->vr(N); \
}
#define DEFINE_REG(N) \
DEFINE_OP(D,N); \
DEFINE_OP(0,N); \
DEFINE_OP(1,N); \
DEFINE_OP(2,N); \
DEFINE_REG(0);
DEFINE_REG(1);
DEFINE_REG(2);
DEFINE_REG(3);
DEFINE_REG(4);
DEFINE_REG(5);
DEFINE_REG(6);
DEFINE_REG(7);
DEFINE_REG(8);
DEFINE_REG(9);
DEFINE_REG(10);
DEFINE_REG(11);
DEFINE_REG(12);
DEFINE_REG(13);
DEFINE_REG(14);
DEFINE_REG(15);
DEFINE_REG(16);
DEFINE_REG(17);
DEFINE_REG(18);
DEFINE_REG(19);
DEFINE_REG(20);
DEFINE_REG(21);
DEFINE_REG(22);
DEFINE_REG(23);
DEFINE_REG(24);
DEFINE_REG(25);
DEFINE_REG(26);
DEFINE_REG(27);
DEFINE_REG(28);
DEFINE_REG(29);
DEFINE_REG(30);
DEFINE_REG(31);
#undef DEFINE_REG
#undef DEFINE_OP
#undef reg_TD
void op_load_word_VD_T0(void)
{
const uint32 ea = T0;
VD.w[(ea >> 2) & 3] = vm_read_memory_4(ea & ~3);
}
void op_store_word_VD_T0(void)
{
const uint32 ea = T0;
vm_write_memory_4(ea & ~3, VD.w[(ea >> 2) & 3]);
}
void op_load_vect_VD_T0(void)
{
const uint32 ea = T0 & ~15;
VD.w[0] = vm_read_memory_4(ea + 0);
VD.w[1] = vm_read_memory_4(ea + 4);
VD.w[2] = vm_read_memory_4(ea + 8);
VD.w[3] = vm_read_memory_4(ea + 12);
}
void op_store_vect_VD_T0(void)
{
const uint32 ea = T0 & ~15;
vm_write_memory_4(ea + 0, VD.w[0]);
vm_write_memory_4(ea + 4, VD.w[1]);
vm_write_memory_4(ea + 8, VD.w[2]);
vm_write_memory_4(ea + 12, VD.w[3]);
}
/**
* Vector operations helpers
**/
// Placeholder for an operand slot that is not used. Its element type is
// null_operand; get() always yields 0 and set() does nothing.
struct VNONE {
    typedef null_operand type;

    static inline uint32 get(powerpc_vr const &, int)
    {
        return 0;
    }

    static inline void set(powerpc_vr const &, int, uint32)
    {
    }
};
struct V16QI {
typedef uint8 type;
static inline type get(powerpc_vr const & v, int i) { return v.b[i]; }
static inline void set(powerpc_vr & v, int i, type x) { v.b[i] = x; }
};
struct V8HI {
typedef uint16 type;
static inline type get(powerpc_vr const & v, int i) { return v.h[i]; }
static inline void set(powerpc_vr & v, int i, type x) { v.h[i] = x; }
};
struct V4SI {
typedef uint32 type;
static inline type get(powerpc_vr const & v, int i) { return v.w[i]; }
static inline void set(powerpc_vr & v, int i, type x) { v.w[i] = x; }
};
struct V2DI {
typedef uint64 type;
static inline type get(powerpc_vr const & v, int i) { return v.j[i]; }
static inline void set(powerpc_vr & v, int i, type x) { v.j[i] = x; }
};
struct V4SF {
typedef float type;
static inline type get(powerpc_vr const & v, int i) { return v.f[i]; }
static inline void set(powerpc_vr & v, int i, type x) { v.f[i] = x; }
};
template< class OP, class VX, class VA, class VB, class VC, int N >
struct do_vector_execute {
static inline void apply() {
do_vector_execute<OP, VX, VA, VB, VC, N - 1>::apply();
VX::set(
VD, N,
op_apply<typename VX::type, OP, typename VA::type, typename VB::type, typename VC::type>::apply(
VA::get(V0, N),
VB::get(V1, N),
VC::get(V2, N)));
}
};
template< class OP, class VX, class VA, class VB, class VC >
struct do_vector_execute<OP, VX, VA, VB, VC, 0> {
static inline void apply() {
VX::set(
VD, 0, op_apply<typename VX::type, OP, typename VA::type, typename VB::type, typename VC::type>::apply(
VA::get(V0, 0),
VB::get(V1, 0),
VC::get(V2, 0)));
}
};
// Entry point for an element-wise vector operation. The element count is
// derived from the result accessor: 16 bytes divided by sizeof(VX::type).
// VB and VC default to VNONE for operations taking fewer than three inputs.
template< class OP, class VX, class VA, class VB = VNONE, class VC = VNONE >
struct vector_execute {
    static inline void apply()
    {
        const int last_element = (16 / sizeof(typename VX::type)) - 1;
        do_vector_execute<OP, VX, VA, VB, VC, last_element>::apply();
    }
};
/**
* Vector synthetic operations
**/
void op_vaddfp_VD_V0_V1(void)
{
vector_execute<op_fadds, V4SF, V4SF, V4SF>::apply();
}
void op_vsubfp_VD_V0_V1(void)
{
vector_execute<op_fsubs, V4SF, V4SF, V4SF>::apply();
}
void op_vmaddfp_VD_V0_V1_V2(void)
{
vector_execute<op_vmaddfp, V4SF, V4SF, V4SF, V4SF>::apply();
}
void op_vnmsubfp_VD_V0_V1_V2(void)
{
vector_execute<op_vnmsubfp, V4SF, V4SF, V4SF, V4SF>::apply();
}
void op_vmaxfp_VD_V0_V1(void)
{
vector_execute<op_max<float>, V4SF, V4SF, V4SF>::apply();
}
void op_vminfp_VD_V0_V1(void)
{
vector_execute<op_min<float>, V4SF, V4SF, V4SF>::apply();
}
void op_vand_VD_V0_V1(void)
{
vector_execute<op_and_64, V2DI, V2DI, V2DI>::apply();
}
void op_vandc_VD_V0_V1(void)
{
vector_execute<op_andc_64, V2DI, V2DI, V2DI>::apply();
}
void op_vnor_VD_V0_V1(void)
{
vector_execute<op_nor_64, V2DI, V2DI, V2DI>::apply();
}
void op_vor_VD_V0_V1(void)
{
vector_execute<op_or_64, V2DI, V2DI, V2DI>::apply();
}
void op_vxor_VD_V0_V1(void)
{
vector_execute<op_xor_64, V2DI, V2DI, V2DI>::apply();
}
#ifdef LONG_OPERATIONS
void op_vcmpeqfp_VD_V0_V1(void)
{
vector_execute<op_cmp_eq<float>, V4SF, V4SF, V4SF>::apply();
}
void op_vaddubm_VD_V0_V1(void)
{
vector_execute<op_template_add<uint8>, V16QI, V16QI, V16QI>::apply();
}
#endif

View File

@ -125,6 +125,10 @@ DEFINE_INSN(store, F0, FPR);
DEFINE_INSN(store, F1, FPR);
DEFINE_INSN(store, F2, FPR);
DEFINE_INSN(store, FD, FPR);
DEFINE_INSN(load_ad, VD, VR);
DEFINE_INSN(load_ad, V0, VR);
DEFINE_INSN(load_ad, V1, VR);
DEFINE_INSN(load_ad, V2, VR);
// Condition register bitfield
DEFINE_INSN(load, T0, crb);
@ -234,3 +238,121 @@ void powerpc_dyngen::gen_bc_A0(int bo, int bi, uint32 npc)
}
#endif
}
/**
* Vector instructions
**/
// Generate a single-word vector load into register vD: first select vD as the
// VD operand, then emit the load_word_VD_T0 op. Code that leaves the
// effective address in T0 must already have been generated by the caller.
void powerpc_dyngen::gen_load_word_VD_T0(int vD)
{
gen_load_ad_VD_VR(vD);
gen_op_load_word_VD_T0();
}
// Generate a single-word vector store from register vS. The source register
// is selected through the VD operand slot, then the store_word_VD_T0 op is
// emitted. Code that leaves the effective address in T0 must already have
// been generated by the caller.
void powerpc_dyngen::gen_store_word_VS_T0(int vS)
{
gen_load_ad_VD_VR(vS);
gen_op_store_word_VD_T0();
}
// Generate a full vector load into register vD: select vD as the VD operand,
// then emit the load_vect_VD_T0 op. Code that leaves the effective address in
// T0 must already have been generated by the caller.
void powerpc_dyngen::gen_load_vect_VD_T0(int vD)
{
gen_load_ad_VD_VR(vD);
gen_op_load_vect_VD_T0();
}
// Generate a full vector store from register vS. The source register is
// selected through the VD operand slot, then the store_vect_VD_T0 op is
// emitted. Code that leaves the effective address in T0 must already have
// been generated by the caller.
void powerpc_dyngen::gen_store_vect_VS_T0(int vS)
{
gen_load_ad_VD_VR(vS);
gen_op_store_vect_VD_T0();
}
// Generate vaddfp: select the operands (VD = vD, V0 = vA, V1 = vB), then
// emit the vaddfp_VD_V0_V1 op.
void powerpc_dyngen::gen_vaddfp(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vaddfp_VD_V0_V1();
}
// Generate vsubfp: select the operands (VD = vD, V0 = vA, V1 = vB), then
// emit the vsubfp_VD_V0_V1 op.
void powerpc_dyngen::gen_vsubfp(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vsubfp_VD_V0_V1();
}
// Generate vmaddfp: select the operands (VD = vD, V0 = vA, V1 = vB,
// V2 = vC), then emit the three-input vmaddfp_VD_V0_V1_V2 op.
void powerpc_dyngen::gen_vmaddfp(int vD, int vA, int vB, int vC)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_load_ad_V2_VR(vC);
gen_op_vmaddfp_VD_V0_V1_V2();
}
// Generate vnmsubfp: select the operands (VD = vD, V0 = vA, V1 = vB,
// V2 = vC), then emit the three-input vnmsubfp_VD_V0_V1_V2 op.
void powerpc_dyngen::gen_vnmsubfp(int vD, int vA, int vB, int vC)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_load_ad_V2_VR(vC);
gen_op_vnmsubfp_VD_V0_V1_V2();
}
// Generate vmaxfp: select the operands (VD = vD, V0 = vA, V1 = vB), then
// emit the vmaxfp_VD_V0_V1 op.
void powerpc_dyngen::gen_vmaxfp(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vmaxfp_VD_V0_V1();
}
// Generate vminfp: select the operands (VD = vD, V0 = vA, V1 = vB), then
// emit the vminfp_VD_V0_V1 op.
void powerpc_dyngen::gen_vminfp(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vminfp_VD_V0_V1();
}
// Generate vand: select the operands (VD = vD, V0 = vA, V1 = vB), then emit
// the vand_VD_V0_V1 op.
void powerpc_dyngen::gen_vand(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vand_VD_V0_V1();
}
// Generate vandc: select the operands (VD = vD, V0 = vA, V1 = vB), then emit
// the vandc_VD_V0_V1 op.
void powerpc_dyngen::gen_vandc(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vandc_VD_V0_V1();
}
// Generate vnor: select the operands (VD = vD, V0 = vA, V1 = vB), then emit
// the vnor_VD_V0_V1 op.
void powerpc_dyngen::gen_vnor(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vnor_VD_V0_V1();
}
// Generate vor: select the operands (VD = vD, V0 = vA, V1 = vB), then emit
// the vor_VD_V0_V1 op.
void powerpc_dyngen::gen_vor(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vor_VD_V0_V1();
}
// Generate vxor: select the operands (VD = vD, V0 = vA, V1 = vB), then emit
// the vxor_VD_V0_V1 op.
void powerpc_dyngen::gen_vxor(int vD, int vA, int vB)
{
gen_load_ad_VD_VR(vD);
gen_load_ad_V0_VR(vA);
gen_load_ad_V1_VR(vB);
gen_op_vxor_VD_V0_V1();
}

View File

@ -219,6 +219,27 @@ public:
// Branch instructions
void gen_bc_A0(int bo, int bi, uint32 npc);
// Vector instructions
// Operand selection: make the VD / V0 / V1 / V2 operand slot refer to vector
// register i (presumably implemented by the DEFINE_INSN(load_ad, ...) lines
// in the implementation file -- confirm there).
void gen_load_ad_VD_VR(int i);
void gen_load_ad_V0_VR(int i);
void gen_load_ad_V1_VR(int i);
void gen_load_ad_V2_VR(int i);
// Vector loads and stores: vD / vS is the vector register number; the
// implementations emit ops that take the address from T0.
void gen_load_word_VD_T0(int vD);
void gen_load_vect_VD_T0(int vD);
void gen_store_word_VS_T0(int vS);
void gen_store_vect_VS_T0(int vS);
// Element-wise operations: vD is the destination register number and
// vA / vB (/ vC) are the source register numbers.
void gen_vaddfp(int vD, int vA, int vB);
void gen_vsubfp(int vD, int vA, int vB);
void gen_vmaddfp(int vD, int vA, int vB, int vC);
void gen_vnmsubfp(int vD, int vA, int vB, int vC);
void gen_vmaxfp(int vD, int vA, int vB);
void gen_vminfp(int vD, int vA, int vB);
void gen_vand(int vD, int vA, int vB);
void gen_vandc(int vD, int vA, int vB);
void gen_vnor(int vD, int vA, int vB);
void gen_vor(int vD, int vA, int vB);
void gen_vxor(int vD, int vA, int vB);
#undef DEFINE_ALIAS
#undef DEFINE_ALIAS_0
#undef DEFINE_ALIAS_1

View File

@ -43,63 +43,6 @@
#define DEBUG 0
#include "debug.h"
/**
 *  Helper class to apply a unary/binary/ternary operation
 *
 *      RT      Result type
 *      OP      Operation to perform (provides a static apply())
 *      RA      First input operand type
 *      RB      Second input operand type (null_operand or
 *              null_vector_operand when unused)
 *      RC      Third input operand type (null_operand or
 *              null_vector_operand when unused)
 *
 *  apply() always receives three arguments; the specializations select
 *  which of them are forwarded to OP::apply().
 **/

// Three inputs: forward everything
template< class RT, class OP, class RA, class RB, class RC >
struct op_apply {
    template< class TA, class TB, class TC >
    static inline RT apply(TA x, TB y, TC z)
    {
        return OP::apply(x, y, z);
    }
};

// Third operand is null_operand: forward the first two
template< class RT, class OP, class RA, class RB >
struct op_apply<RT, OP, RA, RB, null_operand> {
    template< class TA, class TB, class TC >
    static inline RT apply(TA x, TB y, TC)
    {
        return OP::apply(x, y);
    }
};

// Second and third operands are null_operand: forward the first only
template< class RT, class OP, class RA >
struct op_apply<RT, OP, RA, null_operand, null_operand> {
    template< class TA, class TB, class TC >
    static inline RT apply(TA x, TB, TC)
    {
        return OP::apply(x);
    }
};

// Third operand is null_vector_operand: forward the first two, cast to RT
template< class RT, class OP, class RA, class RB >
struct op_apply<RT, OP, RA, RB, null_vector_operand> {
    template< class TA, class TB, class TC >
    static inline RT apply(TA x, TB y, TC)
    {
        return (RT)OP::apply(x, y);
    }
};

// Only the first operand is present: forward it, cast to RT
template< class RT, class OP, class RA >
struct op_apply<RT, OP, RA, null_vector_operand, null_vector_operand> {
    template< class TA, class TB, class TC >
    static inline RT apply(TA x, TB, TC)
    {
        return (RT)OP::apply(x);
    }
};

// Only the second operand is present: forward it, cast to RT
template< class RT, class OP, class RB >
struct op_apply<RT, OP, null_vector_operand, RB, null_vector_operand> {
    template< class TA, class TB, class TC >
    static inline RT apply(TA, TB y, TC)
    {
        return (RT)OP::apply(y);
    }
};
/**
* Illegal & NOP instructions
**/

View File

@ -36,6 +36,66 @@
// Maps a compile-time flag to a register value type:
// uint32 by default, int32 when the flag is true.
template< bool SB >
struct register_value {
    typedef uint32 type;
};

template< >
struct register_value< true > {
    typedef int32 type;
};
/**
 *  Helper class to apply a unary/binary/ternary operation
 *
 *      RT      Result type
 *      OP      Operation to perform (provides a static apply())
 *      RA      First input operand type
 *      RB      Second input operand type (null_operand or
 *              null_vector_operand when unused)
 *      RC      Third input operand type (null_operand or
 *              null_vector_operand when unused)
 *
 *  apply() always receives three arguments; the specializations select
 *  which of them are forwarded to OP::apply().
 **/

// Marker types, only ever used as template arguments
struct null_operand;
struct null_vector_operand;

// Three inputs: forward everything
template< class RT, class OP, class RA, class RB, class RC >
struct op_apply {
    template< class TA, class TB, class TC >
    static inline RT apply(TA x, TB y, TC z)
    {
        return OP::apply(x, y, z);
    }
};

// Third operand is null_operand: forward the first two
template< class RT, class OP, class RA, class RB >
struct op_apply<RT, OP, RA, RB, null_operand> {
    template< class TA, class TB, class TC >
    static inline RT apply(TA x, TB y, TC)
    {
        return OP::apply(x, y);
    }
};

// Second and third operands are null_operand: forward the first only
template< class RT, class OP, class RA >
struct op_apply<RT, OP, RA, null_operand, null_operand> {
    template< class TA, class TB, class TC >
    static inline RT apply(TA x, TB, TC)
    {
        return OP::apply(x);
    }
};

// Third operand is null_vector_operand: forward the first two, cast to RT
template< class RT, class OP, class RA, class RB >
struct op_apply<RT, OP, RA, RB, null_vector_operand> {
    template< class TA, class TB, class TC >
    static inline RT apply(TA x, TB y, TC)
    {
        return (RT)OP::apply(x, y);
    }
};

// Only the first operand is present: forward it, cast to RT
template< class RT, class OP, class RA >
struct op_apply<RT, OP, RA, null_vector_operand, null_vector_operand> {
    template< class TA, class TB, class TC >
    static inline RT apply(TA x, TB, TC)
    {
        return (RT)OP::apply(x);
    }
};

// Only the second operand is present: forward it, cast to RT
template< class RT, class OP, class RB >
struct op_apply<RT, OP, null_vector_operand, RB, null_vector_operand> {
    template< class TA, class TB, class TC >
    static inline RT apply(TA, TB y, TC)
    {
        return (RT)OP::apply(y);
    }
};
/**
* Add instruction templates
**/

View File

@ -290,7 +290,7 @@ enum powerpc_instruction {
PPC_I(VMULOSH),
PPC_I(VMULOUB),
PPC_I(VMULOUH),
PPC_I(VNMSUB),
PPC_I(VNMSUBFP),
PPC_I(VNOR),
PPC_I(VOR),
PPC_I(VPERM),

View File

@ -1269,6 +1269,75 @@ powerpc_cpu::compile_block(uint32 entry_point)
break;
}
#endif
// NOTE: A0/VD are clobbered in the following instructions!
case PPC_I(LVEWX):
case PPC_I(LVX):
case PPC_I(LVXL):
{
const int rA = rA_field::extract(opcode);
const int rB = rB_field::extract(opcode);
const int vD = vD_field::extract(opcode);
dg.gen_load_T0_GPR(rB);
if (rA != 0) {
dg.gen_load_T1_GPR(rA);
dg.gen_add_32_T0_T1();
}
switch (ii->mnemo) {
case PPC_I(LVEWX): dg.gen_load_word_VD_T0(vD); break;
case PPC_I(LVX): dg.gen_load_vect_VD_T0(vD); break;
case PPC_I(LVXL): dg.gen_load_vect_VD_T0(vD); break;
}
break;
}
case PPC_I(STVEWX):
case PPC_I(STVX):
case PPC_I(STVXL):
{
const int rA = rA_field::extract(opcode);
const int rB = rB_field::extract(opcode);
const int vS = vS_field::extract(opcode);
dg.gen_load_T0_GPR(rB);
if (rA != 0) {
dg.gen_load_T1_GPR(rA);
dg.gen_add_32_T0_T1();
}
switch (ii->mnemo) {
case PPC_I(STVEWX): dg.gen_store_word_VS_T0(vS); break;
case PPC_I(STVX): dg.gen_store_vect_VS_T0(vS); break;
case PPC_I(STVXL): dg.gen_store_vect_VS_T0(vS); break;
}
break;
}
case PPC_I(VADDFP):
case PPC_I(VSUBFP):
case PPC_I(VMADDFP):
case PPC_I(VNMSUBFP):
case PPC_I(VMAXFP):
case PPC_I(VMINFP):
case PPC_I(VAND):
case PPC_I(VANDC):
case PPC_I(VNOR):
case PPC_I(VOR):
case PPC_I(VXOR):
{
const int vD = vD_field::extract(opcode);
const int vA = vA_field::extract(opcode);
const int vB = vB_field::extract(opcode);
switch (ii->mnemo) {
case PPC_I(VADDFP): dg.gen_vaddfp(vD, vA, vB); break;
case PPC_I(VSUBFP): dg.gen_vsubfp(vD, vA, vB); break;
case PPC_I(VMADDFP): dg.gen_vmaddfp(vD, vA, vB, vC_field::extract(opcode)); break;
case PPC_I(VNMSUBFP): dg.gen_vnmsubfp(vD, vA, vB, vC_field::extract(opcode)); break;
case PPC_I(VMAXFP): dg.gen_vmaxfp(vD, vA, vB); break;
case PPC_I(VMINFP): dg.gen_vminfp(vD, vA, vB); break;
case PPC_I(VAND): dg.gen_vand(vD, vA, vB); break;
case PPC_I(VANDC): dg.gen_vandc(vD, vA, vB); break;
case PPC_I(VNOR): dg.gen_vnor(vD, vA, vB); break;
case PPC_I(VOR): dg.gen_vor(vD, vA, vB); break;
case PPC_I(VXOR): dg.gen_vxor(vD, vA, vB); break;
}
break;
}
default: // Direct call to instruction handler
{
typedef void (*func_t)(dyngen_cpu_base, uint32);