Add SSSE3 optimizations (Intel Core 2 CPUs and newer) for LVX, STVX, VPERM.

This brings an overall +10% performance improvement in AltiVec Fractal Carbon.
gbeauche 2008-01-01 21:51:56 +00:00
parent c578952735
commit b5746b4f68
4 changed files with 144 additions and 0 deletions


@@ -664,6 +664,18 @@ public:
#undef DEFINE_OP_IR
#undef DEFINE_OP_IRR
#undef DEFINE_OP_IXR
private:
	// SSSE3 emitters: op1 is the second byte of the 0F 38 three-byte
	// opcode escape, op2 the instruction opcode proper.
	void gen_ssse3_arith(int op1, int op2, int s, int d)
		{ GEN_CODE(_SSSE3Lrr(op1, op2, s, _rX, d, _rX)); }
	void gen_ssse3_arith(int op1, int op2, x86_memory_operand const & mem, int d)
		{ GEN_CODE(_SSSE3Lmr(op1, op2, mem.MD, mem.MB, mem.MI, mem.MS, d, _rX)); }
	void gen_ssse3_arith(int op1, int op2, x86_immediate_operand const & imm, int s, int d)
		{ GEN_CODE(_SSSE3Lirr(op1, op2, imm.value, s, d)); }
	void gen_ssse3_arith(int op1, int op2, x86_immediate_operand const & imm, x86_memory_operand const & mem, int d)
		{ GEN_CODE(_SSSE3Limr(op1, op2, imm.value, mem.MD, mem.MB, mem.MI, mem.MS, d)); }
};
enum {
@@ -680,6 +692,7 @@ enum {
	X86_INSN_SSE_PS,
	X86_INSN_SSE_PD,
	X86_INSN_SSE_PI,
	X86_INSN_SSE_3P,	/* 3-byte prefix (SSSE3) */
};
inline void
@@ -701,6 +714,9 @@ x86_codegen::gen_insn(int type, int op, int s, int d)
	case X86_INSN_SSE_PI:
		gen_sse_arith(0x66, op, s, d);
		break;
	case X86_INSN_SSE_3P:
		gen_ssse3_arith(0x38, op, s, d);
		break;
	default:
		abort();
	}
@@ -725,6 +741,9 @@ x86_codegen::gen_insn(int type, int op, x86_memory_operand const & mem, int d)
	case X86_INSN_SSE_PI:
		gen_sse_arith(0x66, op, mem, d);
		break;
	case X86_INSN_SSE_3P:
		gen_ssse3_arith(0x38, op, mem, d);
		break;
	default:
		abort();
	}
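For context: SSSE3 instructions sit behind the 0F 38 three-byte opcode escape, which is why gen_insn() hands 0x38 to gen_ssse3_arith() as the first opcode byte. A minimal sketch (not part of this commit) that hand-assembles PSHUFB xmm1, xmm0 to show the byte layout:

```c
#include <stdint.h>
#include <stdio.h>

/* PSHUFB xmm1, xmm2/m128 is encoded as 66 0F 38 00 /r; the 0x38
 * escape byte matches the op1 value passed for X86_INSN_SSE_3P. */
int main(void)
{
	uint8_t dst = 1, src = 0;                 /* xmm1, xmm0 */
	uint8_t modrm = 0xC0 | (dst << 3) | src;  /* mod=11: register-direct */
	uint8_t insn[] = { 0x66, 0x0F, 0x38, 0x00, modrm };
	for (unsigned i = 0; i < sizeof insn; i++)
		printf("%02x ", insn[i]);
	printf("\n");                             /* prints: 66 0f 38 00 c8 */
	return 0;
}
```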


@@ -213,6 +213,23 @@ bool powerpc_jit::initialize(void)
		for (int i = 0; i < sizeof(sse2_vector) / sizeof(sse2_vector[0]); i++)
			jit_info[sse2_vector[i].mnemo] = &sse2_vector[i];
	}

	// SSSE3 optimized handlers
	static const jit_info_t ssse3_vector[] = {
#define DEFINE_OP(MNEMO, GEN_OP) \
		{ PPC_I(MNEMO), (gen_handler_t)&powerpc_jit::gen_ssse3_##GEN_OP, }
		DEFINE_OP(LVX,   lvx),
		DEFINE_OP(LVXL,  lvx),
		DEFINE_OP(STVX,  stvx),
		DEFINE_OP(STVXL, stvx),
		DEFINE_OP(VPERM, vperm)
#undef DEFINE_OP
	};

	if (cpuinfo_check_ssse3()) {
		for (int i = 0; i < sizeof(ssse3_vector) / sizeof(ssse3_vector[0]); i++)
			jit_info[ssse3_vector[i].mnemo] = &ssse3_vector[i];
	}
#endif
}
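The handlers are only registered when the host CPU actually reports SSSE3. A sketch of how a cpuinfo_check_ssse3()-style probe can be implemented (an assumption about the mechanism, not the project's actual code) using GCC's <cpuid.h>; SSSE3 support is reported in CPUID.(EAX=1):ECX bit 9:

```c
#include <cpuid.h>
#include <stdbool.h>

/* CPUID leaf 1 reports SSSE3 in ECX bit 9; bit_SSSE3 is defined by
 * GCC's <cpuid.h> as (1 << 9). */
static bool check_ssse3(void)
{
	unsigned int eax, ebx, ecx, edx;
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;
	return (ecx & bit_SSSE3) != 0;
}
```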
@@ -819,4 +836,107 @@ bool powerpc_jit::gen_sse2_vspltw(int mnemo, int vD, int UIMM, int vB)
	gen_sse2_vsplat(vD, REG_T0_ID);
	return true;
}
/*
 *	SSSE3 optimizations
 */

uintptr powerpc_jit::gen_ssse3_vswap_mask(void)
{
	// We must get the following bytes in memory
	// 0x3 0x2 0x1 0x0 0x7 0x6 0x5 0x4 0xb 0xa 0x9 0x8 0xf 0xe 0xd 0xc
	static uintptr control_mask = 0;
	if (control_mask == 0) {
		static const uint8 value[16] = {
			0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04,
			0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c
		};
		control_mask = (uintptr)copy_data(value, sizeof(value));
		assert(control_mask <= 0xffffffff);
	}
	return control_mask;
}
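What this mask buys: fed to PSHUFB as the control operand, the bytes {3,2,1,0, 7,6,5,4, ...} reverse the bytes inside each 32-bit lane, so a single instruction converts a big-endian PPC vector word-by-word to host byte order. A standalone intrinsics illustration (intrinsics stand in here for the JIT-emitted code):

```c
#include <tmmintrin.h>	/* SSSE3 intrinsics */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Same control bytes as the vswap mask above. */
	const __m128i vswap = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4,
	                                    11, 10, 9, 8, 15, 14, 13, 12);
	uint32_t in[4] = { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
	__m128i v = _mm_loadu_si128((const __m128i *)in);
	v = _mm_shuffle_epi8(v, vswap);	/* byte-swap each 32-bit lane */
	uint32_t out[4];
	_mm_storeu_si128((__m128i *)out, v);
	printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
	/* prints: 03020100 07060504 0b0a0908 0f0e0d0c */
	return 0;
}
```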
// lvx, lvxl
bool powerpc_jit::gen_ssse3_lvx(int mnemo, int vD, int rA, int rB)
{
	gen_mov_32(x86_memory_operand(xPPC_GPR(rB), REG_CPU_ID), REG_T0_ID);
	if (rA != 0)
		gen_add_32(x86_memory_operand(xPPC_GPR(rA), REG_CPU_ID), REG_T0_ID);
	gen_and_32(x86_immediate_operand(-16), REG_T0_ID);

	x86_memory_operand vswapmask(gen_ssse3_vswap_mask(), X86_NOREG);
	gen_movdqa(x86_memory_operand(0, REG_T0_ID), REG_V0_ID);
	gen_insn(X86_INSN_SSE_3P, X86_SSSE3_PSHUFB, vswapmask, REG_V0_ID);
	gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
	return true;
}
// stvx, stvxl
bool powerpc_jit::gen_ssse3_stvx(int mnemo, int vS, int rA, int rB)
{
	gen_mov_32(x86_memory_operand(xPPC_GPR(rB), REG_CPU_ID), REG_T0_ID);
	if (rA != 0)
		gen_add_32(x86_memory_operand(xPPC_GPR(rA), REG_CPU_ID), REG_T0_ID);
	gen_and_32(x86_immediate_operand(-16), REG_T0_ID);

	x86_memory_operand vswapmask(gen_ssse3_vswap_mask(), X86_NOREG);
	gen_movdqa(x86_memory_operand(xPPC_VR(vS), REG_CPU_ID), REG_V0_ID);
	gen_insn(X86_INSN_SSE_3P, X86_SSSE3_PSHUFB, vswapmask, REG_V0_ID);
	gen_movdqa(REG_V0_ID, x86_memory_operand(0, REG_T0_ID));
	return true;
}
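Both handlers emit the same three steps: compute the effective address from rA|0 and rB, mask it to a 16-byte boundary as AltiVec requires, then do one aligned 128-bit move plus a PSHUFB byte swap; only the direction of the final movdqa differs. A plain-C sketch of the emitted sequences (function and parameter names are hypothetical):

```c
#include <tmmintrin.h>
#include <stdint.h>

/* lvx: guest memory -> vD, byte-swapped per 32-bit word */
static void lvx_fast(__m128i *vD, const uint8_t *mem, uint32_t ea,
                     __m128i vswap)
{
	__m128i v = _mm_load_si128((const __m128i *)(mem + (ea & ~15u)));
	*vD = _mm_shuffle_epi8(v, vswap);
}

/* stvx: vS -> guest memory, the mirror image */
static void stvx_fast(const __m128i *vS, uint8_t *mem, uint32_t ea,
                      __m128i vswap)
{
	__m128i v = _mm_shuffle_epi8(*vS, vswap);
	_mm_store_si128((__m128i *)(mem + (ea & ~15u)), v);
}
```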
// vperm
bool powerpc_jit::gen_ssse3_vperm(int mnemo, int vD, int vA, int vB, int vC)
{
	static uintptr zero_mask = 0;
	if (zero_mask == 0) {
		static const uint8 value[16] = {
			0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
			0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
		};
		zero_mask = (uintptr)copy_data(value, sizeof(value));
		assert(zero_mask <= 0xffffffff);
	}

	static uintptr index_mask = 0;
	if (index_mask == 0) {
		static const uint8 value[16] = {
			0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
			0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f
		};
		index_mask = (uintptr)copy_data(value, sizeof(value));
		assert(index_mask <= 0xffffffff);
	}
	/*
	 *	PROP_IMSB(T) = T.index | most significant bit of T.index (T.bit0 = T.bit3)
	 *	--> propagates the index MSB into PSHUFB's zeroing bit, so that each
	 *	    source vector can be handled one at a time
	 *
	 *	T.A = PSHUFB(PROP_IMSB(vC & IndexMask), vA);
	 *	T.B = PSHUFB(PROP_IMSB(vC & IndexMask) ^ ZeroMask, vB);
	 *	vD = T.A | T.B
	 */
	x86_memory_operand swap_mask(gen_ssse3_vswap_mask(), X86_NOREG);
	gen_movdqa(x86_memory_operand(xPPC_VR(vC), REG_CPU_ID), REG_V2_ID);
	gen_movdqa(swap_mask, REG_V3_ID);
	gen_movdqa(x86_memory_operand(xPPC_VR(vA), REG_CPU_ID), REG_V0_ID);
	gen_pand(x86_memory_operand(index_mask, X86_NOREG), REG_V2_ID);
	gen_insn(X86_INSN_SSE_3P, X86_SSSE3_PSHUFB, REG_V3_ID, REG_V0_ID);
	gen_insn(X86_INSN_SSE_3P, X86_SSSE3_PSHUFB, REG_V3_ID, REG_V2_ID);
	gen_movdqa(REG_V2_ID, REG_V1_ID);
	gen_psllq(x86_immediate_operand(3), REG_V2_ID);
	gen_pand(x86_memory_operand(zero_mask, X86_NOREG), REG_V2_ID);
	gen_por(REG_V2_ID, REG_V1_ID);
	gen_movdqa(x86_memory_operand(xPPC_VR(vB), REG_CPU_ID), REG_V2_ID);
	gen_insn(X86_INSN_SSE_3P, X86_SSSE3_PSHUFB, REG_V1_ID, REG_V0_ID);
	gen_insn(X86_INSN_SSE_3P, X86_SSSE3_PSHUFB, REG_V3_ID, REG_V2_ID);
	gen_pxor(x86_memory_operand(zero_mask, X86_NOREG), REG_V1_ID);
	gen_insn(X86_INSN_SSE_3P, X86_SSSE3_PSHUFB, REG_V1_ID, REG_V2_ID);
	gen_por(REG_V2_ID, REG_V0_ID);
	gen_insn(X86_INSN_SSE_3P, X86_SSSE3_PSHUFB, REG_V3_ID, REG_V0_ID);
	gen_movdqa(REG_V0_ID, x86_memory_operand(xPPC_VR(vD), REG_CPU_ID));
	return true;
}
#endif
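The heart of the routine is the PROP_IMSB trick from the comment: AND the permute vector down to 5-bit indices, shift the index MSB (bit 4) up into PSHUFB's zeroing bit (bit 7) with psllq 3 plus a pand against the 0x80 mask, and each 16-byte half of the 32-byte lookup table can then be handled by one PSHUFB whose out-of-range lanes come back as zero, ready to OR together. An intrinsics sketch of that core in little-endian lane numbering (the emitted code additionally applies the vswap shuffles to account for PPC's big-endian element order):

```c
#include <tmmintrin.h>

/* r[i] = (a:b)[c[i] & 0x1f]: a 32-byte table lookup built from two
 * 16-byte PSHUFBs. */
static __m128i perm32(__m128i a, __m128i b, __m128i c)
{
	const __m128i index_mask = _mm_set1_epi8(0x1f);
	const __m128i zero_mask  = _mm_set1_epi8((char)0x80);

	__m128i t  = _mm_and_si128(c, index_mask);        /* 5-bit indices */
	/* PROP_IMSB: copy index bit 4 into PSHUFB's zeroing bit 7 */
	__m128i hi = _mm_and_si128(_mm_slli_epi64(t, 3), zero_mask);
	__m128i ca = _mm_or_si128(t, hi);                 /* zeroes lanes >= 16 */
	__m128i cb = _mm_xor_si128(ca, zero_mask);        /* zeroes lanes < 16 */
	return _mm_or_si128(_mm_shuffle_epi8(a, ca),
	                    _mm_shuffle_epi8(b, cb));
}
```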


@@ -96,6 +96,10 @@ private:
	bool gen_sse2_vspltb(int mnemo, int vD, int UIMM, int vB);
	bool gen_sse2_vsplth(int mnemo, int vD, int UIMM, int vB);
	bool gen_sse2_vspltw(int mnemo, int vD, int UIMM, int vB);

	uintptr gen_ssse3_vswap_mask(void);
	bool gen_ssse3_lvx(int mnemo, int vD, int rA, int rB);
	bool gen_ssse3_stvx(int mnemo, int vS, int rA, int rB);
	bool gen_ssse3_vperm(int mnemo, int vD, int vA, int vB, int vC);
#endif
};


@@ -1429,6 +1429,7 @@ powerpc_cpu::compile_block(uint32 entry_point)
			break;
		}
		case PPC_I(VSEL):
		case PPC_I(VPERM):
		case PPC_I(VMADDFP):
		case PPC_I(VNMSUBFP):
		{