mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-14 16:33:28 +00:00
Properly qualify the AVX2-specific parts of the execution dependency table. Also enable converting between 256-bit PS/PD operations when AVX1 is enabled. Fixes PR11370.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@144622 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
283b419aea
commit
4c077a1f04
@ -3555,7 +3555,11 @@ static const unsigned ReplaceableInstrs[][3] = {
|
|||||||
{ X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
|
{ X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
|
||||||
{ X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
|
{ X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
|
||||||
{ X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
|
{ X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
|
||||||
{ X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr },
|
{ X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }
|
||||||
|
};
|
||||||
|
|
||||||
|
static const unsigned ReplaceableInstrsAVX2[][3] = {
|
||||||
|
//PackedSingle PackedDouble PackedInt
|
||||||
{ X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm },
|
{ X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm },
|
||||||
{ X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr },
|
{ X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr },
|
||||||
{ X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm },
|
{ X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm },
|
||||||
@ -3563,31 +3567,37 @@ static const unsigned ReplaceableInstrs[][3] = {
|
|||||||
{ X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm },
|
{ X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm },
|
||||||
{ X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr },
|
{ X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr },
|
||||||
{ X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm },
|
{ X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm },
|
||||||
{ X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr },
|
{ X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr }
|
||||||
};
|
};
|
||||||
|
|
||||||
// FIXME: Some shuffle and unpack instructions have equivalents in different
|
// FIXME: Some shuffle and unpack instructions have equivalents in different
|
||||||
// domains, but they require a bit more work than just switching opcodes.
|
// domains, but they require a bit more work than just switching opcodes.
|
||||||
|
|
||||||
static const unsigned *lookup(unsigned opcode, unsigned domain) {
|
static const unsigned *lookup(unsigned opcode, unsigned domain, bool hasAVX2) {
|
||||||
for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
|
for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
|
||||||
if (ReplaceableInstrs[i][domain-1] == opcode)
|
if (ReplaceableInstrs[i][domain-1] == opcode)
|
||||||
return ReplaceableInstrs[i];
|
return ReplaceableInstrs[i];
|
||||||
|
if (domain != 3 || hasAVX2) // only use PackedInt domain if AVX2 is enabled
|
||||||
|
for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i)
|
||||||
|
if (ReplaceableInstrsAVX2[i][domain-1] == opcode)
|
||||||
|
return ReplaceableInstrsAVX2[i];
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::pair<uint16_t, uint16_t>
|
std::pair<uint16_t, uint16_t>
|
||||||
X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
|
X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
|
||||||
uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
|
uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
|
||||||
|
bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2();
|
||||||
return std::make_pair(domain,
|
return std::make_pair(domain,
|
||||||
domain && lookup(MI->getOpcode(), domain) ? 0xe : 0);
|
domain && lookup(MI->getOpcode(), domain, hasAVX2) ? 0xe : 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
|
void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
|
||||||
assert(Domain>0 && Domain<4 && "Invalid execution domain");
|
assert(Domain>0 && Domain<4 && "Invalid execution domain");
|
||||||
uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
|
uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
|
||||||
assert(dom && "Not an SSE instruction");
|
assert(dom && "Not an SSE instruction");
|
||||||
const unsigned *table = lookup(MI->getOpcode(), dom);
|
bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2();
|
||||||
|
const unsigned *table = lookup(MI->getOpcode(), dom, hasAVX2);
|
||||||
assert(table && "Cannot change domain");
|
assert(table && "Cannot change domain");
|
||||||
MI->setDesc(get(table[Domain-1]));
|
MI->setDesc(get(table[Domain-1]));
|
||||||
}
|
}
|
||||||
|
@ -140,10 +140,7 @@ bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM,
|
|||||||
PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass));
|
PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass));
|
||||||
ShouldPrint = true;
|
ShouldPrint = true;
|
||||||
}
|
}
|
||||||
if (Subtarget.hasAVX2()) {
|
if (Subtarget.hasAVX()) {
|
||||||
// FIXME this should be turned on for just AVX, but the pass doesn't check
|
|
||||||
// that instructions are valid before replacing them and there are AVX2
|
|
||||||
// integer instructions in the table.
|
|
||||||
PM.add(createExecutionDependencyFixPass(&X86::VR256RegClass));
|
PM.add(createExecutionDependencyFixPass(&X86::VR256RegClass));
|
||||||
ShouldPrint = true;
|
ShouldPrint = true;
|
||||||
}
|
}
|
||||||
|
@ -2021,7 +2021,9 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
|
|||||||
|
|
||||||
define <32 x i8> @test_x86_avx_loadu_dq_256(i8* %a0) {
|
define <32 x i8> @test_x86_avx_loadu_dq_256(i8* %a0) {
|
||||||
; CHECK: vmovdqu
|
; CHECK: vmovdqu
|
||||||
%res = call <32 x i8> @llvm.x86.avx.loadu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1]
|
%a1 = call <32 x i8> @llvm.x86.avx.loadu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1]
|
||||||
|
; add operation forces the execution domain.
|
||||||
|
%res = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
|
||||||
ret <32 x i8> %res
|
ret <32 x i8> %res
|
||||||
}
|
}
|
||||||
declare <32 x i8> @llvm.x86.avx.loadu.dq.256(i8*) nounwind readonly
|
declare <32 x i8> @llvm.x86.avx.loadu.dq.256(i8*) nounwind readonly
|
||||||
@ -2029,7 +2031,9 @@ declare <32 x i8> @llvm.x86.avx.loadu.dq.256(i8*) nounwind readonly
|
|||||||
|
|
||||||
define <4 x double> @test_x86_avx_loadu_pd_256(i8* %a0) {
|
define <4 x double> @test_x86_avx_loadu_pd_256(i8* %a0) {
|
||||||
; CHECK: vmovupd
|
; CHECK: vmovupd
|
||||||
%res = call <4 x double> @llvm.x86.avx.loadu.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
|
%a1 = call <4 x double> @llvm.x86.avx.loadu.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
|
||||||
|
; add operation forces the execution domain.
|
||||||
|
%res = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||||
ret <4 x double> %res
|
ret <4 x double> %res
|
||||||
}
|
}
|
||||||
declare <4 x double> @llvm.x86.avx.loadu.pd.256(i8*) nounwind readonly
|
declare <4 x double> @llvm.x86.avx.loadu.pd.256(i8*) nounwind readonly
|
||||||
@ -2157,7 +2161,9 @@ declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
|
|||||||
|
|
||||||
define void @test_x86_avx_movnt_dq_256(i8* %a0, <4 x i64> %a1) {
|
define void @test_x86_avx_movnt_dq_256(i8* %a0, <4 x i64> %a1) {
|
||||||
; CHECK: vmovntdq
|
; CHECK: vmovntdq
|
||||||
call void @llvm.x86.avx.movnt.dq.256(i8* %a0, <4 x i64> %a1)
|
; add operation forces the execution domain.
|
||||||
|
%a2 = add <4 x i64> %a1, <i64 1, i64 1, i64 1, i64 1>
|
||||||
|
call void @llvm.x86.avx.movnt.dq.256(i8* %a0, <4 x i64> %a2)
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
|
declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
|
||||||
@ -2165,7 +2171,8 @@ declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
|
|||||||
|
|
||||||
define void @test_x86_avx_movnt_pd_256(i8* %a0, <4 x double> %a1) {
|
define void @test_x86_avx_movnt_pd_256(i8* %a0, <4 x double> %a1) {
|
||||||
; CHECK: vmovntpd
|
; CHECK: vmovntpd
|
||||||
call void @llvm.x86.avx.movnt.pd.256(i8* %a0, <4 x double> %a1)
|
%a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||||
|
call void @llvm.x86.avx.movnt.pd.256(i8* %a0, <4 x double> %a2)
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind
|
declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind
|
||||||
@ -2258,7 +2265,9 @@ declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
|
|||||||
|
|
||||||
define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
|
define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
|
||||||
; CHECK: vmovdqu
|
; CHECK: vmovdqu
|
||||||
call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a1)
|
; add operation forces the execution domain.
|
||||||
|
%a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
|
||||||
|
call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
|
declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
|
||||||
@ -2266,7 +2275,9 @@ declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
|
|||||||
|
|
||||||
define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
|
define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
|
||||||
; CHECK: vmovupd
|
; CHECK: vmovupd
|
||||||
call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a1)
|
; add operation forces the execution domain.
|
||||||
|
%a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||||
|
call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
|
declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
|
||||||
|
@ -7,7 +7,9 @@ entry:
|
|||||||
%1 = bitcast <4 x double> %y to <4 x i64>
|
%1 = bitcast <4 x double> %y to <4 x i64>
|
||||||
%and.i = and <4 x i64> %0, %1
|
%and.i = and <4 x i64> %0, %1
|
||||||
%2 = bitcast <4 x i64> %and.i to <4 x double>
|
%2 = bitcast <4 x i64> %and.i to <4 x double>
|
||||||
ret <4 x double> %2
|
; add forces execution domain
|
||||||
|
%3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||||
|
ret <4 x double> %3
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: vandpd LCP{{.*}}(%rip)
|
; CHECK: vandpd LCP{{.*}}(%rip)
|
||||||
@ -16,7 +18,9 @@ entry:
|
|||||||
%0 = bitcast <4 x double> %y to <4 x i64>
|
%0 = bitcast <4 x double> %y to <4 x i64>
|
||||||
%and.i = and <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
|
%and.i = and <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
|
||||||
%1 = bitcast <4 x i64> %and.i to <4 x double>
|
%1 = bitcast <4 x i64> %and.i to <4 x double>
|
||||||
ret <4 x double> %1
|
; add forces execution domain
|
||||||
|
%2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||||
|
ret <4 x double> %2
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: vandps
|
; CHECK: vandps
|
||||||
@ -45,7 +49,9 @@ entry:
|
|||||||
%1 = bitcast <4 x double> %y to <4 x i64>
|
%1 = bitcast <4 x double> %y to <4 x i64>
|
||||||
%xor.i = xor <4 x i64> %0, %1
|
%xor.i = xor <4 x i64> %0, %1
|
||||||
%2 = bitcast <4 x i64> %xor.i to <4 x double>
|
%2 = bitcast <4 x i64> %xor.i to <4 x double>
|
||||||
ret <4 x double> %2
|
; add forces execution domain
|
||||||
|
%3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||||
|
ret <4 x double> %3
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: vxorpd LCP{{.*}}(%rip)
|
; CHECK: vxorpd LCP{{.*}}(%rip)
|
||||||
@ -54,7 +60,9 @@ entry:
|
|||||||
%0 = bitcast <4 x double> %y to <4 x i64>
|
%0 = bitcast <4 x double> %y to <4 x i64>
|
||||||
%xor.i = xor <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
|
%xor.i = xor <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
|
||||||
%1 = bitcast <4 x i64> %xor.i to <4 x double>
|
%1 = bitcast <4 x i64> %xor.i to <4 x double>
|
||||||
ret <4 x double> %1
|
; add forces execution domain
|
||||||
|
%2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||||
|
ret <4 x double> %2
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: vxorps
|
; CHECK: vxorps
|
||||||
@ -83,7 +91,9 @@ entry:
|
|||||||
%1 = bitcast <4 x double> %y to <4 x i64>
|
%1 = bitcast <4 x double> %y to <4 x i64>
|
||||||
%or.i = or <4 x i64> %0, %1
|
%or.i = or <4 x i64> %0, %1
|
||||||
%2 = bitcast <4 x i64> %or.i to <4 x double>
|
%2 = bitcast <4 x i64> %or.i to <4 x double>
|
||||||
ret <4 x double> %2
|
; add forces execution domain
|
||||||
|
%3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||||
|
ret <4 x double> %3
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: vorpd LCP{{.*}}(%rip)
|
; CHECK: vorpd LCP{{.*}}(%rip)
|
||||||
@ -92,7 +102,9 @@ entry:
|
|||||||
%0 = bitcast <4 x double> %y to <4 x i64>
|
%0 = bitcast <4 x double> %y to <4 x i64>
|
||||||
%or.i = or <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
|
%or.i = or <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
|
||||||
%1 = bitcast <4 x i64> %or.i to <4 x double>
|
%1 = bitcast <4 x i64> %or.i to <4 x double>
|
||||||
ret <4 x double> %1
|
; add forces execution domain
|
||||||
|
%2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||||
|
ret <4 x double> %2
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: vorps
|
; CHECK: vorps
|
||||||
@ -122,7 +134,9 @@ entry:
|
|||||||
%1 = bitcast <4 x double> %y to <4 x i64>
|
%1 = bitcast <4 x double> %y to <4 x i64>
|
||||||
%and.i = and <4 x i64> %1, %neg.i
|
%and.i = and <4 x i64> %1, %neg.i
|
||||||
%2 = bitcast <4 x i64> %and.i to <4 x double>
|
%2 = bitcast <4 x i64> %and.i to <4 x double>
|
||||||
ret <4 x double> %2
|
; add forces execution domain
|
||||||
|
%3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||||
|
ret <4 x double> %3
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: vandnpd (%
|
; CHECK: vandnpd (%
|
||||||
@ -134,7 +148,9 @@ entry:
|
|||||||
%1 = bitcast <4 x double> %tmp2 to <4 x i64>
|
%1 = bitcast <4 x double> %tmp2 to <4 x i64>
|
||||||
%and.i = and <4 x i64> %1, %neg.i
|
%and.i = and <4 x i64> %1, %neg.i
|
||||||
%2 = bitcast <4 x i64> %and.i to <4 x double>
|
%2 = bitcast <4 x i64> %and.i to <4 x double>
|
||||||
ret <4 x double> %2
|
; add forces execution domain
|
||||||
|
%3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
|
||||||
|
ret <4 x double> %3
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: vandnps
|
; CHECK: vandnps
|
||||||
|
Loading…
x
Reference in New Issue
Block a user