Optimize some special cases for SSE4a insertqi
Summary:
Since the upper 64 bits of the destination register are undefined when performing this operation, we can substitute them with undef and let the optimizer figure out that only a copy of the second operand's low bits is needed. Also added range merging: when an insert copies a bit range that can be merged with a previously inserted range, the two insertqi calls are combined into one. Added test cases for both optimizations.

Reviewers: grosbach, nadav

CC: llvm-commits

Differential Revision: http://reviews.llvm.org/D3357

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207055 91177308-0d34-0410-b5e6-96231b3b80d8
parent 8bd9405026
commit cd9f6b870e
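
The merging logic treats each insertqi's (width, start) immediates as the bit interval [start, start + width) and combines two single-use inserts from the same vector when their intervals touch or overlap. Below is a minimal standalone sketch of that interval arithmetic, mirroring the checks in the patch; mergeInsertRanges is a hypothetical helper for illustration, not an LLVM API.

#include <algorithm>
#include <cstdio>

// Returns true if the two insertqi bit ranges touch or overlap, writing the
// merged range to (NewStart, NewWidth). (Start, Width) belong to the outer
// call, (SourceStart, SourceWidth) to the insert feeding its first operand.
static bool mergeInsertRanges(unsigned Start, unsigned Width,
                              unsigned SourceStart, unsigned SourceWidth,
                              unsigned &NewStart, unsigned &NewWidth) {
  unsigned End = Start + Width;
  unsigned SourceEnd = SourceStart + SourceWidth;
  // Same two cases as the patch: the outer range begins at or before the
  // source range, or vice versa, with the ranges touching or overlapping.
  if (Start <= SourceStart && SourceStart <= End) {
    NewStart = Start;
    NewWidth = std::max(End, SourceEnd) - NewStart;
    return true;
  }
  if (SourceStart <= Start && Start <= SourceEnd) {
    NewStart = SourceStart;
    NewWidth = std::max(SourceEnd, End) - NewStart;
    return true;
  }
  return false; // disjoint and non-adjacent: keep both insertqi calls
}

int main() {
  // Values from @testInsertAdjacentRange below: source insert (width 32,
  // start 0) and outer insert (width 16, start 32) merge into (48, 0).
  unsigned NewStart, NewWidth;
  if (mergeInsertRanges(32, 16, 0, 32, NewStart, NewWidth))
    std::printf("merged insertqi: width=%u start=%u\n", NewWidth, NewStart);
  return 0;
}

Fed the adjacent-range test values, this reports width=48 start=0, the single insertqi call the CHECK lines in the test hunk expect.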
@@ -578,6 +578,73 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
    break;
  }

  case Intrinsic::x86_sse4a_insertqi: {
    // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top
    // ones undef
    // TODO: eventually we should lower this intrinsic to IR
    if (auto CIWidth = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
      if (auto CIStart = dyn_cast<ConstantInt>(II->getArgOperand(3))) {
        if (CIWidth->equalsInt(64) && CIStart->isZero()) {
          Value *Vec = II->getArgOperand(1);
          Value *Undef = UndefValue::get(Vec->getType());
          // Mask {0, 2}: lane 0 of Vec (y's low qword) and lane 0 of the
          // undef vector, leaving the high qword undefined.
          const uint32_t Mask[] = { 0, 2 };
          return ReplaceInstUsesWith(
              CI,
              Builder->CreateShuffleVector(
                  Vec, Undef, ConstantDataVector::get(
                                  II->getContext(), ArrayRef<uint32_t>(Mask))));

        } else if (auto Source =
                       dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
          if (Source->hasOneUse() &&
              Source->getArgOperand(1) == II->getArgOperand(1)) {
            // If the source of the insert has only one use and it's another
            // insert (and they're both inserting from the same vector), try to
            // bundle both together.
            auto CISourceWidth =
                dyn_cast<ConstantInt>(Source->getArgOperand(2));
            auto CISourceStart =
                dyn_cast<ConstantInt>(Source->getArgOperand(3));
            if (CISourceStart && CISourceWidth) {
              unsigned Start = CIStart->getZExtValue();
              unsigned Width = CIWidth->getZExtValue();
              unsigned End = Start + Width;
              unsigned SourceStart = CISourceStart->getZExtValue();
              unsigned SourceWidth = CISourceWidth->getZExtValue();
              unsigned SourceEnd = SourceStart + SourceWidth;
              unsigned NewStart, NewWidth;
              bool ShouldReplace = false;
              if (Start <= SourceStart && SourceStart <= End) {
                NewStart = Start;
                NewWidth = std::max(End, SourceEnd) - NewStart;
                ShouldReplace = true;
              } else if (SourceStart <= Start && Start <= SourceEnd) {
                NewStart = SourceStart;
                NewWidth = std::max(SourceEnd, End) - NewStart;
                ShouldReplace = true;
              }

              if (ShouldReplace) {
                Constant *ConstantWidth = ConstantInt::get(
                    II->getArgOperand(2)->getType(), NewWidth, false);
                Constant *ConstantStart = ConstantInt::get(
                    II->getArgOperand(3)->getType(), NewStart, false);
                Value *Args[4] = { Source->getArgOperand(0),
                                   II->getArgOperand(1), ConstantWidth,
                                   ConstantStart };
                Module *M = CI.getParent()->getParent()->getParent();
                Value *F =
                    Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
                return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args));
              }
            }
          }
        }
      }
    }
    break;
  }

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx_vpermilvar_pd:
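
For the width-64/start-0 fold in the hunk above, the replacement shuffle's mask {0, 2} selects lane 0 of Vec (the intrinsic's second operand) and lane 0 of the undef vector, so the result is y's low qword with an undef high qword; later folds can then reduce the whole call to a plain copy, which @testInsert64Bits below checks. Here is a small scalar model of that lane selection; shuffle2 and Lane are hypothetical helpers for illustration (with std::nullopt standing in for an undef lane), not LLVM APIs.

#include <array>
#include <cassert>
#include <cstdint>
#include <optional>

// Scalar model of shufflevector on two <2 x i64> operands: a mask index
// below 2 selects that lane of A; 2 or 3 selects lane (index - 2) of B.
using Lane = std::optional<uint64_t>; // nullopt models an undef lane

static std::array<Lane, 2> shuffle2(const std::array<Lane, 2> &A,
                                    const std::array<Lane, 2> &B,
                                    const std::array<uint32_t, 2> &Mask) {
  std::array<Lane, 2> R;
  for (unsigned I = 0; I < 2; ++I)
    R[I] = Mask[I] < 2 ? A[Mask[I]] : B[Mask[I] - 2];
  return R;
}

int main() {
  std::array<Lane, 2> Vec = {uint64_t{0x1234}, uint64_t{0x5678}}; // y
  std::array<Lane, 2> Undef = {std::nullopt, std::nullopt};
  // Mask {0, 2} yields <y's low qword, undef>: a copy of y's low 64 bits
  // with the top 64 bits left undefined, matching the rewrite above.
  std::array<Lane, 2> R = shuffle2(Vec, Undef, {0, 2});
  assert(R[0] == uint64_t{0x1234} && !R[1].has_value());
  return 0;
}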
@@ -209,6 +209,103 @@ define <4 x float> @test_select(float %f, float %g) {
  ret <4 x float> %ret
}

; We should optimize these two redundant insertqi calls into one
; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
; CHECK-NOT: insertqi
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
  ret <2 x i64> %2
}

; The result of this insert is the second arg, since the top 64 bits of
; the result are undefined, and we copy the bottom 64 bits from the
; second arg
; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> %i
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
  ret <2 x i64> %1
}

; Test the several types of range overlap and ordering that can occur for two insertqi calls
; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}


; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind

declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps(