diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index f7ebb20b9fd..5d4b063d5e5 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -578,6 +578,73 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::x86_sse4a_insertqi: { + // insertqi x, y, 64, 0 can just copy y's lower bits and leave the top + // ones undef + // TODO: eventually we should lower this intrinsic to IR + if (auto CIWidth = dyn_cast(II->getArgOperand(2))) { + if (auto CIStart = dyn_cast(II->getArgOperand(3))) { + if (CIWidth->equalsInt(64) && CIStart->isZero()) { + Value *Vec = II->getArgOperand(1); + Value *Undef = UndefValue::get(Vec->getType()); + const uint32_t Mask[] = { 0, 2 }; + return ReplaceInstUsesWith( + CI, + Builder->CreateShuffleVector( + Vec, Undef, ConstantDataVector::get( + II->getContext(), ArrayRef(Mask)))); + + } else if (auto Source = + dyn_cast(II->getArgOperand(0))) { + if (Source->hasOneUse() && + Source->getArgOperand(1) == II->getArgOperand(1)) { + // If the source of the insert has only one use and it's another + // insert (and they're both inserting from the same vector), try to + // bundle both together. + auto CISourceWidth = + dyn_cast(Source->getArgOperand(2)); + auto CISourceStart = + dyn_cast(Source->getArgOperand(3)); + if (CISourceStart && CISourceWidth) { + unsigned Start = CIStart->getZExtValue(); + unsigned Width = CIWidth->getZExtValue(); + unsigned End = Start + Width; + unsigned SourceStart = CISourceStart->getZExtValue(); + unsigned SourceWidth = CISourceWidth->getZExtValue(); + unsigned SourceEnd = SourceStart + SourceWidth; + unsigned NewStart, NewWidth; + bool ShouldReplace = false; + if (Start <= SourceStart && SourceStart <= End) { + NewStart = Start; + NewWidth = std::max(End, SourceEnd) - NewStart; + ShouldReplace = true; + } else if (SourceStart <= Start && Start <= SourceEnd) { + NewStart = SourceStart; + NewWidth = std::max(SourceEnd, End) - NewStart; + ShouldReplace = true; + } + + if (ShouldReplace) { + Constant *ConstantWidth = ConstantInt::get( + II->getArgOperand(2)->getType(), NewWidth, false); + Constant *ConstantStart = ConstantInt::get( + II->getArgOperand(3)->getType(), NewStart, false); + Value *Args[4] = { Source->getArgOperand(0), + II->getArgOperand(1), ConstantWidth, + ConstantStart }; + Module *M = CI.getParent()->getParent()->getParent(); + Value *F = + Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + return ReplaceInstUsesWith(CI, Builder->CreateCall(F, Args)); + } + } + } + } + } + } + break; + } + case Intrinsic::x86_avx_vpermilvar_ps: case Intrinsic::x86_avx_vpermilvar_ps_256: case Intrinsic::x86_avx_vpermilvar_pd: diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll index f75ea032735..70370196e15 100644 --- a/test/Transforms/InstCombine/vec_demanded_elts.ll +++ b/test/Transforms/InstCombine/vec_demanded_elts.ll @@ -209,6 +209,103 @@ define <4 x float> @test_select(float %f, float %g) { ret <4 x float> %ret } +; We should optimize these two redundant insertqi into one +; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) +define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) { +; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32) +; CHECK-NOT: insertqi + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32) + %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32) + ret <2 x i64> %2 +} + +; The result of this insert is the second arg, since the top 64 bits of +; the result are undefined, and we copy the bottom 64 bits from the +; second arg +; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) +define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) { +; CHECK: ret <2 x i64> %i + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0) + ret <2 x i64> %1 +} + +; Test the several types of ranges and ordering that exist for two insertqi +; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) +define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) { +; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) +; CHECK: ret <2 x i64> %[[RES]] + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) + %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16) + ret <2 x i64> %2 +} + +; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) +define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) { +; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) +; CHECK: ret <2 x i64> %[[RES]] + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16) + %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0) + ret <2 x i64> %2 +} + +; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) +define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) { +; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0) +; CHECK: ret <2 x i64> %[[RES]] + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) + %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16) + ret <2 x i64> %2 +} + +; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) +define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) { +; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0) +; CHECK: ret <2 x i64> %[[RES]] + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16) + %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0) + ret <2 x i64> %2 +} + +; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) +define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) { +; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0) +; CHECK: ret <2 x i64> %[[RES]] + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0) + %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) + ret <2 x i64> %2 +} + +; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) +define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) { +; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0) +; CHECK: ret <2 x i64> %[[RES]] + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32) + %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0) + ret <2 x i64> %2 +} + +; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) +define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) { +; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) +; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) + %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) + ret <2 x i64> %2 +} + +; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) +define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) { +; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) +; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) + %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0) + %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32) + ret <2 x i64> %2 +} + + +; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi +declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind + declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) define <4 x float> @test_vpermilvar_ps(<4 x float> %v) { ; CHECK-LABEL: @test_vpermilvar_ps(