InstCombine: fold x86 vpermilvar intrinsics whose mask is zeroinitializer.

The vpermilvar -> shufflevector fold in InstCombiner::visitCallInst only
fired when the mask operand was a ConstantDataVector. This patch:

  * computes Size from the mask operand's vector type instead of from the
    ConstantDataVector, so it is available on every path;
  * adds a branch for ConstantAggregateZero masks (IR `zeroinitializer`)
    that sets every shuffle index to 0, and bails out with `break` for any
    other kind of mask;
  * moves the 128-bit-half index adjustment for the _256 variants and the
    shufflevector creation out of the ConstantDataVector branch so both
    constant paths share them;
  * adds four tests (ps, ps.256, pd, pd.256) with a zeroinitializer mask.
    For the _256 variants the upper half of the expected mask is offset by
    Size / 2 (4 for ps.256, 2 for pd.256).

diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index cc43675360f..eafdf6e7b00 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -724,11 +724,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   case Intrinsic::x86_avx_vpermilvar_pd_256: {
     // Convert vpermil* to shufflevector if the mask is constant.
     Value *V = II->getArgOperand(1);
+    unsigned Size = cast<VectorType>(V->getType())->getNumElements();
+    assert(Size == 8 || Size == 4 || Size == 2);
+    uint32_t Indexes[8];
     if (auto C = dyn_cast<ConstantDataVector>(V)) {
-      unsigned Size = C->getNumElements();
-      assert(Size == 8 || Size == 4 || Size == 2);
-      uint32_t Indexes[8];
-
       // The intrinsics only read one or two bits, clear the rest.
       for (unsigned I = 0; I < Size; ++I) {
         uint32_t Index = C->getElementAsInteger(I) & 0x3;
@@ -737,23 +736,26 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
           Index >>= 1;
         Indexes[I] = Index;
       }
-
-      // The _256 variants are a bit trickier since the mask bits always index
-      // into the corresponding 128 half. In order to convert to a generic
-      // shuffle, we have to make that explicit.
-      if (II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_ps_256 ||
-          II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) {
-        for (unsigned I = Size / 2; I < Size; ++I)
-          Indexes[I] += Size / 2;
-      }
-      auto NewC =
-          ConstantDataVector::get(C->getContext(), makeArrayRef(Indexes, Size));
-      auto V1 = II->getArgOperand(0);
-      auto V2 = UndefValue::get(V1->getType());
-      auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC);
-      return ReplaceInstUsesWith(CI, Shuffle);
+    } else if (isa<ConstantAggregateZero>(V)) {
+      for (unsigned I = 0; I < Size; ++I)
+        Indexes[I] = 0;
+    } else {
+      break;
     }
-    break;
+    // The _256 variants are a bit trickier since the mask bits always index
+    // into the corresponding 128 half. In order to convert to a generic
+    // shuffle, we have to make that explicit.
+    if (II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_ps_256 ||
+        II->getIntrinsicID() == Intrinsic::x86_avx_vpermilvar_pd_256) {
+      for (unsigned I = Size / 2; I < Size; ++I)
+        Indexes[I] += Size / 2;
+    }
+    auto NewC =
+        ConstantDataVector::get(V->getContext(), makeArrayRef(Indexes, Size));
+    auto V1 = II->getArgOperand(0);
+    auto V2 = UndefValue::get(V1->getType());
+    auto Shuffle = Builder->CreateShuffleVector(V1, V2, NewC);
+    return ReplaceInstUsesWith(CI, Shuffle);
   }
 
   case Intrinsic::ppc_altivec_vperm:
diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll
index 9ae024181bc..41d2b292eef 100644
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -339,6 +339,34 @@ define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
   ret <4 x double> %a
 }
 
+define <4 x float> @test_vpermilvar_ps_zero(<4 x float> %v) {
+; CHECK-LABEL: @test_vpermilvar_ps_zero(
+; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
+  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
+  ret <4 x float> %a
+}
+
+define <8 x float> @test_vpermilvar_ps_256_zero(<8 x float> %v) {
+; CHECK-LABEL: @test_vpermilvar_ps_256_zero(
+; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
+  ret <8 x float> %a
+}
+
+define <2 x double> @test_vpermilvar_pd_zero(<2 x double> %v) {
+; CHECK-LABEL: @test_vpermilvar_pd_zero(
+; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
+  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i32> zeroinitializer)
+  ret <2 x double> %a
+}
+
+define <4 x double> @test_vpermilvar_pd_256_zero(<4 x double> %v) {
+; CHECK-LABEL: @test_vpermilvar_pd_256_zero(
+; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i32> zeroinitializer)
+  ret <4 x double> %a
+}
+
 define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
   %S = bitcast i32 1 to i32
   %1 = zext i32 %S to i64