diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4ed36906297..616171b2b69 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5458,7 +5458,12 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
     return SDValue();
 
   SDValue V = FirstNonZero.getOperand(0);
-  unsigned FirstNonZeroDst = cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
+  MVT VVT = V.getSimpleValueType();
+  if (VVT != MVT::v4f32 && VVT != MVT::v4i32)
+    return SDValue();
+
+  unsigned FirstNonZeroDst =
+      cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
   unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
   unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
   unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
@@ -5498,8 +5503,8 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
   else
     ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
 
-  SDValue InsertpsMask = DAG.getIntPtrConstant(
-      ElementMoveMask | (~NonZeros & 0xf));
+  SDValue InsertpsMask =
+      DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
   return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
 }
 
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index e472042b2ce..f407ba4cc16 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -314,3 +314,21 @@ define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
   %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %1
 }
+
+;; Ensure we don't use insertps from non v4x32 vectors.
+;; On SSE4.1 it works because bigger vectors use more than 1 register.
+;; On AVX they get passed in a single register.
+;; FIXME: We could probably optimize this case, if we're only using the
+;; first 4 indices.
+define <4 x i32> @insert_from_diff_size(<8 x i32> %x) {
+; CHECK-LABEL: insert_from_diff_size:
+; CHECK-NOT: insertps
+; CHECK: ret
+  %vecext = extractelement <8 x i32> %x, i32 0
+  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+  %a.0 = extractelement <8 x i32> %x, i32 0
+  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a.0, i32 3
+  ret <4 x i32> %vecinit3
+}
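
Reviewer note (illustrative sketch, not part of the patch): for contrast, when the extracts come from a source that is already v4i32 or v4f32, the whole value lives in a single XMM register on both SSE4.1 and AVX, so the new guard does not fire and LowerBuildVectorv4x32 may still lower the build_vector to a single insertps with a zero mask. The function below is a hypothetical example written for this note, not a test from the tree:

define <4 x i32> @insert_from_same_size(<4 x i32> %x) {
  ; Same shape as insert_from_diff_size above, but the source vector is
  ; <4 x i32>, so the v4x32 build-vector path is not rejected by the new
  ; MVT::v4f32/MVT::v4i32 check.
  %e0 = extractelement <4 x i32> %x, i32 0
  %v0 = insertelement <4 x i32> undef, i32 %e0, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 0, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 0, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %e0, i32 3
  ret <4 x i32> %v3
}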