[DAGCombiner] Fold (or (shuffle A, zero, MA), (shuffle B, zero, MB)) into a
single shuffle of A and B.

When both OR operands are shuffles whose second input is the same all-zeros
build_vector, and every lane takes a real element from exactly one side and a
zero from the other (or is undef on both sides), the OR is rewritten as one
shuffle. Two candidate masks are built because OR is commutative; the first
one accepted by TLI.isShuffleMaskLegal is used, otherwise the node is left
untouched.

NOTE(review): the reviewed copy of this patch had lost its line breaks and
every "<...>" sequence that starts with a letter. The C++ template arguments
(isa<>/cast<> on ShuffleVectorSDNode, SmallVector<int, N>) and the vector
constants in combine-or.ll were reconstructed. The inline capacity 4, the
indentation, and the exact mask/constant values are assumptions chosen to
agree with the CHECK lines -- confirm against the committed revision before
applying.

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8650154c1a9..aa35f6d3e77 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3200,6 +3200,60 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
       return N0;
     if (ISD::isBuildVectorAllOnes(N1.getNode()))
       return N1;
+
+    // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask1)
+    // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf B, A, Mask2)
+    // Do this only if the resulting shuffle is legal.
+    if (isa<ShuffleVectorSDNode>(N0) &&
+        isa<ShuffleVectorSDNode>(N1) &&
+        N0->getOperand(1) == N1->getOperand(1) &&
+        ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode())) {
+      bool CanFold = true;
+      unsigned NumElts = VT.getVectorNumElements();
+      const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
+      const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
+      // We construct two shuffle masks:
+      // - Mask1 is a shuffle mask for a shuffle with A (operand 0 of N0) as
+      // the first operand and B (operand 0 of N1) as the second operand.
+      // - Mask2 is a shuffle mask for a shuffle with B as the first operand
+      // and A as the second operand.
+      // We do this because OR is commutable and therefore there might be
+      // two ways to fold this node into a shuffle.
+      SmallVector<int, 4> Mask1;
+      SmallVector<int, 4> Mask2;
+
+      for (unsigned i = 0; i != NumElts && CanFold; ++i) {
+        int M0 = SV0->getMaskElt(i);
+        int M1 = SV1->getMaskElt(i);
+
+        // Both shuffle indexes are undef. Propagate Undef.
+        if (M0 < 0 && M1 < 0) {
+          Mask1.push_back(M0);
+          Mask2.push_back(M0);
+          continue;
+        }
+
+        if (M0 < 0 || M1 < 0 ||
+            (M0 < (int)NumElts && M1 < (int)NumElts) ||
+            (M0 >= (int)NumElts && M1 >= (int)NumElts)) {
+          CanFold = false;
+          break;
+        }
+
+        Mask1.push_back(M0 < (int)NumElts ? M0 : M1 + NumElts);
+        Mask2.push_back(M1 < (int)NumElts ? M1 : M0 + NumElts);
+      }
+
+      if (CanFold) {
+        // Fold this sequence only if the resulting shuffle is 'legal'.
+        if (TLI.isShuffleMaskLegal(Mask1, VT))
+          return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0),
+                                      N1->getOperand(0), &Mask1[0]);
+        if (TLI.isShuffleMaskLegal(Mask2, VT))
+          return DAG.getVectorShuffle(VT, SDLoc(N), N1->getOperand(0),
+                                      N0->getOperand(0), &Mask2[0]);
+      }
+    }
   }
 
   // fold (or x, undef) -> -1
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
new file mode 100644
index 00000000000..60b6d756165
--- /dev/null
+++ b/test/CodeGen/X86/combine-or.ll
@@ -0,0 +1,267 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+
+; Verify that each of the following test cases is folded into a single
+; instruction which performs a blend operation.
+
+define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK: ret
+
+
+define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
+  %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
+  %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
+  %and1 = and <2 x i64> %a, <i64 -1, i64 0>
+  %and2 = and <2 x i64> %b, <i64 0, i64 -1>
+  %or = or <2 x i64> %and1, %and2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
+  %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test9
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK: ret
+
+
+define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
+  %and1 = and <2 x i64> %a, <i64 0, i64 -1>
+  %and2 = and <2 x i64> %b, <i64 -1, i64 0>
+  %or = or <2 x i64> %and1, %and2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
+  %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
+  %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test11
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
+  %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
+  %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
+  %or = or <4 x i32> %and1, %and2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test12
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NEXT: ret
+
+
+; Verify that the following test cases are folded into single shuffles.
+
+define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: pslldq
+; CHECK-NOT: por
+; CHECK: punpcklqdq
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test15
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NOT: shufps
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: pslldq
+; CHECK-NOT: por
+; CHECK: punpcklqdq
+; CHECK: ret
+
+
+; Verify that the dag-combiner does not fold a OR of two shuffles into a single
+; shuffle instruction when the shuffle indexes are not compatible.
+
+define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test17
+; CHECK: por
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test18
+; CHECK: orps
+; CHECK: ret
+
+
+define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+; CHECK-LABEL: test19
+; CHECK: por
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test20
+; CHECK-NOT: xorps
+; CHECK: orps
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+  %or = or <2 x i64> %shuf1, %shuf2
+  ret <2 x i64> %or
+}
+; CHECK-LABEL: test21
+; CHECK: por
+; CHECK-NEXT: ret
+
+