From bfed08e41fac89f79a0185d852f2631f7ec2f717 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Sat, 23 Aug 2014 10:25:15 +0000 Subject: [PATCH] [x86] Start fixing a really subtle and terrible form of miscompile in these DAG combines. The DAG auto-CSE thing is truly terrible. Due to it, when RAUW-ing a node with its operand, you can cause its uses to CSE to itself, which then causes their uses to become your uses which causes them to be picked up by the RAUW. For nodes that are determined to be "no-ops", this is "fine". But if the RAUW is one of several steps to enact a transformation, this causes the DAG to really silently eat an discard nodes that you would never expect. It took days for me to actually pinpoint a test case triggering this and a really frustrating amount of time to even comprehend the bug because I never even thought about the ability of RAUW to iteratively consume nodes due to CSE-ing them into itself. To fix this, we have to build up a brand-new chain of operations any time we are combining across (potentially) intervening nodes. But once the logic is added to do this, another issue surfaces: CombineTo eagerly deletes the one node combined, *but no others*. This is... really frustrating. If deleting it makes its operands become dead, those operand nodes often won't go onto the worklist in the order you would want -- they're already on it and not near the top. That means things higher on the worklist will get combined prior to these dead nodes being GCed out of the worklist, and if the chain is long, the immediate users won't be enough to re-detect where the root of the chain is that became single-use again after deleting the dead nodes. The better way to do this is to never immediately delete nodes, and instead to just enqueue them so we can recursively delete them. The combined-from node is typically not on the worklist anyways by virtue of having been popped off.... But that in turn breaks other tests that *require* CombineTo to delete unused nodes. :: sigh :: Fortunately, there is a better way. This whole routine should have been returning the replacement rather than using CombineTo which is quite hacky. Switch to that, and all the pieces fall together. I suspect the same kind of miscompile is possible in the half-shuffle folding code, and potentially the recursive folding code. I'll be switching those over to a pattern more like this one for safety's sake even though I don't immediately have any test cases for them. Note that the only way I got a test case for this instance was with *heavily* DAG combined 256-bit shuffle sequences generated by my fuzzer. ;] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@216319 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 74 ++++++++++++++--------- test/CodeGen/X86/vector-shuffle-256-v4.ll | 18 ++++++ 2 files changed, 64 insertions(+), 28 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index a685fdb6c7e..a5c9d544a72 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -19764,19 +19764,23 @@ static SmallVector getPSHUFShuffleMask(SDValue N) { /// We walk up the chain and look for a combinable shuffle, skipping over /// shuffles that we could hoist this shuffle's transformation past without /// altering anything. -static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, - SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { +static SDValue +combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); - // Walk up a single-use chain looking for a combinable shuffle. + // Walk up a single-use chain looking for a combinable shuffle. Keep a stack + // of the shuffles in the chain so that we can form a fresh chain to replace + // this one. + SmallVector Chain; SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: - return false; // Nothing combined! + return SDValue(); // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific @@ -19792,8 +19796,9 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, // dword shuffle, and the high words are self-contained. if (Mask[0] != 0 || Mask[1] != 1 || !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) - return false; + return SDValue(); + Chain.push_back(V); continue; case X86ISD::PSHUFHW: @@ -19801,8 +19806,9 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, // dword shuffle, and the low words are self-contained. if (Mask[2] != 2 || Mask[3] != 3 || !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) - return false; + return SDValue(); + Chain.push_back(V); continue; case X86ISD::UNPCKL: @@ -19810,25 +19816,28 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16) - return false; + return SDValue(); // Search for a half-shuffle which we can combine with. unsigned CombineOp = V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; if (V.getOperand(0) != V.getOperand(1) || !V->isOnlyUserOf(V.getOperand(0).getNode())) - return false; + return SDValue(); + Chain.push_back(V); V = V.getOperand(0); do { switch (V.getOpcode()) { default: - return false; // Nothing to combine. + return SDValue(); // Nothing to combine. case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOp) break; + Chain.push_back(V); + // Fallthrough! case ISD::BITCAST: V = V.getOperand(0); @@ -19844,10 +19853,7 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. - return false; - - // Record the old value to use in RAUW-ing. - SDValue Old = V; + return SDValue(); // Merge this node's mask and our incoming mask. SmallVector VMask = getPSHUFShuffleMask(V); @@ -19856,20 +19862,32 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DAG)); - // It is possible that one of the combinable shuffles was completely absorbed - // by the other, just replace it and revisit all users in that case. - if (Old.getNode() == V.getNode()) { - DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true); - return true; + // Rebuild the chain around this new shuffle. + while (!Chain.empty()) { + SDValue W = Chain.pop_back_val(); + + if (V.getValueType() != W.getOperand(0).getValueType()) + V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V); + + switch (W.getOpcode()) { + default: + llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); + + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); + } } + if (V.getValueType() != N.getValueType()) + V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V); - // Replace N with its operand as we're going to combine that shuffle away. - DAG.ReplaceAllUsesWith(N, N.getOperand(0)); - - // Replace the combinable shuffle with the combined one, updating all users - // so that we re-evaluate the chain here. - DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); - return true; + // Return the new chain to replace N. + return V; } /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. @@ -20034,8 +20052,8 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, break; case X86ISD::PSHUFD: - if (combineRedundantDWordShuffle(N, Mask, DAG, DCI)) - return SDValue(); // We combined away this shuffle. + if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) + return NewN; break; } diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index 306c85bd478..cde96dbb30f 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -363,3 +363,21 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) { %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle } + +define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) { +; AVX1-LABEL: @stress_test1 +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm0 = xmm0[1,1] +; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1] +; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq + %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> + %d = shufflevector <4 x i64> %c, <4 x i64> undef, <4 x i32> + %e = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> + %f = shufflevector <4 x i64> %d, <4 x i64> %e, <4 x i32> + + ret <4 x i64> %f +}