From 5842a037fb6d63d833e0a3af900d129fc2fad94a Mon Sep 17 00:00:00 2001 From: Stepan Dyatkovskiy Date: Tue, 17 Dec 2013 12:07:33 +0000 Subject: [PATCH] Fix for PR18045: http://llvm.org/bugs/show_bug.cgi?id=18045 Short issue description: For X86 machines with sse < sse4.1 we got failures for some particular load/store vector sequences: $ clang-trunk -m32 -O2 test-case.c fatal error: error in backend: Cannot select: 0x4200920: v4i32,ch = load 0x41d6ab0, 0x4205850, 0x41dcb10 [ORD=82] [ID=58] 0x4205850: i32 = X86ISD::Wrapper 0x41d5490 [ORD=26] [ID=43] 0x41d5490: i32 = TargetGlobalAddress<[4 x i32]* @e> 0 [ORD=26] [ID=23] 0x41dcb10: i32 = undef [ID=2] The reason is that EltsFromConsecutiveLoads could emit such a load instruction both before and after the legalize stage. However, this instruction is not legal for machines with SSSE3 and lower. The fix: In EltsFromConsecutiveLoads, if we are past the legalize stage, we check whether the nodes it emits are legal. P.S.: If you see failures between 12:00 and 22:00 (UTC-8), I may be slow to respond, so you had better reject this commit. Thanks! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@197492 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 13 ++++++++++--- test/CodeGen/X86/v4i32load-crash.ll | 27 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 test/CodeGen/X86/v4i32load-crash.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 3be3bf53755..b5aabf7ffa6 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5426,7 +5426,8 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { /// rather than undef via VZEXT_LOAD, but we do not detect that case today. /// There's even a handy isZeroNode for that purpose. 
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, - SDLoc &DL, SelectionDAG &DAG) { + SDLoc &DL, SelectionDAG &DAG, + bool isAfterLegalize) { EVT EltVT = VT.getVectorElementType(); unsigned NumElems = Elts.size(); @@ -5462,7 +5463,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, // load of the entire vector width starting at the base pointer. If we found // consecutive loads for the low half, generate a vzext_load node. if (LastLoadedElt == NumElems - 1) { + + if (isAfterLegalize && + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) + return SDValue(); + SDValue NewLd = SDValue(); + if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), @@ -6106,7 +6113,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { V[i] = Op.getOperand(i); // Check for elements which are consecutive loads. - SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); + SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false); if (LD.getNode()) return LD; @@ -16379,7 +16386,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); - return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); + return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); } /// PerformTruncateCombine - Converts truncate operation to diff --git a/test/CodeGen/X86/v4i32load-crash.ll b/test/CodeGen/X86/v4i32load-crash.ll new file mode 100644 index 00000000000..052c4c3c61b --- /dev/null +++ b/test/CodeGen/X86/v4i32load-crash.ll @@ -0,0 +1,27 @@ +; RUN: llc --mcpu=x86-64 --mattr=ssse3 < %s + +;PR18045: +;Issue of selection for 'v4i32 load'. +;This instruction is not legal for X86 CPUs with sse < 'sse4.1'. +;This node was generated by X86ISelLowering.cpp, EltsFromConsecutiveLoads +;static function after legalize stage. 
+ +@e = external global [4 x i32], align 4 +@f = external global [4 x i32], align 4 + +; Function Attrs: nounwind +define void @fn3(i32 %el) { +entry: + %0 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i32 0) + %1 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i32 1) + %2 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i32 2) + %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i32 3) + %4 = insertelement <4 x i32> undef, i32 %0, i32 0 + %5 = insertelement <4 x i32> %4, i32 %1, i32 1 + %6 = insertelement <4 x i32> %5, i32 %2, i32 2 + %7 = insertelement <4 x i32> %6, i32 %3, i32 3 + %8 = add <4 x i32> %6, %7 + store <4 x i32> %8, <4 x i32>* bitcast ([4 x i32]* @f to <4 x i32>*) + ret void +} +