diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2ec08140db5..1af24497ba7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -13042,7 +13042,8 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
 
   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
   // since the result of setcc_c is all zero's or all ones.
-  if (N1C && N0.getOpcode() == ISD::AND &&
+  if (VT.isInteger() && !VT.isVector() &&
+      N1C && N0.getOpcode() == ISD::AND &&
       N0.getOperand(1).getOpcode() == ISD::Constant) {
     SDValue N00 = N0.getOperand(0);
     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
@@ -13058,6 +13059,22 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
     }
   }
 
+
+  // Hardware support for vector shifts is sparse which makes us scalarize the
+  // vector operations in many cases. Also, on sandybridge ADD is faster than
+  // shl.
+  // (shl V, 1) -> add V,V
+  if (isSplatVector(N1.getNode())) {
+    assert(N0.getValueType().isVector() && "Invalid vector shift type");
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
+    // We shift all of the values by one. In many cases we do not have
+    // hardware support for this operation. This is better expressed as an ADD
+    // of two values.
+    if (N1C && (1 == N1C->getZExtValue())) {
+      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
+    }
+  }
+
   return SDValue();
 }
 
@@ -13066,9 +13083,10 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
   EVT VT = N->getValueType(0);
-  if (!VT.isVector() && VT.isInteger() &&
-      N->getOpcode() == ISD::SHL)
-    return PerformSHLCombine(N, DAG);
+  if (N->getOpcode() == ISD::SHL) {
+    SDValue V = PerformSHLCombine(N, DAG);
+    if (V.getNode()) return V;
+  }
 
   // On X86 with SSE2 support, we can transform this to a vector shift if
   // all elements are shifted by the same amount.  We can't do this in legalize
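The combine added above relies on the identity that a left shift by one is the same as adding a value to itself: both double the value and both wrap modulo 2^n, so the rewrite is bit-exact in every lane even when the high bit is set. A minimal standalone C++ check of that identity (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // Exhaustively verify (x << 1) == (x + x) for every 8-bit lane value,
  // including values whose high bit is set and therefore wrap on overflow.
  for (unsigned i = 0; i != 256; ++i) {
    uint8_t X = static_cast<uint8_t>(i);
    assert(static_cast<uint8_t>(X << 1) == static_cast<uint8_t>(X + X));
  }
  return 0;
}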
diff --git a/test/CodeGen/X86/2011-10-30-padd.ll b/test/CodeGen/X86/2011-10-30-padd.ll
new file mode 100644
index 00000000000..180ca15a0ee
--- /dev/null
+++ b/test/CodeGen/X86/2011-10-30-padd.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+
+;CHECK: addXX_test
+;CHECK: padd
+;CHECK: ret
+
+
+define <16 x i8> @addXX_test(<16 x i8> %a) {
+  %b = add <16 x i8> %a, %a
+  ret <16 x i8> %b
+}
+
+;CHECK: instcombine_test
+;CHECK: padd
+;CHECK: ret
+define <16 x i8> @instcombine_test(<16 x i8> %a) {
+  %b = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %b
+}
+
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll
index 1cb07aa0824..5a91b090472 100644
--- a/test/CodeGen/X86/x86-shifts.ll
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -6,8 +6,9 @@
 define <4 x i32> @shl4(<4 x i32> %A) nounwind {
 entry:
 ; CHECK: shl4
+; CHECK: padd
 ; CHECK: pslld
-; CHECK-NEXT: pslld
+; CHECK: ret
 %B = shl <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
 %C = shl <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
 %K = xor <4 x i32> %B, %C
@@ -19,6 +20,7 @@ entry:
 ; CHECK: shr4
 ; CHECK: psrld
 ; CHECK-NEXT: psrld
+; CHECK: ret
 %B = lshr <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
 %C = lshr <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
 %K = xor <4 x i32> %B, %C
@@ -30,6 +32,7 @@ entry:
 ; CHECK: sra4
 ; CHECK: psrad
 ; CHECK-NEXT: psrad
+; CHECK: ret
 %B = ashr <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
 %C = ashr <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
 %K = xor <4 x i32> %B, %C
@@ -41,6 +44,7 @@ entry:
 ; CHECK: shl2
 ; CHECK: psllq
 ; CHECK-NEXT: psllq
+; CHECK: ret
 %B = shl <2 x i64> %A, < i64 2, i64 2>
 %C = shl <2 x i64> %A, < i64 9, i64 9>
 %K = xor <2 x i64> %B, %C
@@ -52,6 +56,7 @@ entry:
 ; CHECK: shr2
 ; CHECK: psrlq
 ; CHECK-NEXT: psrlq
+; CHECK: ret
 %B = lshr <2 x i64> %A, < i64 8, i64 8>
 %C = lshr <2 x i64> %A, < i64 1, i64 1>
 %K = xor <2 x i64> %B, %C
@@ -62,8 +67,9 @@
 define <8 x i16> @shl8(<8 x i16> %A) nounwind {
 entry:
 ; CHECK: shl8
+; CHECK: padd
 ; CHECK: psllw
-; CHECK-NEXT: psllw
+; CHECK: ret
 %B = shl <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
 %C = shl <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
 %K = xor <8 x i16> %B, %C
@@ -75,6 +81,7 @@ entry:
 ; CHECK: shr8
 ; CHECK: psrlw
 ; CHECK-NEXT: psrlw
+; CHECK: ret
 %B = lshr <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
 %C = lshr <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
 %K = xor <8 x i16> %B, %C
@@ -86,6 +93,7 @@ entry:
 ; CHECK: sra8
 ; CHECK: psraw
 ; CHECK-NEXT: psraw
+; CHECK: ret
 %B = ashr <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
 %C = ashr <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
 %K = xor <8 x i16> %B, %C
@@ -100,6 +108,7 @@ entry:
 ; CHECK: sll8_nosplat
 ; CHECK-NOT: psll
 ; CHECK-NOT: psll
+; CHECK: ret
 %B = shl <8 x i16> %A, < i16 1, i16 2, i16 3, i16 6, i16 2, i16 2, i16 2, i16 2>
 %C = shl <8 x i16> %A, < i16 9, i16 7, i16 5, i16 1, i16 4, i16 1, i16 1, i16 1>
 %K = xor <8 x i16> %B, %C
@@ -112,6 +121,7 @@ entry:
 ; CHECK: shr2_nosplat
 ; CHECK-NOT: psrlq
 ; CHECK-NOT: psrlq
+; CHECK: ret
 %B = lshr <2 x i64> %A, < i64 8, i64 1>
 %C = lshr <2 x i64> %A, < i64 1, i64 0>
 %K = xor <2 x i64> %B, %C
@@ -125,6 +135,7 @@ define <2 x i32> @shl2_other(<2 x i32> %A) nounwind {
 entry:
 ; CHECK: shl2_other
 ; CHECK: psllq
+; CHECK: ret
 %B = shl <2 x i32> %A, < i32 2, i32 2>
 %C = shl <2 x i32> %A, < i32 9, i32 9>
 %K = xor <2 x i32> %B, %C
@@ -135,6 +146,7 @@ define <2 x i32> @shr2_other(<2 x i32> %A) nounwind {
 entry:
 ; CHECK: shr2_other
 ; CHECK: psrlq
+; CHECK: ret
 %B = lshr <2 x i32> %A, < i32 8, i32 8>
 %C = lshr <2 x i32> %A, < i32 1, i32 1>
 %K = xor <2 x i32> %B, %C
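With splat shift-by-one now lowered to padd, the shl4 and shl8 tests expect one padd plus a single vector shift instead of two back-to-back shifts, while the non-splat and non-shift-by-one cases keep their original CHECK patterns. The checks can be reproduced by hand with the RUN line from the new test, e.g. (assuming llc and FileCheck from a build containing this change are on PATH, run from the source tree):

llc < test/CodeGen/X86/2011-10-30-padd.ll -march=x86 -mcpu=corei7 | FileCheck test/CodeGen/X86/2011-10-30-padd.ll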