[X86][SSE] Avoid scalarization of v2i64 vector shifts

Currently v2i64 vector shifts with non-uniform shift amounts are scalarized, costing 4 x extract, 2 x x86 shift and 2 x insert instructions - and it gets even more awkward on 32-bit targets.

This patch instead shifts the vector by each of the two shift amounts and then shuffles the partial results back together, costing 2 x shuffle and 2 x SSE shift instructions (+ 2 movs on pre-AVX hardware).
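
As an illustration only (not part of the patch), the equivalent pattern with SSE2 intrinsics looks roughly like this - the helper name shl_v2i64 is hypothetical:

#include <emmintrin.h>

// Per-lane v2i64 shift-left without scalarization: shift the whole vector
// by each lane's amount, then blend the two partial results.
static __m128i shl_v2i64(__m128i v, __m128i amt) {
  __m128i amt0 = _mm_shuffle_epi32(amt, _MM_SHUFFLE(1, 0, 1, 0)); // splat amt[0]
  __m128i amt1 = _mm_shuffle_epi32(amt, _MM_SHUFFLE(3, 2, 3, 2)); // splat amt[1]
  __m128i r0 = _mm_sll_epi64(v, amt0); // both lanes shifted by amt[0]
  __m128i r1 = _mm_sll_epi64(v, amt1); // both lanes shifted by amt[1]
  // Keep lane 0 of r0 and lane 1 of r1 (the movsd blend seen in the tests).
  return _mm_castpd_si128(_mm_move_sd(_mm_castsi128_pd(r1),
                                      _mm_castsi128_pd(r0)));
}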

Note - this patch only improves the SHL / LSHR logical shifts, as only these have packed 64-bit support in SSE hardware; there is no packed 64-bit arithmetic shift (SRA) instruction.

Differential Revision: http://reviews.llvm.org/D8416

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@232660 91177308-0d34-0410-b5e6-96231b3b80d8
Simon Pilgrim 2015-03-18 19:35:31 +00:00
parent db4d401364
commit 0ee70a1554
3 changed files with 41 additions and 19 deletions


@@ -16189,6 +16189,17 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
     return Op;
   }
 
+  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
+  // shifts per-lane and then shuffle the partial results back together.
+  if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
+    // Splat the shift amounts so the scalar shifts above will catch it.
+    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
+    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
+    SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
+    SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
+    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
+  }
+
   // If possible, lower this packed shift into a vector multiply instead of
   // expanding it into a sequence of scalar shifts.
   // Do this only if the vector shift count is a constant build_vector.
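
For reference (not in the source comments), the {0, 3} mask in the final getVectorShuffle indexes lanes across the concatenation of both operands, so entries 0-1 select from R0 and 2-3 from R1. A scalar model of that blend, using hypothetical names:

#include <array>
#include <cstdint>

// Models getVectorShuffle(VT, dl, R0, R1, {0, 3}) for v2i64:
// mask entry 0 picks R0[0], mask entry 3 picks R1[1].
static std::array<uint64_t, 2> blend03(const std::array<uint64_t, 2> &R0,
                                       const std::array<uint64_t, 2> &R1) {
  return {R0[0], R1[1]};
}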


@@ -13,11 +13,16 @@ entry:
   ret void
 }
 
-; shift1b can't use a packed shift
+; shift1b can't use a packed shift but can shift lanes separately and shuffle back together
 define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind {
 entry:
 ; CHECK-LABEL: shift1b:
-; CHECK: shll
+; CHECK: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: psllq %xmm2, %xmm3
+; CHECK-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
+; CHECK-NEXT: psllq %xmm1, %xmm0
+; CHECK-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
   %shamt = shufflevector <2 x i64> %sh, <2 x i64> undef, <2 x i32> <i32 0, i32 1>
   %shl = shl <2 x i64> %val, %shamt
   store <2 x i64> %shl, <2 x i64>* %dst


@@ -118,10 +118,16 @@ entry:
 define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
 entry:
-; CHECK: shr2_nosplat
-; CHECK-NOT: psrlq
-; CHECK-NOT: psrlq
-; CHECK: ret
+; CHECK-LABEL: shr2_nosplat
+; CHECK: movdqa (%rcx), %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: psrlq $8, %xmm2
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrlq $1, %xmm0
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; CHECK-NEXT: xorpd %xmm1, %xmm0
+; CHECK-NEXT: ret
   %B = lshr <2 x i64> %A, < i64 8, i64 1>
   %C = lshr <2 x i64> %A, < i64 1, i64 0>
   %K = xor <2 x i64> %B, %C