From 093b0413fe827b48306ef459af7690ba9df14a8e Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Fri, 31 Jan 2014 21:14:40 +0000
Subject: [PATCH] [SLPV] Recognize vectorizable intrinsics during SLP
 vectorization and transform accordingly. Based on similar code from Loop
 vectorization. Subsequent commits will add vectorization of function calls
 into vector intrinsics and of function calls into vector library calls.

Patch by Raul Silvera! (Much delayed due to my not running dcommit)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@200576 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp    | 89 ++++++++++++++++++-
 .../Transforms/SLPVectorizer/X86/intrinsic.ll | 75 ++++++++++++++++
 2 files changed, 161 insertions(+), 3 deletions(-)
 create mode 100644 test/Transforms/SLPVectorizer/X86/intrinsic.ll
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 80d9ffccafb..0dec4b05f95 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -947,6 +947,39 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       buildTree_rec(Operands, Depth + 1);
       return;
     }
+    case Instruction::Call: {
+      // Check if the calls are all to the same vectorizable intrinsic.
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(VL[0]);
+      if (II==NULL) {
+        newTreeEntry(VL, false);
+        DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+        return;
+      }
+
+      Intrinsic::ID ID = II->getIntrinsicID();
+
+      for (unsigned i = 1, e = VL.size(); i != e; ++i) {
+        IntrinsicInst *II2 = dyn_cast<IntrinsicInst>(VL[i]);
+        if (!II2 || II2->getIntrinsicID() != ID) {
+          newTreeEntry(VL, false);
+          DEBUG(dbgs() << "SLP: mismatched calls:" << *II << "!=" << *VL[i]
+                       << "\n");
+          return;
+        }
+      }
+
+      newTreeEntry(VL, true);
+      for (unsigned i = 0, e = II->getNumArgOperands(); i != e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j) {
+          IntrinsicInst *II2 = dyn_cast<IntrinsicInst>(VL[j]);
+          Operands.push_back(II2->getArgOperand(i));
+        }
+        buildTree_rec(Operands, Depth + 1);
+      }
+      return;
+    }
     default:
       newTreeEntry(VL, false);
       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
@@ -1072,6 +1105,30 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
       return VecStCost - ScalarStCost;
     }
+    case Instruction::Call: {
+      CallInst *CI = cast<CallInst>(VL0);
+      IntrinsicInst *II = cast<IntrinsicInst>(CI);
+      Intrinsic::ID ID = II->getIntrinsicID();
+
+      // Calculate the cost of the scalar and vector calls.
+      SmallVector<Type*, 4> ScalarTys, VecTys;
+      for (unsigned op = 0, opc = II->getNumArgOperands(); op!= opc; ++op) {
+        ScalarTys.push_back(CI->getArgOperand(op)->getType());
+        VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
+                                         VecTy->getNumElements()));
+      }
+
+      int ScalarCallCost = VecTy->getNumElements() *
+          TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
+
+      int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
+
+      DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
+            << " (" << VecCallCost  << "-" <<  ScalarCallCost << ")"
+            << " for " << *II << "\n");
+
+      return VecCallCost - ScalarCallCost;
+    }
     default:
       llvm_unreachable("Unknown instruction");
   }
@@ -1086,10 +1143,10 @@ bool BoUpSLP::isFullyVectorizableTinyTree() {
     return false;
 
   // Gathering cost would be too much for tiny trees.
-  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) 
-    return false; 
+  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
+    return false;
 
-  return true; 
+  return true;
 }
 
 int BoUpSLP::getTreeCost() {
@@ -1555,6 +1612,32 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       E->VectorizedValue = S;
       return propagateMetadata(S, E->Scalars);
     }
+    case Instruction::Call: {
+      CallInst *CI = cast<CallInst>(VL0);
+
+      setInsertPointAfterBundle(E->Scalars);
+      std::vector<Value *> OpVecs;
+      for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
+        ValueList OpVL;
+        for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+          CallInst *CEI = cast<CallInst>(E->Scalars[i]);
+          OpVL.push_back(CEI->getArgOperand(j));
+        }
+
+        Value *OpVec = vectorizeTree(OpVL);
+        DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+        OpVecs.push_back(OpVec);
+      }
+
+      Module *M = F->getParent();
+      IntrinsicInst *II = cast<IntrinsicInst>(CI);
+      Intrinsic::ID ID = II->getIntrinsicID();
+      Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
+      Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
+      Value *V = Builder.CreateCall(CF, OpVecs);
+      E->VectorizedValue = V;
+      return V;
+    }
     default:
     llvm_unreachable("unknown inst");
   }
diff --git a/test/Transforms/SLPVectorizer/X86/intrinsic.ll b/test/Transforms/SLPVectorizer/X86/intrinsic.ll
new file mode 100644
index 00000000000..2b7ee754df5
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/intrinsic.ll
@@ -0,0 +1,75 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-999 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+declare double @llvm.fabs.f64(double) nounwind readnone
+
+;CHECK-LABEL: @vec_fabs_f64(
+;CHECK: load <2 x double>
+;CHECK: load <2 x double>
+;CHECK: call <2 x double> @llvm.fabs.v2f64
+;CHECK: store <2 x double>
+;CHECK: ret
+define void @vec_fabs_f64(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %call = tail call double @llvm.fabs.f64(double %mul) nounwind readnone
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %call5 = tail call double @llvm.fabs.f64(double %mul5) nounwind readnone
+  store double %call, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %call5, double* %arrayidx5, align 8
+  ret void
+}
+
+declare float @llvm.copysign.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @vec_copysign_f32(
+;CHECK: load <4 x float>
+;CHECK: load <4 x float>
+;CHECK: call <4 x float> @llvm.copysign.v4f32
+;CHECK: store <4 x float>
+;CHECK: ret
+define void @vec_copysign_f32(float* %a, float* %b, float* noalias %c) {
+entry:
+  %0 = load float* %a, align 4
+  %1 = load float* %b, align 4
+  %call0 = tail call float @llvm.copysign.f32(float %0, float %1) nounwind readnone
+  store float %call0, float* %c, align 4
+
+  %ix2 = getelementptr inbounds float* %a, i64 1
+  %2 = load float* %ix2, align 4
+  %ix3 = getelementptr inbounds float* %b, i64 1
+  %3 = load float* %ix3, align 4
+  %call1 = tail call float @llvm.copysign.f32(float %2, float %3) nounwind readnone
+  %c1 = getelementptr inbounds float* %c, i64 1
+  store float %call1, float* %c1, align 4
+
+  %ix4 = getelementptr inbounds float* %a, i64 2
+  %4 = load float* %ix4, align 4
+  %ix5 = getelementptr inbounds float* %b, i64 2
+  %5 = load float* %ix5, align 4
+  %call2 = tail call float @llvm.copysign.f32(float %4, float %5) nounwind readnone
+  %c2 = getelementptr inbounds float* %c, i64 2
+  store float %call2, float* %c2, align 4
+
+  %ix6 = getelementptr inbounds float* %a, i64 3
+  %6 = load float* %ix6, align 4
+  %ix7 = getelementptr inbounds float* %b, i64 3
+  %7 = load float* %ix7, align 4
+  %call3 = tail call float @llvm.copysign.f32(float %6, float %7) nounwind readnone
+  %c3 = getelementptr inbounds float* %c, i64 3
+  store float %call3, float* %c3, align 4
+
+  ret void
+}
+
+
+