[NVPTX] run LSR before straight-line optimizations

Summary: Straight-line optimizations can simplify the loop body and make LSR's cost analysis more precise. This significantly improves several Eigen3 CUDA benchmarks. With this change, EigenContractionKernel runs up to 40% faster (753ceee5f2/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h (cl-502)). EigenConvolutionKernel2D runs up to 10% faster (753ceee5f2/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h (cl-605)). I have some difficulties writing small tests that benefit from this reordering due to a seemingly issue with LSR (being discussed at http://lists.cs.uiuc.edu/pipermail/llvmdev/2015-July/088244.html). See the review thread for the compilation time impact of GVN. Reviewers: eliben, jholewinski Subscribers: llvm-commits, jholewinski Differential Revision: http://reviews.llvm.org/D11304 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242982 91177308-0d34-0410-b5e6-96231b3b80d8
2025-08-05 13:26:55 +00:00 · 2015-07-23 04:59:07 +00:00
parent a95022a4f3
commit eb653bd9d1
1 changed files with 32 additions and 5 deletions
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -141,6 +141,10 @@ public:
  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
+
+private:
+  // if the opt level is aggressive, add GVN; otherwise, add EarlyCSE.
+  void addEarlyCSEOrGVNPass();
 };
 } // end anonymous namespace

@@ -155,6 +159,13 @@ TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
  });
 }

+void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
+  if (getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createGVNPass());
+  else
+    addPass(createEarlyCSEPass());
+}
+
 void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
@@ -166,9 +177,10 @@ void NVPTXPassConfig::addIRPasses() {
  disablePass(&TailDuplicateID);

  addPass(createNVPTXImageOptimizerPass());
-  TargetPassConfig::addIRPasses();
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());
+
+  // === Propagate special address spaces ===
  addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
  // NVPTXLowerKernelArgs emits alloca for byval parameters which can often
  // be eliminated by SROA.
@@ -179,6 +191,8 @@ void NVPTXPassConfig::addIRPasses() {
  // them unused. We could remove dead code in an ad-hoc manner, but that
  // requires manual work and might be error-prone.
  addPass(createDeadCodeEliminationPass());
+
+  // === Straight-line scalar optimizations ===
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunites for SLSR. See
@@ -187,15 +201,28 @@ void NVPTXPassConfig::addIRPasses() {
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
-  if (getOptLevel() == CodeGenOpt::Aggressive)
-    addPass(createGVNPass());
-  else
-    addPass(createEarlyCSEPass());
+  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
+
+  // === LSR and other generic IR passes ===
+  TargetPassConfig::addIRPasses();
+  // EarlyCSE is not always strong enough to clean up what LSR produces. For
+  // example, GVN can combine
+  //
+  //   %0 = add %a, %b
+  //   %1 = add %b, %a
+  //
+  // and
+  //
+  //   %0 = shl nsw %a, 2
+  //   %1 = shl %a, 2
+  //
+  // but EarlyCSE can do neither of them.
+  addEarlyCSEOrGVNPass();
 }

 bool NVPTXPassConfig::addInstSelector() {