Optimize away unnecessary address casts.

Removes unnecessary casts from non-generic address spaces to the generic address space for certain code patterns. Patch by Jingyue Wu. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@205571 91177308-0d34-0410-b5e6-96231b3b80d8
2024-09-30 19:55:11 +00:00 · 2014-04-03 21:18:25 +00:00 · 2014-04-03 21:18:25 +00:00 · 25540a7f39
commit 25540a7f39
parent 89218827c8
6 changed files with 299 additions and 2 deletions
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@ -9,6 +9,7 @@ tablegen(LLVM NVPTXGenSubtargetInfo.inc -gen-subtarget)
 add_public_tablegen_target(NVPTXCommonTableGen)

 set(NVPTXCodeGen_sources
+  NVPTXFavorNonGenericAddrSpaces.cpp
  NVPTXFrameLowering.cpp
  NVPTXInstrInfo.cpp
  NVPTXISelDAGToDAG.cpp
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@ -63,6 +63,7 @@ FunctionPass *
 createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel);
 ModulePass *createNVPTXAssignValidGlobalNamesPass();
 ModulePass *createGenericToNVVMPass();
+FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass();
 ModulePass *createNVVMReflectPass();
 ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
 MachineFunctionPass *createNVPTXPrologEpilogPass();
--- a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
+++ b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@ -0,0 +1,195 @@
+//===-- NVPTXFavorNonGenericAddrSpaces.cpp - --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When a load/store accesses the generic address space, check whether the
+// address is cast from a non-generic address space. If so, remove this
+// addrspacecast because accessing non-generic address spaces is typically
+// faster. Besides looking for addrspacecasts directly on the pointer operand,
+// this optimization also traces into the base pointer of a GEP.
+//
+// For instance, the code below loads a float from an array allocated in
+// addrspace(3).
+//
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float]* %0, i64 0, i64 %i
+// %2 = load float* %1 ; emits ld.f32
+//
+// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast
+// and the GEP to expose more optimization opportunities to function
+// optimizeMemoryInstruction. The intermediate code looks like:
+//
+// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %1 = addrspacecast float addrspace(3)* %0 to float*
+// %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly
+//
+// Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed
+// generic pointers, and folds the load and the addrspacecast into a load from
+// the original address space. The final code looks like:
+//
+// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %2 = load float addrspace(3)* %0 ; emits ld.shared.f32
+//
+// This pass may remove an addrspacecast in a different BB. Therefore, we
+// implement it as a FunctionPass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+// Hidden command-line switch that turns this pass into a no-op. It defaults to
+// false, i.e. the optimization runs unless the flag is given explicitly.
+static cl::opt<bool> DisableFavorNonGeneric(
+  "disable-nvptx-favor-non-generic",
+  cl::Hidden,
+  cl::desc("Do not convert generic address space usage "
+           "to non-generic address space usage"),
+  cl::init(false));
+
+namespace {
+/// \brief Function pass that makes loads and stores access a non-generic
+/// address space directly when their generic pointer operand is merely an
+/// addrspacecast (possibly hidden behind a GEP) of a non-generic pointer.
+class NVPTXFavorNonGenericAddrSpaces : public FunctionPass {
+public:
+  static char ID;
+  NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {}
+
+  /// Visits every load and store in F and tries to optimize its pointer
+  /// operand. Returns true if the IR was changed.
+  bool runOnFunction(Function &F) override;
+
+  /// Rewrites "gep (addrspacecast X), indices" as "addrspacecast (gep X,
+  /// indices)" so that the cast ends up directly on the pointer operand of the
+  /// memory instruction, where optimizeMemoryInstruction can fold it away.
+  /// Returns true if it changes the program.
+  bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP);
+
+  /// Optimizes one load or store. Idx is the operand index of the pointer
+  /// (0 for a load, 1 for a store). Returns true if it changes anything.
+  bool optimizeMemoryInstruction(Instruction *I, unsigned Idx);
+};
+}
+
+// Storage for the static ID member that the constructor hands to FunctionPass.
+char NVPTXFavorNonGenericAddrSpaces::ID = 0;
+
+// Forward declaration of the initializer generated by INITIALIZE_PASS below;
+// NVPTXTargetMachine.cpp declares the same function and calls it from
+// LLVMInitializeNVPTXTarget().
+namespace llvm {
+void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
+}
+// Registers the pass under the name "nvptx-favor-non-generic", which is the
+// flag the opt-based test uses to run it.
+// NOTE(review): the two trailing 'false' arguments are presumably the
+// "CFG-only" and "is analysis" flags -- confirm against the INITIALIZE_PASS
+// macro definition.
+INITIALIZE_PASS(NVPTXFavorNonGenericAddrSpaces, "nvptx-favor-non-generic",
+                "Remove unnecessary non-generic-to-generic addrspacecasts",
+                false, false)
+
+// Returns true when bypassing Cast is both valid and beneficial. Cast may be
+// an instruction or a constant expression. It qualifies only if it is an
+// addrspacecast from a non-generic address space to the generic one that
+// leaves the pointee type untouched.
+static bool IsEliminableAddrSpaceCast(Operator *Cast) {
+  // Anything that is not an addrspacecast is out of scope.
+  if (Cast->getOpcode() != Instruction::AddrSpaceCast)
+    return false;
+
+  PointerType *FromTy = cast<PointerType>(Cast->getOperand(0)->getType());
+  PointerType *ToTy = cast<PointerType>(Cast->getType());
+
+  // TODO: For now, we only handle the case where the addrspacecast only changes
+  // the address space but not the type. If the type also changes, we could
+  // still get rid of the addrspacecast by adding an extra bitcast, but we
+  // rarely see such scenarios.
+  const bool SamePointee = FromTy->getElementType() == ToTy->getElementType();
+
+  // The cast must go from a non-generic address space to the generic one.
+  const bool FromNonGeneric =
+      FromTy->getAddressSpace() != AddressSpace::ADDRESS_SPACE_GENERIC;
+  const bool ToGeneric =
+      ToTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC;
+
+  return SamePointee && FromNonGeneric && ToGeneric;
+}
+
+// Turns "gep (addrspacecast X), indices" into "addrspacecast (gep X, indices)".
+// GEP may be an instruction or a constant expression. Returns true if the
+// rewrite was performed, false if the base pointer of GEP is not an eliminable
+// addrspacecast. The original GEP is not erased here; it is only stripped of
+// its uses.
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
+    GEPOperator *GEP) {
+  Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand());
+  if (!Cast || !IsEliminableAddrSpaceCast(Cast))
+    return false;
+
+  Value *NonGenericBase = Cast->getOperand(0);
+  Type *GenericTy = GEP->getType();
+  SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
+
+  Value *Replacement;
+  if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) {
+    // %1 = gep (addrspacecast X), indices
+    // =>
+    // %0 = gep X, indices
+    // %1 = addrspacecast %0
+    // Both new instructions are inserted right before the old GEP.
+    GetElementPtrInst *NewGEPI =
+        GetElementPtrInst::Create(NonGenericBase, Indices, GEP->getName(), GEPI);
+    NewGEPI->setIsInBounds(GEP->isInBounds());
+    Replacement = new AddrSpaceCastInst(NewGEPI, GenericTy, "", GEPI);
+  } else {
+    // GEP is a constant expression, so its base must be a constant as well;
+    // build the reordered expression out of constant expressions.
+    Constant *NewGEPCE = ConstantExpr::getGetElementPtr(
+        cast<Constant>(NonGenericBase), Indices, GEP->isInBounds());
+    Replacement = ConstantExpr::getAddrSpaceCast(NewGEPCE, GenericTy);
+  }
+
+  GEP->replaceAllUsesWith(Replacement);
+  return true;
+}
+
+// Optimizes the load or store MI, whose pointer operand is operand number Idx
+// (0 for a load, 1 for a store). Returns true if the IR was modified in any
+// way, i.e. if an addrspacecast was hoisted out of a GEP and/or the pointer
+// operand of MI was replaced with a non-generic pointer.
+bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
+                                                               unsigned Idx) {
+  bool Changed = false;
+
+  // If the pointer operand is a GEP, hoist the addrspacecast (if any) out of
+  // the GEP to expose more optimization opportunities. Hoisting rewrites the
+  // IR by itself, so its result must be part of the return value even if the
+  // fold below does not apply afterwards.
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(MI->getOperand(Idx)))
+    Changed |= hoistAddrSpaceCastFromGEP(GEP);
+
+  // load/store (addrspacecast X) => load/store X if shortcutting the
+  // addrspacecast is valid and can improve performance.
+  //
+  // e.g.,
+  // %1 = addrspacecast float addrspace(3)* %0 to float*
+  // %2 = load float* %1
+  // ->
+  // %2 = load float addrspace(3)* %0
+  //
+  // Note: the addrspacecast can also be a constant expression. The operand is
+  // re-read here because hoisting may have replaced it.
+  if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) {
+    if (IsEliminableAddrSpaceCast(Cast)) {
+      MI->setOperand(Idx, Cast->getOperand(0));
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+// Pass entry point: tries to optimize the pointer operand of every load and
+// store in F. Returns true if any instruction was changed, and false right
+// away when the pass is disabled on the command line.
+bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
+  if (DisableFavorNonGeneric)
+    return false;
+
+  bool Modified = false;
+  for (Function::iterator BB = F.begin(), BBEnd = F.end(); BB != BBEnd; ++BB) {
+    for (BasicBlock::iterator Inst = BB->begin(), InstEnd = BB->end();
+         Inst != InstEnd; ++Inst) {
+      const bool IsLoad = isa<LoadInst>(Inst);
+      if (!IsLoad && !isa<StoreInst>(Inst))
+        continue;
+      // The pointer is operand 0 of "V = load P" and operand 1 of
+      // "store V, P".
+      Modified |= optimizeMemoryInstruction(Inst, IsLoad ? 0u : 1u);
+    }
+  }
+  return Modified;
+}
+
+// Factory for the pass, declared in NVPTX.h. Returns a newly allocated
+// NVPTXFavorNonGenericAddrSpaces instance; NVPTXPassConfig::addIRPasses()
+// hands the result to addPass().
+FunctionPass *llvm::createNVPTXFavorNonGenericAddrSpacesPass() {
+  return new NVPTXFavorNonGenericAddrSpaces();
+}
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@ -50,6 +50,7 @@ namespace llvm {
 void initializeNVVMReflectPass(PassRegistry&);
 void initializeGenericToNVVMPass(PassRegistry&);
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
+void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
 }

 extern "C" void LLVMInitializeNVPTXTarget() {
@ -62,6 +63,8 @@ extern "C" void LLVMInitializeNVPTXTarget() {
  initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
  initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());
  initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
+  initializeNVPTXFavorNonGenericAddrSpacesPass(
+    *PassRegistry::getPassRegistry());
 }

 static std::string computeDataLayout(const NVPTXSubtarget &ST) {
@ -143,6 +146,12 @@ void NVPTXPassConfig::addIRPasses() {
  TargetPassConfig::addIRPasses();
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());
+  addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+  // The FavorNonGenericAddrSpaces pass may remove instructions and leave some
+  // values unused. Therefore, we run a DCE pass right afterwards. We could
+  // remove unused values in an ad-hoc manner, but it requires manual work and
+  // might be error-prone.
+  addPass(createDeadCodeEliminationPass());
 }

 bool NVPTXPassConfig::addInstSelector() {
--- a/test/CodeGen/NVPTX/access-non-generic.ll
+++ b/test/CodeGen/NVPTX/access-non-generic.ll
@ -0,0 +1,91 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix PTX
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX
+; RUN: opt < %s -S -nvptx-favor-non-generic -dce | FileCheck %s --check-prefix IR
+
+@array = internal addrspace(3) global [10 x float] zeroinitializer, align 4
+@scalar = internal addrspace(3) global float 0.000000e+00, align 4
+
+; Verifies nvptx-favor-non-generic correctly optimizes generic address space
+; usage to non-generic address space usage for the patterns we claim to handle:
+; 1. load cast
+; 2. store cast
+; 3. load gep cast
+; 4. store gep cast
+; gep and cast can be an instruction or a constant expression. This function
+; tries all possible combinations.
+;
+; For the opt run (pass followed by -dce) no addrspacecast may remain in this
+; function; for the llc runs every access must use the .shared state space.
+define float @ld_st_shared_f32(i32 %i, float %v) {
+; IR-LABEL: @ld_st_shared_f32
+; IR-NOT: addrspacecast
+; PTX-LABEL: ld_st_shared_f32(
+  ; load cast
+  %1 = load float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
+  ; store cast
+  store float %v, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
+; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
+  ; use syncthreads to disable optimizations across components
+  call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+  ; cast; load
+  %2 = addrspacecast float addrspace(3)* @scalar to float*
+  %3 = load float* %2, align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
+  ; cast; store
+  store float %v, float* %2, align 4
+; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
+  call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+  ; load gep cast
+  %4 = load float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
+  ; store gep cast
+  store float %v, float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
+  call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+  ; gep cast; load
+  %5 = getelementptr inbounds [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5
+  %6 = load float* %5, align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
+  ; gep cast; store
+  store float %v, float* %5, align 4
+; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
+  call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+  ; cast; gep; load
+  %7 = addrspacecast [10 x float] addrspace(3)* @array to [10 x float]*
+  %8 = getelementptr inbounds [10 x float]* %7, i32 0, i32 %i
+  %9 = load float* %8, align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
+  ; cast; gep; store
+  store float %v, float* %8, align 4
+; PTX: st.shared.f32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}};
+  call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+  ; Fold every loaded value into the return value (presumably so that none of
+  ; the loads above is dead and gets removed before the checks can match it).
+  %sum2 = fadd float %1, %3
+  %sum3 = fadd float %sum2, %4
+  %sum4 = fadd float %sum3, %6
+  %sum5 = fadd float %sum4, %9
+  ret float %sum5
+}
+
+; Verifies nvptx-favor-non-generic keeps addrspacecasts between pointers of
+; different element types.
+;
+; The cast below goes from float addrspace(3)* to i32*, so the pointee type
+; changes and the pass does not treat it as eliminable. The checks therefore
+; expect the cast to survive in the opt output and a cvta.shared conversion to
+; show up in the llc output.
+define i32 @ld_int_from_float() {
+; IR-LABEL: @ld_int_from_float
+; IR: addrspacecast
+; PTX-LABEL: ld_int_from_float(
+; PTX: cvta.shared.u{{(32|64)}}
+  %1 = load i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
+  ret i32 %1
+}
+
+declare void @llvm.cuda.syncthreads() #3
+
+attributes #3 = { noduplicate nounwind }
+
--- a/test/CodeGen/NVPTX/addrspacecast.ll
+++ b/test/CodeGen/NVPTX/addrspacecast.ll
@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefix=PTX32
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefix=PTX64
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -disable-nvptx-favor-non-generic | FileCheck %s -check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -disable-nvptx-favor-non-generic | FileCheck %s -check-prefix=PTX64


 define i32 @conv1(i32 addrspace(1)* %ptr) {