From 25540a7f39846c4c11a9fd578b9d3686c568847c Mon Sep 17 00:00:00 2001
From: Eli Bendersky
Date: Thu, 3 Apr 2014 21:18:25 +0000
Subject: [PATCH] Optimize away unnecessary address casts.

Removes unnecessary casts from non-generic address spaces to the generic
address space for certain code patterns.

Patch by Jingyue Wu.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@205571 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/NVPTX/CMakeLists.txt               |   1 +
 lib/Target/NVPTX/NVPTX.h                      |   1 +
 .../NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp  | 195 ++++++++++++++++++
 lib/Target/NVPTX/NVPTXTargetMachine.cpp       |   9 +
 test/CodeGen/NVPTX/access-non-generic.ll      |  91 ++++++++
 test/CodeGen/NVPTX/addrspacecast.ll           |   4 +-
 6 files changed, 299 insertions(+), 2 deletions(-)
 create mode 100644 lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
 create mode 100644 test/CodeGen/NVPTX/access-non-generic.ll

diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index 029118acf24..915b6ce53ed 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -9,6 +9,7 @@ tablegen(LLVM NVPTXGenSubtargetInfo.inc -gen-subtarget)
 add_public_tablegen_target(NVPTXCommonTableGen)
 
 set(NVPTXCodeGen_sources
+  NVPTXFavorNonGenericAddrSpaces.cpp
   NVPTXFrameLowering.cpp
   NVPTXInstrInfo.cpp
   NVPTXISelDAGToDAG.cpp
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 8cbdd47b47e..cd9f96577ba 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -63,6 +63,7 @@ FunctionPass *
 createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel);
 ModulePass *createNVPTXAssignValidGlobalNamesPass();
 ModulePass *createGenericToNVVMPass();
+FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass();
 ModulePass *createNVVMReflectPass();
 ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
 MachineFunctionPass *createNVPTXPrologEpilogPass();
diff --git a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
new file mode 100644
index 00000000000..5ca2b58746d
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@@ -0,0 +1,195 @@
+//===-- NVPTXFavorNonGenericAddrSpaces.cpp --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When a load/store accesses the generic address space, checks whether the
+// address is cast from a non-generic address space. If so, remove this
+// addrspacecast because accessing non-generic address spaces is typically
+// faster. Besides seeking addrspacecasts, this optimization also traces into
+// the base pointer of a GEP.
+//
+// For instance, the code below loads a float from an array allocated in
+// addrspace(3).
+//
+//   %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+//   %1 = gep [10 x float]* %0, i64 0, i64 %i
+//   %2 = load float* %1 ; emits ld.f32
+//
+// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast
+// and the GEP to expose more optimization opportunities to function
+// optimizeMemoryInstruction.
+// The intermediate code looks like:
+//
+//   %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+//   %1 = addrspacecast float addrspace(3)* %0 to float*
+//   %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly
+//
+// Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed
+// generic pointers, and folds the load and the addrspacecast into a load from
+// the original address space. The final code looks like:
+//
+//   %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+//   %2 = load float addrspace(3)* %0 ; emits ld.shared.f32
+//
+// This pass may remove an addrspacecast in a different BB. Therefore, we
+// implement it as a FunctionPass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+// An option to disable this optimization. Enabled by default.
+static cl::opt<bool> DisableFavorNonGeneric(
+    "disable-nvptx-favor-non-generic",
+    cl::init(false),
+    cl::desc("Do not convert generic address space usage "
+             "to non-generic address space usage"),
+    cl::Hidden);
+
+namespace {
+/// \brief NVPTXFavorNonGenericAddrSpaces
+class NVPTXFavorNonGenericAddrSpaces : public FunctionPass {
+public:
+  static char ID;
+  NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {}
+
+  virtual bool runOnFunction(Function &F) override;
+
+  /// Optimizes load/store instructions. Idx is the index of the pointer operand
+  /// (0 for load, and 1 for store). Returns true if it changes anything.
+  bool optimizeMemoryInstruction(Instruction *I, unsigned Idx);
+  /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X,
+  /// indices)". This reordering exposes to optimizeMemoryInstruction more
+  /// optimization opportunities on loads and stores. Returns true if it changes
+  /// the program.
+  bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP);
+};
+}
+
+char NVPTXFavorNonGenericAddrSpaces::ID = 0;
+
+namespace llvm {
+void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
+}
+INITIALIZE_PASS(NVPTXFavorNonGenericAddrSpaces, "nvptx-favor-non-generic",
+                "Remove unnecessary non-generic-to-generic addrspacecasts",
+                false, false)
+
+// Decides whether removing Cast is valid and beneficial. Cast can be an
+// instruction or a constant expression.
+static bool IsEliminableAddrSpaceCast(Operator *Cast) {
+  // Returns false if not even an addrspacecast.
+  if (Cast->getOpcode() != Instruction::AddrSpaceCast)
+    return false;
+
+  Value *Src = Cast->getOperand(0);
+  PointerType *SrcTy = cast<PointerType>(Src->getType());
+  PointerType *DestTy = cast<PointerType>(Cast->getType());
+  // TODO: For now, we only handle the case where the addrspacecast only changes
+  // the address space but not the type. If the type also changes, we could
+  // still get rid of the addrspacecast by adding an extra bitcast, but we
+  // rarely see such scenarios.
+  if (SrcTy->getElementType() != DestTy->getElementType())
+    return false;
+
+  // Checks whether the addrspacecast is from a non-generic address space to the
+  // generic address space.
+  return (SrcTy->getAddressSpace() != AddressSpace::ADDRESS_SPACE_GENERIC &&
+          DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC);
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
+    GEPOperator *GEP) {
+  Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand());
+  if (Cast == nullptr)
+    return false;
+
+  if (!IsEliminableAddrSpaceCast(Cast))
+    return false;
+
+  SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
+  if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) {
+    // %1 = gep (addrspacecast X), indices
+    // =>
+    // %0 = gep X, indices
+    // %1 = addrspacecast %0
+    GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(Cast->getOperand(0),
+                                                           Indices,
+                                                           GEP->getName(),
+                                                           GEPI);
+    NewGEPI->setIsInBounds(GEP->isInBounds());
+    GEP->replaceAllUsesWith(
+        new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI));
+  } else {
+    // GEP is a constant expression.
+    Constant *NewGEPCE = ConstantExpr::getGetElementPtr(
+        cast<Constant>(Cast->getOperand(0)),
+        Indices,
+        GEP->isInBounds());
+    GEP->replaceAllUsesWith(
+        ConstantExpr::getAddrSpaceCast(NewGEPCE, GEP->getType()));
+  }
+
+  return true;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
+                                                               unsigned Idx) {
+  // If the pointer operand is a GEP, hoist the addrspacecast if any from the
+  // GEP to expose more optimization opportunities.
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(MI->getOperand(Idx))) {
+    hoistAddrSpaceCastFromGEP(GEP);
+  }
+
+  // load/store (addrspacecast X) => load/store X if shortcutting the
+  // addrspacecast is valid and can improve performance.
+  //
+  // e.g.,
+  //   %1 = addrspacecast float addrspace(3)* %0 to float*
+  //   %2 = load float* %1
+  // ->
+  //   %2 = load float addrspace(3)* %0
+  //
+  // Note: the addrspacecast can also be a constant expression.
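+  // Both forms are handled at once here: an AddrSpaceCastInst and an
+  // addrspacecast ConstantExpr are both Operators, so a single
+  // dyn_cast<Operator> covers the instruction and the constant expression.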
+  if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) {
+    if (IsEliminableAddrSpaceCast(Cast)) {
+      MI->setOperand(Idx, Cast->getOperand(0));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
+  if (DisableFavorNonGeneric)
+    return false;
+
+  bool Changed = false;
+  for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
+      if (isa<LoadInst>(I)) {
+        // V = load P
+        Changed |= optimizeMemoryInstruction(I, 0);
+      } else if (isa<StoreInst>(I)) {
+        // store V, P
+        Changed |= optimizeMemoryInstruction(I, 1);
+      }
+    }
+  }
+  return Changed;
+}
+
+FunctionPass *llvm::createNVPTXFavorNonGenericAddrSpacesPass() {
+  return new NVPTXFavorNonGenericAddrSpaces();
+}
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 7d7d79314c6..9aa7dbb4fab 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -50,6 +50,7 @@ namespace llvm {
 void initializeNVVMReflectPass(PassRegistry&);
 void initializeGenericToNVVMPass(PassRegistry&);
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
+void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
 }
 
 extern "C" void LLVMInitializeNVPTXTarget() {
@@ -62,6 +63,8 @@ extern "C" void LLVMInitializeNVPTXTarget() {
   initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
   initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());
   initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
+  initializeNVPTXFavorNonGenericAddrSpacesPass(
+      *PassRegistry::getPassRegistry());
 }
 
 static std::string computeDataLayout(const NVPTXSubtarget &ST) {
@@ -143,6 +146,12 @@ void NVPTXPassConfig::addIRPasses() {
   TargetPassConfig::addIRPasses();
   addPass(createNVPTXAssignValidGlobalNamesPass());
   addPass(createGenericToNVVMPass());
+  addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+  // The FavorNonGenericAddrSpaces pass may remove instructions and leave some
+  // values unused. Therefore, we run a DCE pass right afterwards. We could
+  // remove unused values in an ad-hoc manner, but it requires manual work and
+  // might be error-prone.
+  addPass(createDeadCodeEliminationPass());
 }
 
 bool NVPTXPassConfig::addInstSelector() {
diff --git a/test/CodeGen/NVPTX/access-non-generic.ll b/test/CodeGen/NVPTX/access-non-generic.ll
new file mode 100644
index 00000000000..0622aa3cb5f
--- /dev/null
+++ b/test/CodeGen/NVPTX/access-non-generic.ll
@@ -0,0 +1,91 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix PTX
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX
+; RUN: opt < %s -S -nvptx-favor-non-generic -dce | FileCheck %s --check-prefix IR
+
+@array = internal addrspace(3) global [10 x float] zeroinitializer, align 4
+@scalar = internal addrspace(3) global float 0.000000e+00, align 4
+
+; Verifies nvptx-favor-non-generic correctly optimizes generic address space
+; usage to non-generic address space usage for the patterns we claim to handle:
+; 1. load cast
+; 2. store cast
+; 3. load gep cast
+; 4. store gep cast
+; gep and cast can be an instruction or a constant expression. This function
+; tries all possible combinations.
+define float @ld_st_shared_f32(i32 %i, float %v) {
+; IR-LABEL: @ld_st_shared_f32
+; IR-NOT: addrspacecast
+; PTX-LABEL: ld_st_shared_f32(
+  ; load cast
+  %1 = load float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
+  ; store cast
+  store float %v, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
+; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
+  ; use syncthreads to disable optimizations across components
+  call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+  ; cast; load
+  %2 = addrspacecast float addrspace(3)* @scalar to float*
+  %3 = load float* %2, align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
+  ; cast; store
+  store float %v, float* %2, align 4
+; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
+  call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+  ; load gep cast
+  %4 = load float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
+  ; store gep cast
+  store float %v, float* getelementptr inbounds ([10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
+; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
+  call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+  ; gep cast; load
+  %5 = getelementptr inbounds [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5
+  %6 = load float* %5, align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
+  ; gep cast; store
+  store float %v, float* %5, align 4
+; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
+  call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+  ; cast; gep; load
+  %7 = addrspacecast [10 x float] addrspace(3)* @array to [10 x float]*
+  %8 = getelementptr inbounds [10 x float]* %7, i32 0, i32 %i
+  %9 = load float* %8, align 4
+; PTX: ld.shared.f32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
+  ; cast; gep; store
+  store float %v, float* %8, align 4
+; PTX: st.shared.f32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}};
+  call void @llvm.cuda.syncthreads()
+; PTX: bar.sync 0;
+
+  %sum2 = fadd float %1, %3
+  %sum3 = fadd float %sum2, %4
+  %sum4 = fadd float %sum3, %6
+  %sum5 = fadd float %sum4, %9
+  ret float %sum5
+}
+
+; Verifies nvptx-favor-non-generic keeps addrspacecasts between pointers of
+; different element types.
+define i32 @ld_int_from_float() {
+; IR-LABEL: @ld_int_from_float
+; IR: addrspacecast
+; PTX-LABEL: ld_int_from_float(
+; PTX: cvta.shared.u{{(32|64)}}
+  %1 = load i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
+  ret i32 %1
+}
+
+declare void @llvm.cuda.syncthreads() #3
+
+attributes #3 = { noduplicate nounwind }
diff --git a/test/CodeGen/NVPTX/addrspacecast.ll b/test/CodeGen/NVPTX/addrspacecast.ll
index 98ea655969c..03b9a984475 100644
--- a/test/CodeGen/NVPTX/addrspacecast.ll
+++ b/test/CodeGen/NVPTX/addrspacecast.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -check-prefix=PTX32
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s -check-prefix=PTX64
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -disable-nvptx-favor-non-generic | FileCheck %s -check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -disable-nvptx-favor-non-generic | FileCheck %s -check-prefix=PTX64
 
 define i32 @conv1(i32 addrspace(1)* %ptr) {
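
To watch the transformation in isolation, the pass can be driven the same way
the RUN line in access-non-generic.ll drives it. The sketch below assumes a
build that includes this patch; the file name kernel.ll and the names @g and
@f are hypothetical, chosen only for illustration:

  opt -S -nvptx-favor-non-generic -dce kernel.ll

  ; kernel.ll (input): a generic-space load whose address is an
  ; addrspacecast from shared memory (addrspace(3)).
  @g = internal addrspace(3) global float 0.000000e+00, align 4

  define float @f() {
    %p = addrspacecast float addrspace(3)* @g to float*
    %v = load float* %p, align 4
    ret float %v
  }

  ; Output: the pass folds the cast into the load, and -dce deletes the
  ; now-unused addrspacecast, so the load targets addrspace(3) directly
  ; and llc can emit ld.shared.f32 instead of a generic ld.f32.
  define float @f() {
    %v = load float addrspace(3)* @g, align 4
    ret float %v
  }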