[NVPTX] make load on global readonly memory to use ldg

Summary:
As described in [1], ld.global.nc may be used to load memory by nvcc when
__restrict__ is used and compiler can detect whether read-only data cache
is safe to use.

This patch will try to check whether ldg is safe to use and use them to
replace ld.global when possible. This change can improve the performance
by 18~29% on affected kernels (ratt*_kernel and rwdot*_kernel) in
the S3D benchmark of shoc [2].

Patched by Xuetian Weng. 

[1] http://docs.nvidia.com/cuda/kepler-tuning-guide/#read-only-data-cache
[2] https://github.com/vetter/shoc

Test Plan: test/CodeGen/NVPTX/load-with-non-coherent-cache.ll

Reviewers: jholewinski, jingyue

Subscribers: jholewinski, llvm-commits

Differential Revision: http://reviews.llvm.org/D11314

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242713 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Jingyue Wu 2015-07-20 21:28:54 +00:00
parent d94e17bde9
commit c9f86c1260
2 changed files with 246 additions and 0 deletions

View File

@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
#include "NVPTXISelDAGToDAG.h" #include "NVPTXISelDAGToDAG.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h" #include "llvm/IR/Instructions.h"
#include "llvm/Support/CommandLine.h" #include "llvm/Support/CommandLine.h"
@ -544,6 +545,21 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
return NVPTX::PTXLdStInstCode::GENERIC; return NVPTX::PTXLdStInstCode::GENERIC;
} }
// Returns true when this global-space load can be lowered to ld.global.nc
// (LDG), i.e. fetched through the GPU's non-coherent read-only data cache.
// This is safe only when the loaded memory provably cannot be written for
// the duration of the kernel.
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
unsigned codeAddrSpace, const DataLayout &DL) {
// LDG requires hardware support (per the accompanying test, sm_35 but not
// sm_20) and only applies to loads from the global address space.
if (!Subtarget.hasLDG() || codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL) {
return false;
}
// Check whether load operates on a readonly argument.
// GetUnderlyingObject strips GEPs/bitcasts to find the base pointer; if it
// is a kernel Argument carrying both readonly (never written through any
// copy) and noalias (no other pointer writes to it either), the read-only
// cache cannot observe a stale value, so LDG is safe.
bool canUseLDG = false;
if (const Argument *A = dyn_cast<const Argument>(
GetUnderlyingObject(N->getMemOperand()->getValue(), DL)))
canUseLDG = A->onlyReadsMemory() && A->hasNoAliasAttr();
return canUseLDG;
}
SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) { SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
switch (IID) { switch (IID) {
@ -638,6 +654,10 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
// Address Space Setting // Address Space Setting
unsigned int codeAddrSpace = getCodeAddrSpace(LD); unsigned int codeAddrSpace = getCodeAddrSpace(LD);
if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, CurDAG->getDataLayout())) {
return SelectLDGLDU(N);
}
// Volatile Setting // Volatile Setting
// - .volatile is only availalble for .global and .shared // - .volatile is only availalble for .global and .shared
bool isVolatile = LD->isVolatile(); bool isVolatile = LD->isVolatile();
@ -872,6 +892,10 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
// Address Space Setting // Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD); unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, CurDAG->getDataLayout())) {
return SelectLDGLDU(N);
}
// Volatile Setting // Volatile Setting
// - .volatile is only availalble for .global and .shared // - .volatile is only availalble for .global and .shared
bool IsVolatile = MemSD->isVolatile(); bool IsVolatile = MemSD->isVolatile();
@ -1425,6 +1449,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
switch (N->getOpcode()) { switch (N->getOpcode()) {
default: default:
return nullptr; return nullptr;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) { if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) { switch (EltVT.getSimpleVT().SimpleTy) {
@ -1474,6 +1499,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} }
} }
break; break;
case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2: case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) { switch (EltVT.getSimpleVT().SimpleTy) {
default: default:
@ -1522,6 +1548,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
break; break;
} }
break; break;
case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4: case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) { switch (EltVT.getSimpleVT().SimpleTy) {
default: default:
@ -1563,6 +1590,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
switch (N->getOpcode()) { switch (N->getOpcode()) {
default: default:
return nullptr; return nullptr;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) { if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) { switch (EltVT.getSimpleVT().SimpleTy) {
@ -1612,6 +1640,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} }
} }
break; break;
case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2: case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) { switch (EltVT.getSimpleVT().SimpleTy) {
default: default:
@ -1660,6 +1689,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
break; break;
} }
break; break;
case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4: case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) { switch (EltVT.getSimpleVT().SimpleTy) {
default: default:
@ -1707,6 +1737,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
switch (N->getOpcode()) { switch (N->getOpcode()) {
default: default:
return nullptr; return nullptr;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) { if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) { switch (EltVT.getSimpleVT().SimpleTy) {
@ -1756,6 +1787,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} }
} }
break; break;
case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2: case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) { switch (EltVT.getSimpleVT().SimpleTy) {
default: default:
@ -1804,6 +1836,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
break; break;
} }
break; break;
case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4: case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) { switch (EltVT.getSimpleVT().SimpleTy) {
default: default:
@ -1845,6 +1878,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
switch (N->getOpcode()) { switch (N->getOpcode()) {
default: default:
return nullptr; return nullptr;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_W_CHAIN:
if (IsLDG) { if (IsLDG) {
switch (EltVT.getSimpleVT().SimpleTy) { switch (EltVT.getSimpleVT().SimpleTy) {
@ -1894,6 +1928,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
} }
} }
break; break;
case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2: case NVPTXISD::LDGV2:
switch (EltVT.getSimpleVT().SimpleTy) { switch (EltVT.getSimpleVT().SimpleTy) {
default: default:
@ -1942,6 +1977,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) {
break; break;
} }
break; break;
case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4: case NVPTXISD::LDGV4:
switch (EltVT.getSimpleVT().SimpleTy) { switch (EltVT.getSimpleVT().SimpleTy) {
default: default:

View File

@ -0,0 +1,210 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefix=SM20 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck -check-prefix=SM35 %s
; Verifies that loads whose address derives from a kernel argument marked
; both noalias and readonly are emitted as ld.global.nc (read-only /
; non-coherent cache) on sm_35, while sm_20 — which lacks LDG support —
; keeps the plain ld.global form. Covers scalar, non-standard-width,
; v2/v4 vector, and pointer-typed loads.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"
; --- Scalar loads (foo1-foo5) ---
; SM20-LABEL: .visible .entry foo1(
; SM20: ld.global.f32
; SM35-LABEL: .visible .entry foo1(
; SM35: ld.global.nc.f32
define void @foo1(float * noalias readonly %from, float * %to) {
%1 = load float, float * %from
store float %1, float * %to
ret void
}
; SM20-LABEL: .visible .entry foo2(
; SM20: ld.global.f64
; SM35-LABEL: .visible .entry foo2(
; SM35: ld.global.nc.f64
define void @foo2(double * noalias readonly %from, double * %to) {
%1 = load double, double * %from
store double %1, double * %to
ret void
}
; SM20-LABEL: .visible .entry foo3(
; SM20: ld.global.u16
; SM35-LABEL: .visible .entry foo3(
; SM35: ld.global.nc.u16
define void @foo3(i16 * noalias readonly %from, i16 * %to) {
%1 = load i16, i16 * %from
store i16 %1, i16 * %to
ret void
}
; SM20-LABEL: .visible .entry foo4(
; SM20: ld.global.u32
; SM35-LABEL: .visible .entry foo4(
; SM35: ld.global.nc.u32
define void @foo4(i32 * noalias readonly %from, i32 * %to) {
%1 = load i32, i32 * %from
store i32 %1, i32 * %to
ret void
}
; SM20-LABEL: .visible .entry foo5(
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo5(
; SM35: ld.global.nc.u64
define void @foo5(i64 * noalias readonly %from, i64 * %to) {
%1 = load i64, i64 * %from
store i64 %1, i64 * %to
ret void
}
; i128 is non standard integer in nvptx64
; (legalized as two u64 loads, each of which should still use LDG)
; SM20-LABEL: .visible .entry foo6(
; SM20: ld.global.u64
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo6(
; SM35: ld.global.nc.u64
; SM35: ld.global.nc.u64
define void @foo6(i128 * noalias readonly %from, i128 * %to) {
%1 = load i128, i128 * %from
store i128 %1, i128 * %to
ret void
}
; --- v2 vector loads (foo7-foo12) ---
; SM20-LABEL: .visible .entry foo7(
; SM20: ld.global.v2.u8
; SM35-LABEL: .visible .entry foo7(
; SM35: ld.global.nc.v2.u8
define void @foo7(<2 x i8> * noalias readonly %from, <2 x i8> * %to) {
%1 = load <2 x i8>, <2 x i8> * %from
store <2 x i8> %1, <2 x i8> * %to
ret void
}
; SM20-LABEL: .visible .entry foo8(
; SM20: ld.global.v2.u16
; SM35-LABEL: .visible .entry foo8(
; SM35: ld.global.nc.v2.u16
define void @foo8(<2 x i16> * noalias readonly %from, <2 x i16> * %to) {
%1 = load <2 x i16>, <2 x i16> * %from
store <2 x i16> %1, <2 x i16> * %to
ret void
}
; SM20-LABEL: .visible .entry foo9(
; SM20: ld.global.v2.u32
; SM35-LABEL: .visible .entry foo9(
; SM35: ld.global.nc.v2.u32
define void @foo9(<2 x i32> * noalias readonly %from, <2 x i32> * %to) {
%1 = load <2 x i32>, <2 x i32> * %from
store <2 x i32> %1, <2 x i32> * %to
ret void
}
; SM20-LABEL: .visible .entry foo10(
; SM20: ld.global.v2.u64
; SM35-LABEL: .visible .entry foo10(
; SM35: ld.global.nc.v2.u64
define void @foo10(<2 x i64> * noalias readonly %from, <2 x i64> * %to) {
%1 = load <2 x i64>, <2 x i64> * %from
store <2 x i64> %1, <2 x i64> * %to
ret void
}
; SM20-LABEL: .visible .entry foo11(
; SM20: ld.global.v2.f32
; SM35-LABEL: .visible .entry foo11(
; SM35: ld.global.nc.v2.f32
define void @foo11(<2 x float> * noalias readonly %from, <2 x float> * %to) {
%1 = load <2 x float>, <2 x float> * %from
store <2 x float> %1, <2 x float> * %to
ret void
}
; SM20-LABEL: .visible .entry foo12(
; SM20: ld.global.v2.f64
; SM35-LABEL: .visible .entry foo12(
; SM35: ld.global.nc.v2.f64
define void @foo12(<2 x double> * noalias readonly %from, <2 x double> * %to) {
%1 = load <2 x double>, <2 x double> * %from
store <2 x double> %1, <2 x double> * %to
ret void
}
; --- v4 vector loads (foo13-foo17) ---
; SM20-LABEL: .visible .entry foo13(
; SM20: ld.global.v4.u8
; SM35-LABEL: .visible .entry foo13(
; SM35: ld.global.nc.v4.u8
define void @foo13(<4 x i8> * noalias readonly %from, <4 x i8> * %to) {
%1 = load <4 x i8>, <4 x i8> * %from
store <4 x i8> %1, <4 x i8> * %to
ret void
}
; SM20-LABEL: .visible .entry foo14(
; SM20: ld.global.v4.u16
; SM35-LABEL: .visible .entry foo14(
; SM35: ld.global.nc.v4.u16
define void @foo14(<4 x i16> * noalias readonly %from, <4 x i16> * %to) {
%1 = load <4 x i16>, <4 x i16> * %from
store <4 x i16> %1, <4 x i16> * %to
ret void
}
; SM20-LABEL: .visible .entry foo15(
; SM20: ld.global.v4.u32
; SM35-LABEL: .visible .entry foo15(
; SM35: ld.global.nc.v4.u32
define void @foo15(<4 x i32> * noalias readonly %from, <4 x i32> * %to) {
%1 = load <4 x i32>, <4 x i32> * %from
store <4 x i32> %1, <4 x i32> * %to
ret void
}
; SM20-LABEL: .visible .entry foo16(
; SM20: ld.global.v4.f32
; SM35-LABEL: .visible .entry foo16(
; SM35: ld.global.nc.v4.f32
define void @foo16(<4 x float> * noalias readonly %from, <4 x float> * %to) {
%1 = load <4 x float>, <4 x float> * %from
store <4 x float> %1, <4 x float> * %to
ret void
}
; <4 x double> is split into two v2.f64 loads; both should use LDG.
; SM20-LABEL: .visible .entry foo17(
; SM20: ld.global.v2.f64
; SM20: ld.global.v2.f64
; SM35-LABEL: .visible .entry foo17(
; SM35: ld.global.nc.v2.f64
; SM35: ld.global.nc.v2.f64
define void @foo17(<4 x double> * noalias readonly %from, <4 x double> * %to) {
%1 = load <4 x double>, <4 x double> * %from
store <4 x double> %1, <4 x double> * %to
ret void
}
; Pointer-typed load (lowered as u64 on nvptx64).
; SM20-LABEL: .visible .entry foo18(
; SM20: ld.global.u64
; SM35-LABEL: .visible .entry foo18(
; SM35: ld.global.nc.u64
define void @foo18(float ** noalias readonly %from, float ** %to) {
%1 = load float *, float ** %from
store float * %1, float ** %to
ret void
}
; Mark each function above as a CUDA kernel so it is emitted with the
; .visible .entry calling convention the CHECK lines match.
!nvvm.annotations = !{!1 ,!2 ,!3 ,!4 ,!5 ,!6, !7 ,!8 ,!9 ,!10 ,!11 ,!12, !13, !14, !15, !16, !17, !18}
!1 = !{void (float *, float *)* @foo1, !"kernel", i32 1}
!2 = !{void (double *, double *)* @foo2, !"kernel", i32 1}
!3 = !{void (i16 *, i16 *)* @foo3, !"kernel", i32 1}
!4 = !{void (i32 *, i32 *)* @foo4, !"kernel", i32 1}
!5 = !{void (i64 *, i64 *)* @foo5, !"kernel", i32 1}
!6 = !{void (i128 *, i128 *)* @foo6, !"kernel", i32 1}
!7 = !{void (<2 x i8> *, <2 x i8> *)* @foo7, !"kernel", i32 1}
!8 = !{void (<2 x i16> *, <2 x i16> *)* @foo8, !"kernel", i32 1}
!9 = !{void (<2 x i32> *, <2 x i32> *)* @foo9, !"kernel", i32 1}
!10 = !{void (<2 x i64> *, <2 x i64> *)* @foo10, !"kernel", i32 1}
!11 = !{void (<2 x float> *, <2 x float> *)* @foo11, !"kernel", i32 1}
!12 = !{void (<2 x double> *, <2 x double> *)* @foo12, !"kernel", i32 1}
!13 = !{void (<4 x i8> *, <4 x i8> *)* @foo13, !"kernel", i32 1}
!14 = !{void (<4 x i16> *, <4 x i16> *)* @foo14, !"kernel", i32 1}
!15 = !{void (<4 x i32> *, <4 x i32> *)* @foo15, !"kernel", i32 1}
!16 = !{void (<4 x float> *, <4 x float> *)* @foo16, !"kernel", i32 1}
!17 = !{void (<4 x double> *, <4 x double> *)* @foo17, !"kernel", i32 1}
!18 = !{void (float **, float **)* @foo18, !"kernel", i32 1}