diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index ddf251f38bf..743bf6f7a92 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -273,6 +273,16 @@ bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
          (Dest->getPrimitiveSizeInBits() % 32 == 0);
 }
 
+bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
+  // limited number of native 64-bit operations. Shrinking an operation to fit
+  // in a single 32-bit register should always be helpful. As currently used,
+  // this is much less general than the name suggests, and is only used in
+  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
+  // not profitable, and may actually be harmful.
+  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+}
+
 //===---------------------------------------------------------------------===//
 // TargetLowering Callbacks
 //===---------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 2595c51d166..a2504ef1661 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -87,6 +87,8 @@ public:
   virtual bool isFNegFree(EVT VT) const override;
   virtual bool isTruncateFree(EVT Src, EVT Dest) const override;
   virtual bool isTruncateFree(Type *Src, Type *Dest) const override;
+  virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+
   virtual MVT getVectorIdxTy() const override;
   virtual bool isLoadBitCastBeneficial(EVT, EVT) const override;
   virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
diff --git a/test/CodeGen/R600/trunc.ll b/test/CodeGen/R600/trunc.ll
index 6bbd7f7b510..e775fe7c989 100644
--- a/test/CodeGen/R600/trunc.ll
+++ b/test/CodeGen/R600/trunc.ll
@@ -16,14 +16,27 @@ define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
   ret void
 }
 
+; SI-LABEL: @trunc_load_shl_i64:
+; SI-DAG: S_LOAD_DWORDX2
+; SI-DAG: S_LOAD_DWORD [[SREG:s[0-9]+]],
+; SI: S_LSHL_B32 [[SHL:s[0-9]+]], [[SREG]], 2
+; SI: V_MOV_B32_e32 [[VSHL:v[0-9]+]], [[SHL]]
+; SI: BUFFER_STORE_DWORD [[VSHL]],
+define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
+  %b = shl i64 %a, 2
+  %result = trunc i64 %b to i32
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 ; SI-LABEL: @trunc_shl_i64:
-; SI: S_LOAD_DWORDX2
-; SI: S_LOAD_DWORDX2 [[SREG:s\[[0-9]+:[0-9]+\]]]
-; SI: S_LSHL_B64 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, [[SREG]], 2
-; SI: MOV_B32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; SI: S_LOAD_DWORDX2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}},
+; SI: V_ADD_I32_e32 v[[LO_ADD:[0-9]+]], s[[LO_SREG]],
+; SI: V_LSHL_B64 v{{\[}}[[LO_VREG:[0-9]+]]:{{[0-9]+\]}}, v{{\[}}[[LO_ADD]]:{{[0-9]+\]}}, 2
 ; SI: BUFFER_STORE_DWORD v[[LO_VREG]],
 define void @trunc_shl_i64(i32 addrspace(1)* %out, i64 %a) {
-  %b = shl i64 %a, 2
+  %aa = add i64 %a, 234 ; Prevent shrinking store.
+  %b = shl i64 %aa, 2
   %result = trunc i64 %b to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void