llvm-6502/test/CodeGen/R600/indirect-private-64.ll
commit 5fbf09a69f by Matt Arsenault: R600: Add dag combine for copy of an illegal type.
This helps avoid redundant instructions to unpack and repack the
vectors; ideally we could recognize that pattern and eliminate it
outright. Currently v4i8 and other small-element-type vectors are
scalarized, so this has the added bonus of avoiding that.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213031 91177308-0d34-0410-b5e6-96231b3b80d8
2014-07-15 02:06:31 +00:00

; RUN: llc -march=r600 -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
; RUN: llc -march=r600 -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
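
; The two RUN lines exercise both lowerings of an indirectly indexed
; private array: with the promote-alloca feature disabled (SI-ALLOCA)
; the alloca stays in private memory and the variable index is handled
; with V_MOVRELD/V_MOVRELS indirect register addressing; with it
; enabled (SI-PROMOTE) the alloca is promoted to LDS and accessed with
; DS_WRITE/DS_READ. In each test, the barrier between the store and
; the re-load keeps the two accesses from being folded together.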

declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind

; SI-LABEL: @private_access_f64_alloca:
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-PROMOTE: DS_WRITE_B64
; SI-PROMOTE: DS_READ_B64
define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
  %val = load double addrspace(1)* %in, align 8
  %array = alloca double, i32 16, align 8
  %ptr = getelementptr double* %array, i32 %b
  store double %val, double* %ptr, align 8
  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
  %result = load double* %ptr, align 8
  store double %result, double addrspace(1)* %out, align 8
  ret void
}
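
; For the 128-bit vector cases (v2f64 here and v2i64 below), the
; promoted-to-LDS form is checked as four 32-bit DS operations per
; access rather than wider reads and writes.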
; SI-LABEL: @private_access_v2f64_alloca:
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
  %val = load <2 x double> addrspace(1)* %in, align 16
  %array = alloca <2 x double>, i32 16, align 16
  %ptr = getelementptr <2 x double>* %array, i32 %b
  store <2 x double> %val, <2 x double>* %ptr, align 16
  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
  %result = load <2 x double>* %ptr, align 16
  store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
  ret void
}

; SI-LABEL: @private_access_i64_alloca:
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-PROMOTE: DS_WRITE_B64
; SI-PROMOTE: DS_READ_B64
define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
  %val = load i64 addrspace(1)* %in, align 8
  %array = alloca i64, i32 16, align 8
  %ptr = getelementptr i64* %array, i32 %b
  store i64 %val, i64* %ptr, align 8
  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
  %result = load i64* %ptr, align 8
  store i64 %result, i64 addrspace(1)* %out, align 8
  ret void
}

; SI-LABEL: @private_access_v2i64_alloca:
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELD_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-ALLOCA: V_MOVRELS_B32_e32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_WRITE_B32
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
; SI-PROMOTE: DS_READ_B32
define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
  %val = load <2 x i64> addrspace(1)* %in, align 16
  %array = alloca <2 x i64>, i32 16, align 16
  %ptr = getelementptr <2 x i64>* %array, i32 %b
  store <2 x i64> %val, <2 x i64>* %ptr, align 16
  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
  %result = load <2 x i64>* %ptr, align 16
  store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
  ret void
}