From 24e874a1dd608b6ab53bfcd78088efa9d6e009fb Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 14 Nov 2014 20:08:52 +0000 Subject: [PATCH] R600/SI: Combine min3/max3 instructions git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@222032 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPUISelLowering.cpp | 6 ++ lib/Target/R600/AMDGPUISelLowering.h | 6 ++ lib/Target/R600/AMDGPUInstrInfo.td | 33 +++++++- lib/Target/R600/SIISelLowering.cpp | 68 +++++++++++++++ lib/Target/R600/SIISelLowering.h | 2 + lib/Target/R600/SIInstructions.td | 30 +++++-- test/CodeGen/R600/fmax3.ll | 38 +++++++++ test/CodeGen/R600/fmin3.ll | 38 +++++++++ test/CodeGen/R600/max3.ll | 41 +++++++++ test/CodeGen/R600/min3.ll | 111 +++++++++++++++++++++++++ 10 files changed, 363 insertions(+), 10 deletions(-) create mode 100644 test/CodeGen/R600/fmax3.ll create mode 100644 test/CodeGen/R600/fmin3.ll create mode 100644 test/CodeGen/R600/max3.ll create mode 100644 test/CodeGen/R600/min3.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index f153991b842..83083786fe8 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -2374,6 +2374,12 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMIN_LEGACY) NODE_NAME_CASE(SMIN) NODE_NAME_CASE(UMIN) + NODE_NAME_CASE(FMAX3) + NODE_NAME_CASE(SMAX3) + NODE_NAME_CASE(UMAX3) + NODE_NAME_CASE(FMIN3) + NODE_NAME_CASE(SMIN3) + NODE_NAME_CASE(UMIN3) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index e848d2cfb67..98850db56f1 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -210,6 +210,12 @@ enum { FMIN_LEGACY, SMIN, UMIN, + FMAX3, + SMAX3, + UMAX3, + FMIN3, + SMIN3, + UMIN3, URECIP, DIV_SCALE, DIV_FMAS, diff --git a/lib/Target/R600/AMDGPUInstrInfo.td 
b/lib/Target/R600/AMDGPUInstrInfo.td index 037767d1087..4ee0f2b31cc 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -84,7 +84,7 @@ def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, [SDNPAssociative] >; -// out = min(a, b) a snd b are signed ints +// out = min(a, b) a and b are signed ints def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp, [SDNPCommutative, SDNPAssociative] >; @@ -94,6 +94,37 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp, [SDNPCommutative, SDNPAssociative] >; +// FIXME: TableGen doesn't like commutative instructions with more +// than 2 operands. +// out = max(a, b, c) a, b and c are floats +def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b, and c are signed ints +def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = max(a, b, c) a, b and c are unsigned ints +def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are floats +def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are signed ints +def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; + +// out = min(a, b, c) a, b and c are unsigned ints +def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp, + [/*SDNPCommutative, SDNPAssociative*/] +>; def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0", SDTIntToFPOp, []>; diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index d8e424936da..f5654a7aabc 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -231,6 +231,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : 
setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); @@ -1314,6 +1316,61 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); } +static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { + switch (Opc) { + case ISD::FMAXNUM: + return AMDGPUISD::FMAX3; + case AMDGPUISD::SMAX: + return AMDGPUISD::SMAX3; + case AMDGPUISD::UMAX: + return AMDGPUISD::UMAX3; + case ISD::FMINNUM: + return AMDGPUISD::FMIN3; + case AMDGPUISD::SMIN: + return AMDGPUISD::SMIN3; + case AMDGPUISD::UMIN: + return AMDGPUISD::UMIN3; + default: + llvm_unreachable("Not a min/max opcode"); + } +} + +SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + + unsigned Opc = N->getOpcode(); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // Only do this if the inner op has one use since otherwise this will just increase + // register pressure for no benefit. + + // max(max(a, b), c) + if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0.getOperand(0), + Op0.getOperand(1), + Op1); + } + + // max(a, max(b, c)) + if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0, + Op1.getOperand(0), + Op1.getOperand(1)); + } + + return SDValue(); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -1341,6 +1398,17 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, } break; } + case ISD::FMAXNUM: // TODO: What about fmax_legacy? 
+ case ISD::FMINNUM: + case AMDGPUISD::SMAX: + case AMDGPUISD::SMIN: + case AMDGPUISD::UMAX: + case AMDGPUISD::UMIN: { + if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && + getTargetMachine().getOptLevel() > CodeGenOpt::None) + return performMin3Max3Combine(N, DCI); + break; + } case AMDGPUISD::CVT_F32_UBYTE0: case AMDGPUISD::CVT_F32_UBYTE1: diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index 0ba89e8d717..7bf406e5af9 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -59,6 +59,8 @@ class SITargetLowering : public AMDGPUTargetLowering { unsigned AS, DAGCombinerInfo &DCI) const; + SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; + public: SITargetLowering(TargetMachine &tm); diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 6dcb6de0797..f6c62d22cab 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -1573,15 +1573,27 @@ defm V_ALIGNBYTE_B32 : VOP3Inst , "v_alignbyte_b32", >; defm V_MULLIT_F32 : VOP3Inst , "v_mullit_f32", VOP_F32_F32_F32_F32>; -////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "v_min3_f32", []>; -////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "v_min3_i32", []>; -////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "v_min3_u32", []>; -////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "v_max3_f32", []>; -////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "v_max3_i32", []>; -////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "v_max3_u32", []>; -////def V_MED3_F32 : VOP3_MED3 <0x00000157, "v_med3_f32", []>; -////def V_MED3_I32 : VOP3_MED3 <0x00000158, "v_med3_i32", []>; -////def V_MED3_U32 : VOP3_MED3 <0x00000159, "v_med3_u32", []>; +defm V_MIN3_F32 : VOP3Inst , "v_min3_f32", + VOP_F32_F32_F32_F32, AMDGPUfmin3>; + +defm V_MIN3_I32 : VOP3Inst , "v_min3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmin3 +>; +defm V_MIN3_U32 : VOP3Inst , "v_min3_u32", + VOP_I32_I32_I32_I32, AMDGPUumin3 +>; +defm V_MAX3_F32 : VOP3Inst , "v_max3_f32", + 
VOP_F32_F32_F32_F32, AMDGPUfmax3 +>; +defm V_MAX3_I32 : VOP3Inst , "v_max3_i32", + VOP_I32_I32_I32_I32, AMDGPUsmax3 +>; +defm V_MAX3_U32 : VOP3Inst , "v_max3_u32", + VOP_I32_I32_I32_I32, AMDGPUumax3 +>; +//def V_MED3_F32 : VOP3_MED3 <0x00000157, "v_med3_f32", []>; +//def V_MED3_I32 : VOP3_MED3 <0x00000158, "v_med3_i32", []>; +//def V_MED3_U32 : VOP3_MED3 <0x00000159, "v_med3_u32", []>; //def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; //def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>; //def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>; diff --git a/test/CodeGen/R600/fmax3.ll b/test/CodeGen/R600/fmax3.ll new file mode 100644 index 00000000000..cf371b35856 --- /dev/null +++ b/test/CodeGen/R600/fmax3.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.maxnum.f32(float, float) nounwind readnone + +; SI-LABEL: {{^}}test_fmax3_olt_0: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float addrspace(1)* %aptr, align 4 + %b = load float addrspace(1)* %bptr, align 4 + %c = load float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} + +; Commute operand of second fmax +; SI-LABEL: {{^}}test_fmax3_olt_1: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], 
+; SI: s_endpgm +define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float addrspace(1)* %aptr, align 4 + %b = load float addrspace(1)* %bptr, align 4 + %c = load float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/fmin3.ll b/test/CodeGen/R600/fmin3.ll new file mode 100644 index 00000000000..7420368883f --- /dev/null +++ b/test/CodeGen/R600/fmin3.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +declare float @llvm.minnum.f32(float, float) nounwind readnone + +; SI-LABEL: {{^}}test_fmin3_olt_0: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float addrspace(1)* %aptr, align 4 + %b = load float addrspace(1)* %bptr, align 4 + %c = load float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} + +; Commute operand of second fmin +; SI-LABEL: {{^}}test_fmin3_olt_1: +; SI: buffer_load_dword [[REGA:v[0-9]+]] +; SI: buffer_load_dword [[REGB:v[0-9]+]] +; SI: buffer_load_dword [[REGC:v[0-9]+]] +; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI: buffer_store_dword [[RESULT]], +; SI: s_endpgm +define void 
@test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { + %a = load float addrspace(1)* %aptr, align 4 + %b = load float addrspace(1)* %bptr, align 4 + %c = load float addrspace(1)* %cptr, align 4 + %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone + %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone + store float %f1, float addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/max3.ll b/test/CodeGen/R600/max3.ll new file mode 100644 index 00000000000..74b08f675d8 --- /dev/null +++ b/test/CodeGen/R600/max3.ll @@ -0,0 +1,41 @@ +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imax3_sgt_i32 +; SI: v_max3_i32 +define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid + %a = load i32 addrspace(1)* %gep0, align 4 + %b = load i32 addrspace(1)* %gep1, align 4 + %c = load i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp sgt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp sgt i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umax3_ugt_i32 +; SI: v_max3_u32 +define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid + %gep1 = 
getelementptr i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid + %a = load i32 addrspace(1)* %gep0, align 4 + %b = load i32 addrspace(1)* %gep1, align 4 + %c = load i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp ugt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp ugt i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/min3.ll b/test/CodeGen/R600/min3.ll new file mode 100644 index 00000000000..f852cffb2d4 --- /dev/null +++ b/test/CodeGen/R600/min3.ll @@ -0,0 +1,111 @@ +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare i32 @llvm.r600.read.tidig.x() nounwind readnone + +; FUNC-LABEL: @v_test_imin3_slt_i32 +; SI: v_min3_i32 +define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid + %a = load i32 addrspace(1)* %gep0, align 4 + %b = load i32 addrspace(1)* %gep1, align 4 + %c = load i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp slt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp slt i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin3_ult_i32 +; SI: v_min3_u32 +define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %gep0 = getelementptr i32 addrspace(1)* %aptr, 
i32 %tid + %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid + %a = load i32 addrspace(1)* %gep0, align 4 + %b = load i32 addrspace(1)* %gep1, align 4 + %c = load i32 addrspace(1)* %gep2, align 4 + %icmp0 = icmp ult i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + %icmp1 = icmp ult i32 %i0, %c + %i1 = select i1 %icmp1, i32 %i0, i32 %c + store i32 %i1, i32 addrspace(1)* %outgep, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin_umin_umin +; SI: v_min_i32 +; SI: v_min3_i32 +define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone + %tid2 = mul i32 %tid, 2 + %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid + + %gep3 = getelementptr i32 addrspace(1)* %aptr, i32 %tid2 + %gep4 = getelementptr i32 addrspace(1)* %bptr, i32 %tid2 + %gep5 = getelementptr i32 addrspace(1)* %cptr, i32 %tid2 + + %outgep0 = getelementptr i32 addrspace(1)* %out, i32 %tid + %outgep1 = getelementptr i32 addrspace(1)* %out, i32 %tid2 + + %a = load i32 addrspace(1)* %gep0, align 4 + %b = load i32 addrspace(1)* %gep1, align 4 + %c = load i32 addrspace(1)* %gep2, align 4 + %d = load i32 addrspace(1)* %gep3, align 4 + + %icmp0 = icmp slt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + + %icmp1 = icmp slt i32 %c, %d + %i1 = select i1 %icmp1, i32 %c, i32 %d + + %icmp2 = icmp slt i32 %i0, %i1 + %i2 = select i1 %icmp2, i32 %i0, i32 %i1 + + store i32 %i2, i32 addrspace(1)* %outgep1, align 4 + ret void +} + +; FUNC-LABEL: @v_test_umin3_2_uses +; SI-NOT: v_min3 +define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { + %tid 
= call i32 @llvm.r600.read.tidig.x() nounwind readnone + %tid2 = mul i32 %tid, 2 + %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid + + %gep3 = getelementptr i32 addrspace(1)* %aptr, i32 %tid2 + %gep4 = getelementptr i32 addrspace(1)* %bptr, i32 %tid2 + %gep5 = getelementptr i32 addrspace(1)* %cptr, i32 %tid2 + + %outgep0 = getelementptr i32 addrspace(1)* %out, i32 %tid + %outgep1 = getelementptr i32 addrspace(1)* %out, i32 %tid2 + + %a = load i32 addrspace(1)* %gep0, align 4 + %b = load i32 addrspace(1)* %gep1, align 4 + %c = load i32 addrspace(1)* %gep2, align 4 + %d = load i32 addrspace(1)* %gep3, align 4 + + %icmp0 = icmp slt i32 %a, %b + %i0 = select i1 %icmp0, i32 %a, i32 %b + + %icmp1 = icmp slt i32 %c, %d + %i1 = select i1 %icmp1, i32 %c, i32 %d + + %icmp2 = icmp slt i32 %i0, %c + %i2 = select i1 %icmp2, i32 %i0, i32 %c + + store i32 %i2, i32 addrspace(1)* %outgep0, align 4 + store i32 %i0, i32 addrspace(1)* %outgep1, align 4 + ret void +}