diff --git a/include/llvm/IR/IntrinsicsPowerPC.td b/include/llvm/IR/IntrinsicsPowerPC.td
index 5f8cda5e5f4..43ee7d27da7 100644
--- a/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/include/llvm/IR/IntrinsicsPowerPC.td
@@ -516,6 +516,18 @@ def int_ppc_altivec_vrsqrtefp : PowerPC_Vec_FF_Intrinsic<"vrsqrtefp">;
 
 let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
 
+// Vector load.
+def int_ppc_vsx_lxvw4x :
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+def int_ppc_vsx_lxvd2x :
+      Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+
+// Vector store.
+def int_ppc_vsx_stxvw4x :
+      Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
+def int_ppc_vsx_stxvd2x :
+      Intrinsic<[], [llvm_v2f64_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
+
 // Vector maximum.
 def int_ppc_vsx_xvmaxdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmaxdp">;
 def int_ppc_vsx_xvmaxsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvmaxsp">;
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 0984e60cd2c..0c844be3c55 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7583,8 +7583,12 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
     default: return false;
     case Intrinsic::ppc_altivec_lvx:
     case Intrinsic::ppc_altivec_lvxl:
+    case Intrinsic::ppc_vsx_lxvw4x:
       VT = MVT::v4i32;
       break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
     case Intrinsic::ppc_altivec_lvebx:
       VT = MVT::i8;
       break;
@@ -7605,8 +7609,12 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
     default: return false;
     case Intrinsic::ppc_altivec_stvx:
     case Intrinsic::ppc_altivec_stvxl:
+    case Intrinsic::ppc_vsx_stxvw4x:
       VT = MVT::v4i32;
       break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
     case Intrinsic::ppc_altivec_stvebx:
       VT = MVT::i8;
       break;
@@ -9094,7 +9102,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::ppc_altivec_lvxl:
   case Intrinsic::ppc_altivec_lvebx:
   case Intrinsic::ppc_altivec_lvehx:
-  case Intrinsic::ppc_altivec_lvewx: {
+  case Intrinsic::ppc_altivec_lvewx:
+  case Intrinsic::ppc_vsx_lxvd2x:
+  case Intrinsic::ppc_vsx_lxvw4x: {
     EVT VT;
     switch (Intrinsic) {
     case Intrinsic::ppc_altivec_lvebx:
@@ -9106,6 +9116,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     case Intrinsic::ppc_altivec_lvewx:
       VT = MVT::i32;
       break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
     default:
       VT = MVT::v4i32;
       break;
@@ -9126,7 +9139,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::ppc_altivec_stvxl:
   case Intrinsic::ppc_altivec_stvebx:
   case Intrinsic::ppc_altivec_stvehx:
-  case Intrinsic::ppc_altivec_stvewx: {
+  case Intrinsic::ppc_altivec_stvewx:
+  case Intrinsic::ppc_vsx_stxvd2x:
+  case Intrinsic::ppc_vsx_stxvw4x: {
     EVT VT;
     switch (Intrinsic) {
     case Intrinsic::ppc_altivec_stvebx:
@@ -9138,6 +9153,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     case Intrinsic::ppc_altivec_stvewx:
       VT = MVT::i32;
       break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
     default:
       VT = MVT::v4i32;
       break;
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 3dfdf0806e3..522e0de7386 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -55,7 +55,7 @@ let Uses = [RM] in {
   def LXVD2X : XX1Form<31, 844,
                        (outs vsrc:$XT), (ins memrr:$src),
                        "lxvd2x $XT, $src", IIC_LdStLFD,
-                       [(set v2f64:$XT, (load xoaddr:$src))]>;
+                       [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
 
   def LXVDSX : XX1Form<31, 332,
                        (outs vsrc:$XT), (ins memrr:$src),
@@ -64,7 +64,7 @@ let Uses = [RM] in {
   def LXVW4X : XX1Form<31, 780,
                        (outs vsrc:$XT), (ins memrr:$src),
                        "lxvw4x $XT, $src", IIC_LdStLFD,
-                       [(set v4i32:$XT, (load xoaddr:$src))]>;
+                       [(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>;
 }
 
 // Store indexed instructions
@@ -77,12 +77,12 @@ let Uses = [RM] in {
   def STXVD2X : XX1Form<31, 972,
                         (outs), (ins vsrc:$XT, memrr:$dst),
                         "stxvd2x $XT, $dst", IIC_LdStSTFD,
-                        [(store v2f64:$XT, xoaddr:$dst)]>;
+                        [(int_ppc_vsx_stxvd2x v2f64:$XT, xoaddr:$dst)]>;
 
   def STXVW4X : XX1Form<31, 908,
                         (outs), (ins vsrc:$XT, memrr:$dst),
                         "stxvw4x $XT, $dst", IIC_LdStSTFD,
-                        [(store v4i32:$XT, xoaddr:$dst)]>;
+                        [(int_ppc_vsx_stxvw4x v4i32:$XT, xoaddr:$dst)]>;
 }
 
 // Add/Mul Instructions
@@ -851,11 +851,14 @@ def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
           (XVCVSXWDP (XXSLDWI $C, $C, 1))>;
 
 // Loads.
+def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
 def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
 
 // Stores.
-def : Pat<(store v4i32:$rS, xoaddr:$dst),
-          (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
 
 // Selects.
 def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8987ee0e848..87e49a11e6e 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -613,6 +613,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return new LoadInst(Ptr);
     }
     break;
+  case Intrinsic::ppc_vsx_lxvw4x:
+  case Intrinsic::ppc_vsx_lxvd2x: {
+    // Turn PPC VSX loads into normal loads.
+    Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
+                                        PointerType::getUnqual(II->getType()));
+    return new LoadInst(Ptr, Twine(""), false, 1);
+  }
   case Intrinsic::ppc_altivec_stvx:
   case Intrinsic::ppc_altivec_stvxl:
     // Turn stvx -> store if the pointer is known aligned.
@@ -624,6 +631,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return new StoreInst(II->getArgOperand(0), Ptr);
     }
     break;
+  case Intrinsic::ppc_vsx_stxvw4x:
+  case Intrinsic::ppc_vsx_stxvd2x: {
+    // Turn PPC VSX stores into normal stores.
+    Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
+    Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+    return new StoreInst(II->getArgOperand(0), Ptr, false, 1);
+  }
   case Intrinsic::x86_sse_storeu_ps:
   case Intrinsic::x86_sse2_storeu_pd:
   case Intrinsic::x86_sse2_storeu_dq:
diff --git a/test/CodeGen/PowerPC/vsx-fma-m.ll b/test/CodeGen/PowerPC/vsx-fma-m.ll
index da4a20481e6..9dff9a755ad 100644
--- a/test/CodeGen/PowerPC/vsx-fma-m.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-m.ll
@@ -177,21 +177,27 @@ entry:
   store <2 x double> %1, <2 x double>* %arrayidx3, align 8
   ret void
 
+; Note: There is some unavoidable changeability in this variant. If the
+; FMAs are reordered differently, the algorithm can pick a different
+; multiplicand to destroy, changing the register assignment. There isn't
+; a good way to express this possibility, so hopefully this doesn't change
+; too often.
+
 ; CHECK-LABEL: @testv3
 ; CHECK-DAG: xxlor [[V1:[0-9]+]], 34, 34
-; CHECK-DAG: xvmaddmdp 37, 35, 34
 ; CHECK-DAG: li [[C1:[0-9]+]], 48
 ; CHECK-DAG: li [[C2:[0-9]+]], 32
-; CHECK-DAG: xvmaddadp 34, 35, 38
+; CHECK-DAG: xvmaddmdp 37, 35, 34
 ; CHECK-DAG: li [[C3:[0-9]+]], 16
 
 ; Note: We could convert this next FMA to M-type as well, but it would require
 ; re-ordering the instructions.
 ; CHECK-DAG: xvmaddadp [[V1]], 35, 36
-; CHECK-DAG: xvmaddmdp 35, 36, 37
+; CHECK-DAG: xvmaddmdp 36, 35, 37
+; CHECK-DAG: xvmaddadp 34, 35, 38
 ; CHECK-DAG: stxvd2x 32, 0, 3
-; CHECK-DAG: stxvd2x 35, 3, [[C1]]
+; CHECK-DAG: stxvd2x 36, 3, [[C1]]
 ; CHECK-DAG: stxvd2x 34, 3, [[C2]]
 ; CHECK-DAG: stxvd2x 37, 3, [[C3]]
 ; CHECK: blr
diff --git a/test/CodeGen/PowerPC/vsx-ldst.ll b/test/CodeGen/PowerPC/vsx-ldst.ll
new file mode 100644
index 00000000000..0c9ebef8757
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-ldst.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64-unknown-linux-gnu < %s > %t
+; RUN: grep lxvw4x < %t | count 3
+; RUN: grep lxvd2x < %t | count 3
+; RUN: grep stxvw4x < %t | count 3
+; RUN: grep stxvd2x < %t | count 3
+
+@vsi = global <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, align 16
+@vui = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
+@vsll = global <2 x i64> <i64 255, i64 -937>, align 16
+@vull = global <2 x i64> <i64 1447, i64 2894>, align 16
+@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
+@res_vsi = common global <4 x i32> zeroinitializer, align 16
+@res_vui = common global <4 x i32> zeroinitializer, align 16
+@res_vf = common global <4 x float> zeroinitializer, align 16
+@res_vsll = common global <2 x i64> zeroinitializer, align 16
+@res_vull = common global <2 x i64> zeroinitializer, align 16
+@res_vd = common global <2 x double> zeroinitializer, align 16
+
+; Function Attrs: nounwind
+define void @test1() {
+entry:
+  %0 = load <4 x i32>* @vsi, align 16
+  %1 = load <4 x i32>* @vui, align 16
+  %2 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 16
+  %3 = load <2 x double>* bitcast (<2 x i64>* @vsll to <2 x double>*), align 16
+  %4 = load <2 x double>* bitcast (<2 x i64>* @vull to <2 x double>*), align 16
+  %5 = load <2 x double>* @vd, align 16
+  store <4 x i32> %0, <4 x i32>* @res_vsi, align 16
+  store <4 x i32> %1, <4 x i32>* @res_vui, align 16
+  store <4 x i32> %2, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 16
+  store <2 x double> %3, <2 x double>* bitcast (<2 x i64>* @res_vsll to <2 x double>*), align 16
+  store <2 x double> %4, <2 x double>* bitcast (<2 x i64>* @res_vull to <2 x double>*), align 16
+  store <2 x double> %5, <2 x double>* @res_vd, align 16
+  ret void
+}
diff --git a/test/Transforms/InstCombine/vsx-unaligned.ll b/test/Transforms/InstCombine/vsx-unaligned.ll
new file mode 100644
index 00000000000..26e04268f44
--- /dev/null
+++ b/test/Transforms/InstCombine/vsx-unaligned.ll
@@ -0,0 +1,44 @@
+; Verify that we can create unaligned loads and stores from VSX intrinsics.
+
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target triple = "powerpc64-unknown-linux-gnu"
+
+@vf = common global <4 x float> zeroinitializer, align 1
+@res_vf = common global <4 x float> zeroinitializer, align 1
+@vd = common global <2 x double> zeroinitializer, align 1
+@res_vd = common global <2 x double> zeroinitializer, align 1
+
+define void @test1() {
+entry:
+  %t1 = alloca <4 x float>*, align 8
+  %t2 = alloca <2 x double>*, align 8
+  store <4 x float>* @vf, <4 x float>** %t1, align 8
+  %0 = load <4 x float>** %t1, align 8
+  %1 = bitcast <4 x float>* %0 to i8*
+  %2 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %1)
+  store <4 x float>* @res_vf, <4 x float>** %t1, align 8
+  %3 = load <4 x float>** %t1, align 8
+  %4 = bitcast <4 x float>* %3 to i8*
+  call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %2, i8* %4)
+  store <2 x double>* @vd, <2 x double>** %t2, align 8
+  %5 = load <2 x double>** %t2, align 8
+  %6 = bitcast <2 x double>* %5 to i8*
+  %7 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %6)
+  store <2 x double>* @res_vd, <2 x double>** %t2, align 8
+  %8 = load <2 x double>** %t2, align 8
+  %9 = bitcast <2 x double>* %8 to i8*
+  call void @llvm.ppc.vsx.stxvd2x(<2 x double> %7, i8* %9)
+  ret void
+}
+
+; CHECK-LABEL: @test1
+; CHECK: %0 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 1
+; CHECK: store <4 x i32> %0, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 1
+; CHECK: %1 = load <2 x double>* @vd, align 1
+; CHECK: store <2 x double> %1, <2 x double>* @res_vd, align 1
+
+declare <4 x i32> @llvm.ppc.vsx.lxvw4x(i8*)
+declare void @llvm.ppc.vsx.stxvw4x(<4 x i32>, i8*)
+declare <2 x double> @llvm.ppc.vsx.lxvd2x(i8*)
+declare void @llvm.ppc.vsx.stxvd2x(<2 x double>, i8*)
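
For reference, a minimal standalone IR sketch, not part of the patch itself (the function @copy_v2f64 and its arguments are invented for illustration): it shows the intended direct use of the new intrinsics on pointers with no particular alignment. With this patch, -instcombine rewrites such calls into ordinary load/store instructions with align 1 (as checked in vsx-unaligned.ll above), and the new patterns in PPCInstrVSX.td should then select lxvd2x/stxvd2x for them when VSX is available.

; Illustrative sketch only (not part of this patch): copy a <2 x double>
; between two possibly-unaligned buffers through the new VSX intrinsics.
define void @copy_v2f64(i8* %src, i8* %dst) {
entry:
  %v = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %src)
  call void @llvm.ppc.vsx.stxvd2x(<2 x double> %v, i8* %dst)
  ret void
}

declare <2 x double> @llvm.ppc.vsx.lxvd2x(i8*)
declare void @llvm.ppc.vsx.stxvd2x(<2 x double>, i8*)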