[PowerPC] Add vec_vsx_ld and vec_vsx_st intrinsics

This patch enables the vec_vsx_ld and vec_vsx_st intrinsics for
PowerPC, which provide programmer access to the lxvd2x, lxvw4x,
stxvd2x, and stxvw4x instructions.

New LLVM intrinsics are provided to represent these four instructions
in IntrinsicsPowerPC.td.  These are patterned after the similar
intrinsics for lvx and stvx (Altivec).  In PPCInstrVSX.td, these
intrinsics are tied to the code gen patterns, with additional patterns
to allow plain vanilla loads and stores to still generate these
instructions.

At -O1 and higher the intrinsics are immediately converted to loads
and stores in InstCombineCalls.cpp.  This will open up more
optimization opportunities while still allowing the correct
instructions to be generated.  (Similar code exists for aligned
Altivec loads and stores.)

The new intrinsics are added to the code that checks for consecutive
loads and stores in PPCISelLowering.cpp, as well as to
PPCTargetLowering::getTgtMemIntrinsic().

There's a new test to verify the correct instructions are generated.
The loads and stores tend to be reordered, so the test just counts
their number.  It runs at -O2, as it's not very effective to test this
at -O0, when many unnecessary loads and stores are generated.

I ended up having to modify vsx-fma-m.ll.  It turns out this test case
is slightly unreliable, but I don't know a good way to prevent
problems with it.  The xvmaddmdp instructions read and write the same
register, which is one of the multiplicands.  Commutativity allows
either to be chosen.  If the FMAs are reordered differently than
expected by the test, the register assignment can be different as a
result.  Hopefully this doesn't change often.

There is a companion patch for Clang.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@221767 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Bill Schmidt 2014-11-12 04:19:40 +00:00
parent d0518569ec
commit fc22bfd921
7 changed files with 145 additions and 12 deletions

View File

@ -516,6 +516,18 @@ def int_ppc_altivec_vrsqrtefp : PowerPC_Vec_FF_Intrinsic<"vrsqrtefp">;
let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.".
// Vector load.
def int_ppc_vsx_lxvw4x :
Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_ppc_vsx_lxvd2x :
Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
// Vector store.
def int_ppc_vsx_stxvw4x :
Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
def int_ppc_vsx_stxvd2x :
Intrinsic<[], [llvm_v2f64_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
// Vector maximum.
def int_ppc_vsx_xvmaxdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmaxdp">;
def int_ppc_vsx_xvmaxsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvmaxsp">;

View File

@ -7583,8 +7583,12 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
default: return false;
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_vsx_lxvw4x:
VT = MVT::v4i32;
break;
case Intrinsic::ppc_vsx_lxvd2x:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_altivec_lvebx:
VT = MVT::i8;
break;
@ -7605,8 +7609,12 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
default: return false;
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
case Intrinsic::ppc_vsx_stxvw4x:
VT = MVT::v4i32;
break;
case Intrinsic::ppc_vsx_stxvd2x:
VT = MVT::v2f64;
break;
case Intrinsic::ppc_altivec_stvebx:
VT = MVT::i8;
break;
@ -9094,7 +9102,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_altivec_lvebx:
case Intrinsic::ppc_altivec_lvehx:
case Intrinsic::ppc_altivec_lvewx: {
case Intrinsic::ppc_altivec_lvewx:
case Intrinsic::ppc_vsx_lxvd2x:
case Intrinsic::ppc_vsx_lxvw4x: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_altivec_lvebx:
@ -9106,6 +9116,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::ppc_altivec_lvewx:
VT = MVT::i32;
break;
case Intrinsic::ppc_vsx_lxvd2x:
VT = MVT::v2f64;
break;
default:
VT = MVT::v4i32;
break;
@ -9126,7 +9139,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::ppc_altivec_stvxl:
case Intrinsic::ppc_altivec_stvebx:
case Intrinsic::ppc_altivec_stvehx:
case Intrinsic::ppc_altivec_stvewx: {
case Intrinsic::ppc_altivec_stvewx:
case Intrinsic::ppc_vsx_stxvd2x:
case Intrinsic::ppc_vsx_stxvw4x: {
EVT VT;
switch (Intrinsic) {
case Intrinsic::ppc_altivec_stvebx:
@ -9138,6 +9153,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::ppc_altivec_stvewx:
VT = MVT::i32;
break;
case Intrinsic::ppc_vsx_stxvd2x:
VT = MVT::v2f64;
break;
default:
VT = MVT::v4i32;
break;

View File

@ -55,7 +55,7 @@ let Uses = [RM] in {
def LXVD2X : XX1Form<31, 844,
(outs vsrc:$XT), (ins memrr:$src),
"lxvd2x $XT, $src", IIC_LdStLFD,
[(set v2f64:$XT, (load xoaddr:$src))]>;
[(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
def LXVDSX : XX1Form<31, 332,
(outs vsrc:$XT), (ins memrr:$src),
@ -64,7 +64,7 @@ let Uses = [RM] in {
def LXVW4X : XX1Form<31, 780,
(outs vsrc:$XT), (ins memrr:$src),
"lxvw4x $XT, $src", IIC_LdStLFD,
[(set v4i32:$XT, (load xoaddr:$src))]>;
[(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>;
}
// Store indexed instructions
@ -77,12 +77,12 @@ let Uses = [RM] in {
def STXVD2X : XX1Form<31, 972,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvd2x $XT, $dst", IIC_LdStSTFD,
[(store v2f64:$XT, xoaddr:$dst)]>;
[(int_ppc_vsx_stxvd2x v2f64:$XT, xoaddr:$dst)]>;
def STXVW4X : XX1Form<31, 908,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvw4x $XT, $dst", IIC_LdStSTFD,
[(store v4i32:$XT, xoaddr:$dst)]>;
[(int_ppc_vsx_stxvw4x v4i32:$XT, xoaddr:$dst)]>;
}
// Add/Mul Instructions
@ -851,11 +851,14 @@ def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
(XVCVSXWDP (XXSLDWI $C, $C, 1))>;
// Loads.
def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
// Stores.
def : Pat<(store v4i32:$rS, xoaddr:$dst),
(STXVW4X $rS, xoaddr:$dst)>;
def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
// Selects.
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),

View File

@ -613,6 +613,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return new LoadInst(Ptr);
}
break;
case Intrinsic::ppc_vsx_lxvw4x:
case Intrinsic::ppc_vsx_lxvd2x: {
// Turn PPC VSX loads into normal loads.
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(II->getType()));
return new LoadInst(Ptr, Twine(""), false, 1);
}
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
// Turn stvx -> store if the pointer is known aligned.
@ -624,6 +631,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return new StoreInst(II->getArgOperand(0), Ptr);
}
break;
case Intrinsic::ppc_vsx_stxvw4x:
case Intrinsic::ppc_vsx_stxvd2x: {
// Turn PPC VSX stores into normal stores.
Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
return new StoreInst(II->getArgOperand(0), Ptr, false, 1);
}
case Intrinsic::x86_sse_storeu_ps:
case Intrinsic::x86_sse2_storeu_pd:
case Intrinsic::x86_sse2_storeu_dq:

View File

@ -177,21 +177,27 @@ entry:
store <2 x double> %1, <2 x double>* %arrayidx3, align 8
ret void
; Note: There is some unavoidable changeability in this variant. If the
; FMAs are reordered differently, the algorithm can pick a different
; multiplicand to destroy, changing the register assignment. There isn't
; a good way to express this possibility, so hopefully this doesn't change
; too often.
; CHECK-LABEL: @testv3
; CHECK-DAG: xxlor [[V1:[0-9]+]], 34, 34
; CHECK-DAG: xvmaddmdp 37, 35, 34
; CHECK-DAG: li [[C1:[0-9]+]], 48
; CHECK-DAG: li [[C2:[0-9]+]], 32
; CHECK-DAG: xvmaddadp 34, 35, 38
; CHECK-DAG: xvmaddmdp 37, 35, 34
; CHECK-DAG: li [[C3:[0-9]+]], 16
; Note: We could convert this next FMA to M-type as well, but it would require
; re-ordering the instructions.
; CHECK-DAG: xvmaddadp [[V1]], 35, 36
; CHECK-DAG: xvmaddmdp 35, 36, 37
; CHECK-DAG: xvmaddmdp 36, 35, 37
; CHECK-DAG: xvmaddadp 34, 35, 38
; CHECK-DAG: stxvd2x 32, 0, 3
; CHECK-DAG: stxvd2x 35, 3, [[C1]]
; CHECK-DAG: stxvd2x 36, 3, [[C1]]
; CHECK-DAG: stxvd2x 34, 3, [[C2]]
; CHECK-DAG: stxvd2x 37, 3, [[C3]]
; CHECK: blr

View File

@ -0,0 +1,36 @@
; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64-unknown-linux-gnu < %s > %t
; RUN: grep lxvw4x < %t | count 3
; RUN: grep lxvd2x < %t | count 3
; RUN: grep stxvw4x < %t | count 3
; RUN: grep stxvd2x < %t | count 3
@vsi = global <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, align 16
@vui = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
@vsll = global <2 x i64> <i64 255, i64 -937>, align 16
@vull = global <2 x i64> <i64 1447, i64 2894>, align 16
@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
@res_vsi = common global <4 x i32> zeroinitializer, align 16
@res_vui = common global <4 x i32> zeroinitializer, align 16
@res_vf = common global <4 x float> zeroinitializer, align 16
@res_vsll = common global <2 x i64> zeroinitializer, align 16
@res_vull = common global <2 x i64> zeroinitializer, align 16
@res_vd = common global <2 x double> zeroinitializer, align 16
; Function Attrs: nounwind
define void @test1() {
entry:
%0 = load <4 x i32>* @vsi, align 16
%1 = load <4 x i32>* @vui, align 16
%2 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 16
%3 = load <2 x double>* bitcast (<2 x i64>* @vsll to <2 x double>*), align 16
%4 = load <2 x double>* bitcast (<2 x i64>* @vull to <2 x double>*), align 16
%5 = load <2 x double>* @vd, align 16
store <4 x i32> %0, <4 x i32>* @res_vsi, align 16
store <4 x i32> %1, <4 x i32>* @res_vui, align 16
store <4 x i32> %2, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 16
store <2 x double> %3, <2 x double>* bitcast (<2 x i64>* @res_vsll to <2 x double>*), align 16
store <2 x double> %4, <2 x double>* bitcast (<2 x i64>* @res_vull to <2 x double>*), align 16
store <2 x double> %5, <2 x double>* @res_vd, align 16
ret void
}

View File

@ -0,0 +1,44 @@
; Verify that we can create unaligned loads and stores from VSX intrinsics.
; RUN: opt < %s -instcombine -S | FileCheck %s
target triple = "powerpc64-unknown-linux-gnu"
@vf = common global <4 x float> zeroinitializer, align 1
@res_vf = common global <4 x float> zeroinitializer, align 1
@vd = common global <2 x double> zeroinitializer, align 1
@res_vd = common global <2 x double> zeroinitializer, align 1
define void @test1() {
entry:
%t1 = alloca <4 x float>*, align 8
%t2 = alloca <2 x double>*, align 8
store <4 x float>* @vf, <4 x float>** %t1, align 8
%0 = load <4 x float>** %t1, align 8
%1 = bitcast <4 x float>* %0 to i8*
%2 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %1)
store <4 x float>* @res_vf, <4 x float>** %t1, align 8
%3 = load <4 x float>** %t1, align 8
%4 = bitcast <4 x float>* %3 to i8*
call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %2, i8* %4)
store <2 x double>* @vd, <2 x double>** %t2, align 8
%5 = load <2 x double>** %t2, align 8
%6 = bitcast <2 x double>* %5 to i8*
%7 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %6)
store <2 x double>* @res_vd, <2 x double>** %t2, align 8
%8 = load <2 x double>** %t2, align 8
%9 = bitcast <2 x double>* %8 to i8*
call void @llvm.ppc.vsx.stxvd2x(<2 x double> %7, i8* %9)
ret void
}
; CHECK-LABEL: @test1
; CHECK: %0 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 1
; CHECK: store <4 x i32> %0, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 1
; CHECK: %1 = load <2 x double>* @vd, align 1
; CHECK: store <2 x double> %1, <2 x double>* @res_vd, align 1
declare <4 x i32> @llvm.ppc.vsx.lxvw4x(i8*)
declare void @llvm.ppc.vsx.stxvw4x(<4 x i32>, i8*)
declare <2 x double> @llvm.ppc.vsx.lxvd2x(i8*)
declare void @llvm.ppc.vsx.stxvd2x(<2 x double>, i8*)