Implement Neon VST[234] operations.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@78330 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Bob Wilson 2009-08-06 18:47:44 +00:00
parent 004f7c7049
commit b36ec86c01
8 changed files with 290 additions and 31 deletions

View File

@ -1306,11 +1306,11 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
}
case ARMISD::VLD2D: {
MVT VT = Op.getValueType();
SDValue MemAddr, MemUpdate, MemOpc;
if (!SelectAddrMode6(Op, N->getOperand(1), MemAddr, MemUpdate, MemOpc))
return NULL;
unsigned Opc;
unsigned Opc = 0;
MVT VT = Op.getValueType();
switch (VT.getSimpleVT()) {
default: llvm_unreachable("unhandled VLD2D type");
case MVT::v8i8: Opc = ARM::VLD2d8; break;
@ -1323,11 +1323,11 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
}
case ARMISD::VLD3D: {
MVT VT = Op.getValueType();
SDValue MemAddr, MemUpdate, MemOpc;
if (!SelectAddrMode6(Op, N->getOperand(1), MemAddr, MemUpdate, MemOpc))
return NULL;
unsigned Opc;
unsigned Opc = 0;
MVT VT = Op.getValueType();
switch (VT.getSimpleVT()) {
default: llvm_unreachable("unhandled VLD3D type");
case MVT::v8i8: Opc = ARM::VLD3d8; break;
@ -1340,11 +1340,11 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
}
case ARMISD::VLD4D: {
MVT VT = Op.getValueType();
SDValue MemAddr, MemUpdate, MemOpc;
if (!SelectAddrMode6(Op, N->getOperand(1), MemAddr, MemUpdate, MemOpc))
return NULL;
unsigned Opc;
unsigned Opc = 0;
MVT VT = Op.getValueType();
switch (VT.getSimpleVT()) {
default: llvm_unreachable("unhandled VLD4D type");
case MVT::v8i8: Opc = ARM::VLD4d8; break;
@ -1357,6 +1357,59 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
ResTys.push_back(MVT::Other);
return CurDAG->getTargetNode(Opc, dl, ResTys, Ops, 3);
}
case ARMISD::VST2D: {
  // Select a NEON VST2 that stores two D registers.
  // Operands of the VST2D node: 0 = chain, 1 = address, 2..3 = vectors.
  SDValue MemAddr, MemUpdate, MemOpc;
  if (!SelectAddrMode6(Op, N->getOperand(1), MemAddr, MemUpdate, MemOpc))
    return NULL;
  // The type of the first stored vector picks the instruction; v2f32 and
  // v2i32 both map to the 32-bit element form.
  unsigned Opc = 0;
  switch (N->getOperand(2).getValueType().getSimpleVT()) {
  default: llvm_unreachable("unhandled VST2D type");
  case MVT::v8i8: Opc = ARM::VST2d8; break;
  case MVT::v4i16: Opc = ARM::VST2d16; break;
  case MVT::v2f32:
  case MVT::v2i32: Opc = ARM::VST2d32; break;
  }
  // Pass the incoming chain through as the last operand. Without it the
  // selected store has no dependency on preceding memory operations and
  // could be scheduled ahead of them.
  SDValue Chain = N->getOperand(0);
  const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc,
                          N->getOperand(2), N->getOperand(3), Chain };
  return CurDAG->getTargetNode(Opc, dl, MVT::Other, Ops, 6);
}
case ARMISD::VST3D: {
  // Select a NEON VST3 that stores three D registers.
  // Operands of the VST3D node: 0 = chain, 1 = address, 2..4 = vectors.
  SDValue MemAddr, MemUpdate, MemOpc;
  if (!SelectAddrMode6(Op, N->getOperand(1), MemAddr, MemUpdate, MemOpc))
    return NULL;
  // The type of the first stored vector picks the instruction; v2f32 and
  // v2i32 both map to the 32-bit element form.
  unsigned Opc = 0;
  switch (N->getOperand(2).getValueType().getSimpleVT()) {
  default: llvm_unreachable("unhandled VST3D type");
  case MVT::v8i8: Opc = ARM::VST3d8; break;
  case MVT::v4i16: Opc = ARM::VST3d16; break;
  case MVT::v2f32:
  case MVT::v2i32: Opc = ARM::VST3d32; break;
  }
  // Pass the incoming chain through as the last operand. Without it the
  // selected store has no dependency on preceding memory operations and
  // could be scheduled ahead of them.
  SDValue Chain = N->getOperand(0);
  const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc,
                          N->getOperand(2), N->getOperand(3),
                          N->getOperand(4), Chain };
  return CurDAG->getTargetNode(Opc, dl, MVT::Other, Ops, 7);
}
case ARMISD::VST4D: {
  // Select a NEON VST4 that stores four D registers.
  // Operands of the VST4D node: 0 = chain, 1 = address, 2..5 = vectors.
  SDValue MemAddr, MemUpdate, MemOpc;
  if (!SelectAddrMode6(Op, N->getOperand(1), MemAddr, MemUpdate, MemOpc))
    return NULL;
  // The type of the first stored vector picks the instruction; v2f32 and
  // v2i32 both map to the 32-bit element form.
  unsigned Opc = 0;
  switch (N->getOperand(2).getValueType().getSimpleVT()) {
  default: llvm_unreachable("unhandled VST4D type");
  case MVT::v8i8: Opc = ARM::VST4d8; break;
  case MVT::v4i16: Opc = ARM::VST4d16; break;
  case MVT::v2f32:
  case MVT::v2i32: Opc = ARM::VST4d32; break;
  }
  // Pass the incoming chain through as the last operand. Without it the
  // selected store has no dependency on preceding memory operations and
  // could be scheduled ahead of them.
  SDValue Chain = N->getOperand(0);
  const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc,
                          N->getOperand(2), N->getOperand(3),
                          N->getOperand(4), N->getOperand(5), Chain };
  return CurDAG->getTargetNode(Opc, dl, MVT::Other, Ops, 8);
}
}
return SelectCode(Op);

View File

@ -323,6 +323,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::SETCC, MVT::i32, Expand);
setOperationAction(ISD::SETCC, MVT::f32, Expand);
@ -466,6 +467,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VLD2D: return "ARMISD::VLD2D";
case ARMISD::VLD3D: return "ARMISD::VLD3D";
case ARMISD::VLD4D: return "ARMISD::VLD4D";
case ARMISD::VST2D: return "ARMISD::VST2D";
case ARMISD::VST3D: return "ARMISD::VST3D";
case ARMISD::VST4D: return "ARMISD::VST4D";
}
}
@ -1325,6 +1329,23 @@ static SDValue LowerNeonVLDIntrinsic(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(Opcode, dl, Node->getVTList(), Ops, 2);
}
/// LowerNeonVSTIntrinsic - Turn a NEON vstN intrinsic node into the
/// target-specific store node \p Opcode.
///
/// The intrinsic node's operands are read as: 0 = chain, 1 = intrinsic ID,
/// 2 = address, 3.. = the \p NumVecs vectors to store. The new node drops the
/// intrinsic ID and keeps the rest in order. Only 64-bit vector types are
/// handled; for anything else an empty SDValue is returned.
static SDValue LowerNeonVSTIntrinsic(SDValue Op, SelectionDAG &DAG,
                                     unsigned Opcode, unsigned NumVecs) {
  SDNode *Intrin = Op.getNode();

  // The first stored vector decides whether this form is supported.
  if (!Intrin->getOperand(3).getValueType().is64BitVector())
    return SDValue(); // unimplemented

  SmallVector<SDValue, 6> NewOps;
  NewOps.push_back(Intrin->getOperand(0));
  NewOps.push_back(Intrin->getOperand(2));
  for (unsigned Idx = 0; Idx != NumVecs; ++Idx)
    NewOps.push_back(Intrin->getOperand(3 + Idx));

  return DAG.getNode(Opcode, Op.getDebugLoc(), MVT::Other,
                     NewOps.data(), NewOps.size());
}
SDValue
ARMTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
@ -1340,10 +1361,13 @@ ARMTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
return LowerNeonVLDIntrinsic(Op, DAG, ARMISD::VLD4D);
case Intrinsic::arm_neon_vst2i:
case Intrinsic::arm_neon_vst2f:
return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST2D, 2);
case Intrinsic::arm_neon_vst3i:
case Intrinsic::arm_neon_vst3f:
return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST3D, 3);
case Intrinsic::arm_neon_vst4i:
case Intrinsic::arm_neon_vst4f:
return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST4D, 4);
default: return SDValue(); // Don't custom lower most intrinsics.
}
}
@ -2381,6 +2405,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
case ISD::RETURNADDR: break;
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::BIT_CONVERT: return ExpandBIT_CONVERT(Op.getNode(), DAG);

View File

@ -119,7 +119,10 @@ namespace llvm {
// Vector load/store with (de)interleaving
VLD2D,
VLD3D,
VLD4D
VLD4D,
VST2D,
VST3D,
VST4D
};
}

View File

@ -81,6 +81,20 @@ def NEONvld3d : SDNode<"ARMISD::VLD3D", SDTARMVLD3,
def NEONvld4d : SDNode<"ARMISD::VLD4D", SDTARMVLD4,
[SDNPHasChain, SDNPMayLoad]>;
// Type profiles for the interleaving vector stores: no results, and N+1
// operands where operand 0 is the pointer and operands 1..N are the vectors
// being stored, all constrained to the same type as operand 1.
def SDTARMVST2 : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>]>;
def SDTARMVST3 : SDTypeProfile<0, 4, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>,
SDTCisSameAs<1, 3>]>;
def SDTARMVST4 : SDTypeProfile<0, 5, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>,
SDTCisSameAs<1, 3>,
SDTCisSameAs<1, 4>]>;
// DAG nodes for ARMISD::VST2D/VST3D/VST4D; each is chained and marked as
// possibly writing memory.
def NEONvst2d : SDNode<"ARMISD::VST2D", SDTARMVST2,
[SDNPHasChain, SDNPMayStore]>;
def NEONvst3d : SDNode<"ARMISD::VST3D", SDTARMVST3,
[SDNPHasChain, SDNPMayStore]>;
def NEONvst4d : SDNode<"ARMISD::VST4D", SDTARMVST4,
[SDNPHasChain, SDNPMayStore]>;
//===----------------------------------------------------------------------===//
// NEON operand definitions
//===----------------------------------------------------------------------===//
@ -172,30 +186,6 @@ def VLD1q32 : VLD1Q<"vld1.32", v4i32, int_arm_neon_vld1i>;
def VLD1qf : VLD1Q<"vld1.32", v4f32, int_arm_neon_vld1f>;
def VLD1q64 : VLD1Q<"vld1.64", v2i64, int_arm_neon_vld1i>;
// VST1 : Vector Store (multiple single elements)
class VST1D<string OpcodeStr, ValueType Ty, Intrinsic IntOp>
: NLdSt<(outs), (ins addrmode6:$addr, DPR:$src),
NoItinerary,
!strconcat(OpcodeStr, "\t\\{$src\\}, $addr"),
[(IntOp addrmode6:$addr, (Ty DPR:$src))]>;
class VST1Q<string OpcodeStr, ValueType Ty, Intrinsic IntOp>
: NLdSt<(outs), (ins addrmode6:$addr, QPR:$src),
NoItinerary,
!strconcat(OpcodeStr, "\t${src:dregpair}, $addr"),
[(IntOp addrmode6:$addr, (Ty QPR:$src))]>;
def VST1d8 : VST1D<"vst1.8", v8i8, int_arm_neon_vst1i>;
def VST1d16 : VST1D<"vst1.16", v4i16, int_arm_neon_vst1i>;
def VST1d32 : VST1D<"vst1.32", v2i32, int_arm_neon_vst1i>;
def VST1df : VST1D<"vst1.32", v2f32, int_arm_neon_vst1f>;
def VST1d64 : VST1D<"vst1.64", v1i64, int_arm_neon_vst1i>;
def VST1q8 : VST1Q<"vst1.8", v16i8, int_arm_neon_vst1i>;
def VST1q16 : VST1Q<"vst1.16", v8i16, int_arm_neon_vst1i>;
def VST1q32 : VST1Q<"vst1.32", v4i32, int_arm_neon_vst1i>;
def VST1qf : VST1Q<"vst1.32", v4f32, int_arm_neon_vst1f>;
def VST1q64 : VST1Q<"vst1.64", v2i64, int_arm_neon_vst1i>;
// VLD2 : Vector Load (multiple 2-element structures)
class VLD2D<string OpcodeStr>
: NLdSt<(outs DPR:$dst1, DPR:$dst2), (ins addrmode6:$addr),
@ -227,6 +217,59 @@ def VLD4d8 : VLD4D<"vld4.8">;
def VLD4d16 : VLD4D<"vld4.16">;
def VLD4d32 : VLD4D<"vld4.32">;
// VST1 : Vector Store (multiple single elements)
// Unlike VST2/3/4, these carry a selection pattern that matches the vst1
// intrinsic directly: (IntOp addr, value).
// VST1D stores one D register, printed as "{$src}".
class VST1D<string OpcodeStr, ValueType Ty, Intrinsic IntOp>
: NLdSt<(outs), (ins addrmode6:$addr, DPR:$src),
NoItinerary,
!strconcat(OpcodeStr, "\t\\{$src\\}, $addr"),
[(IntOp addrmode6:$addr, (Ty DPR:$src))]>;
// VST1Q stores one Q register, printed through the "dregpair" operand
// modifier (presumably the pair of D registers that make up the Q register).
class VST1Q<string OpcodeStr, ValueType Ty, Intrinsic IntOp>
: NLdSt<(outs), (ins addrmode6:$addr, QPR:$src),
NoItinerary,
!strconcat(OpcodeStr, "\t${src:dregpair}, $addr"),
[(IntOp addrmode6:$addr, (Ty QPR:$src))]>;
// 64-bit (D register) variants; integer types use the vst1i intrinsic and
// v2f32 uses vst1f.
def VST1d8 : VST1D<"vst1.8", v8i8, int_arm_neon_vst1i>;
def VST1d16 : VST1D<"vst1.16", v4i16, int_arm_neon_vst1i>;
def VST1d32 : VST1D<"vst1.32", v2i32, int_arm_neon_vst1i>;
def VST1df : VST1D<"vst1.32", v2f32, int_arm_neon_vst1f>;
def VST1d64 : VST1D<"vst1.64", v1i64, int_arm_neon_vst1i>;
// 128-bit (Q register) variants; integer types use the vst1i intrinsic and
// v4f32 uses vst1f.
def VST1q8 : VST1Q<"vst1.8", v16i8, int_arm_neon_vst1i>;
def VST1q16 : VST1Q<"vst1.16", v8i16, int_arm_neon_vst1i>;
def VST1q32 : VST1Q<"vst1.32", v4i32, int_arm_neon_vst1i>;
def VST1qf : VST1Q<"vst1.32", v4f32, int_arm_neon_vst1f>;
def VST1q64 : VST1Q<"vst1.64", v2i64, int_arm_neon_vst1i>;
// VST2 : Vector Store (multiple 2-element structures)
// Takes an addrmode6 address followed by two D-register sources. The pattern
// list is empty, so nothing here selects this instruction automatically; the
// ARMISD::VST2D case in ARMDAGToDAGISel::Select picks the opcode by hand.
class VST2D<string OpcodeStr>
: NLdSt<(outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2), NoItinerary,
!strconcat(OpcodeStr, "\t\\{$src1,$src2\\}, $addr"), []>;
// One def per element size; there is no separate floating-point variant.
def VST2d8 : VST2D<"vst2.8">;
def VST2d16 : VST2D<"vst2.16">;
def VST2d32 : VST2D<"vst2.32">;
// VST3 : Vector Store (multiple 3-element structures)
// Same shape as VST2D with three D-register sources; also pattern-less.
class VST3D<string OpcodeStr>
: NLdSt<(outs), (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3),
NoItinerary,
!strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3\\}, $addr"), []>;
def VST3d8 : VST3D<"vst3.8">;
def VST3d16 : VST3D<"vst3.16">;
def VST3d32 : VST3D<"vst3.32">;
// VST4 : Vector Store (multiple 4-element structures)
// Same shape as VST2D with four D-register sources; also pattern-less.
class VST4D<string OpcodeStr>
: NLdSt<(outs), (ins addrmode6:$addr,
DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4), NoItinerary,
!strconcat(OpcodeStr, "\t\\{$src1,$src2,$src3,$src4\\}, $addr"), []>;
def VST4d8 : VST4D<"vst4.8">;
def VST4d16 : VST4D<"vst4.16">;
def VST4d32 : VST4D<"vst4.32">;
//===----------------------------------------------------------------------===//
// NEON pattern fragments

View File

@ -62,6 +62,27 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd,
FirstOpnd = 0;
NumRegs = 4;
return true;
// VST2/VST3/VST4: the register list starts at operand 3, after the three
// operands the addrmode6 address contributes (the instructions are built
// with MemAddr, MemUpdate, MemOpc first). NumRegs is the number of D
// registers the instruction stores.
case ARM::VST2d8:
case ARM::VST2d16:
case ARM::VST2d32:
FirstOpnd = 3;
NumRegs = 2;
return true;
case ARM::VST3d8:
case ARM::VST3d16:
case ARM::VST3d32:
FirstOpnd = 3;
NumRegs = 3;
return true;
case ARM::VST4d8:
case ARM::VST4d16:
case ARM::VST4d32:
FirstOpnd = 3;
NumRegs = 4;
return true;
}
return false;

38
test/CodeGen/ARM/vst2.ll Normal file
View File

@ -0,0 +1,38 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s
; Codegen test for the NEON vst2 intrinsics on 64-bit vectors. Each function
; loads one vector from %B and passes it as both source operands, then the
; output is checked for the vst2 mnemonic of the matching element size.
; NOTE(review): the i16/i32/float functions pass a typed pointer (%A) while
; the declarations at the bottom take i8* -- confirm the assembler accepts
; this, or bitcast %A to i8* first.
; 8-bit elements.
define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst2i8:
;CHECK: vst2.8
%tmp1 = load <8 x i8>* %B
call void @llvm.arm.neon.vst2i.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1)
ret void
}
; 16-bit elements.
define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst2i16:
;CHECK: vst2.16
%tmp1 = load <4 x i16>* %B
call void @llvm.arm.neon.vst2i.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1)
ret void
}
; 32-bit integer elements.
define void @vst2i32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst2i32:
;CHECK: vst2.32
%tmp1 = load <2 x i32>* %B
call void @llvm.arm.neon.vst2i.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1)
ret void
}
; 32-bit float elements; expected to use the same vst2.32 form as v2i32.
define void @vst2f(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst2f:
;CHECK: vst2.32
%tmp1 = load <2 x float>* %B
call void @llvm.arm.neon.vst2f.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1)
ret void
}
; Intrinsic declarations: pointer first, then the two vectors to interleave.
declare void @llvm.arm.neon.vst2i.v8i8(i8*, <8 x i8>, <8 x i8>) nounwind
declare void @llvm.arm.neon.vst2i.v4i16(i8*, <4 x i16>, <4 x i16>) nounwind
declare void @llvm.arm.neon.vst2i.v2i32(i8*, <2 x i32>, <2 x i32>) nounwind
declare void @llvm.arm.neon.vst2f.v2f32(i8*, <2 x float>, <2 x float>) nounwind

38
test/CodeGen/ARM/vst3.ll Normal file
View File

@ -0,0 +1,38 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s
; Codegen test for the NEON vst3 intrinsics on 64-bit vectors. Each function
; loads one vector from %B and passes it as all three source operands, then
; the output is checked for the vst3 mnemonic of the matching element size.
; NOTE(review): the i16/i32/float functions pass a typed pointer (%A) while
; the declarations at the bottom take i8* -- confirm the assembler accepts
; this, or bitcast %A to i8* first.
; 8-bit elements.
define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst3i8:
;CHECK: vst3.8
%tmp1 = load <8 x i8>* %B
call void @llvm.arm.neon.vst3i.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1)
ret void
}
; 16-bit elements.
define void @vst3i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst3i16:
;CHECK: vst3.16
%tmp1 = load <4 x i16>* %B
call void @llvm.arm.neon.vst3i.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1)
ret void
}
; 32-bit integer elements.
define void @vst3i32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst3i32:
;CHECK: vst3.32
%tmp1 = load <2 x i32>* %B
call void @llvm.arm.neon.vst3i.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1)
ret void
}
; 32-bit float elements; expected to use the same vst3.32 form as v2i32.
define void @vst3f(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst3f:
;CHECK: vst3.32
%tmp1 = load <2 x float>* %B
call void @llvm.arm.neon.vst3f.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1)
ret void
}
; Intrinsic declarations: pointer first, then the three vectors to interleave.
declare void @llvm.arm.neon.vst3i.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
declare void @llvm.arm.neon.vst3i.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>) nounwind
declare void @llvm.arm.neon.vst3i.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>) nounwind
declare void @llvm.arm.neon.vst3f.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>) nounwind

38
test/CodeGen/ARM/vst4.ll Normal file
View File

@ -0,0 +1,38 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s
; Codegen test for the NEON vst4 intrinsics on 64-bit vectors. Each function
; loads one vector from %B and passes it as all four source operands, then
; the output is checked for the vst4 mnemonic of the matching element size.
; NOTE(review): the i16/i32/float functions pass a typed pointer (%A) while
; the declarations at the bottom take i8* -- confirm the assembler accepts
; this, or bitcast %A to i8* first.
; 8-bit elements.
define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind {
;CHECK: vst4i8:
;CHECK: vst4.8
%tmp1 = load <8 x i8>* %B
call void @llvm.arm.neon.vst4i.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1)
ret void
}
; 16-bit elements.
define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst4i16:
;CHECK: vst4.16
%tmp1 = load <4 x i16>* %B
call void @llvm.arm.neon.vst4i.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1)
ret void
}
; 32-bit integer elements.
define void @vst4i32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst4i32:
;CHECK: vst4.32
%tmp1 = load <2 x i32>* %B
call void @llvm.arm.neon.vst4i.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1)
ret void
}
; 32-bit float elements; expected to use the same vst4.32 form as v2i32.
define void @vst4f(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst4f:
;CHECK: vst4.32
%tmp1 = load <2 x float>* %B
call void @llvm.arm.neon.vst4f.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1)
ret void
}
; Intrinsic declarations: pointer first, then the four vectors to interleave.
declare void @llvm.arm.neon.vst4i.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
declare void @llvm.arm.neon.vst4i.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>) nounwind
declare void @llvm.arm.neon.vst4i.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>) nounwind
declare void @llvm.arm.neon.vst4f.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>) nounwind