mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2024-12-17 03:30:28 +00:00
Break 256-bit vector int add/sub/mul into two 128-bit operations to avoid costly scalarization. Fixes PR10711.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138427 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
3ae96d69ef
commit
13894fa135
@ -998,6 +998,21 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
|
||||
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
|
||||
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
|
||||
|
||||
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
|
||||
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::ADD, MVT::v16i16, Custom);
|
||||
setOperationAction(ISD::ADD, MVT::v32i8, Custom);
|
||||
|
||||
setOperationAction(ISD::SUB, MVT::v4i64, Custom);
|
||||
setOperationAction(ISD::SUB, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::SUB, MVT::v16i16, Custom);
|
||||
setOperationAction(ISD::SUB, MVT::v32i8, Custom);
|
||||
|
||||
setOperationAction(ISD::MUL, MVT::v4i64, Custom);
|
||||
setOperationAction(ISD::MUL, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::MUL, MVT::v16i16, Custom);
|
||||
// Don't lower v32i8 because there is no 128-bit byte mul
|
||||
|
||||
// Custom lower several nodes for 256-bit types.
|
||||
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
|
||||
i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
|
||||
@ -9422,8 +9437,58 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
|
||||
return Op;
|
||||
}
|
||||
|
||||
SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
|
||||
// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
|
||||
// ones, and then concatenate the result back.
|
||||
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
|
||||
EVT VT = Op.getValueType();
|
||||
|
||||
assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
|
||||
"Unsupported value type for operation");
|
||||
|
||||
int NumElems = VT.getVectorNumElements();
|
||||
DebugLoc dl = Op.getDebugLoc();
|
||||
SDValue Idx0 = DAG.getConstant(0, MVT::i32);
|
||||
SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
|
||||
|
||||
// Extract the LHS vectors
|
||||
SDValue LHS = Op.getOperand(0);
|
||||
SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
|
||||
SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
|
||||
|
||||
// Extract the RHS vectors
|
||||
SDValue RHS = Op.getOperand(1);
|
||||
SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
|
||||
SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
|
||||
|
||||
MVT EltVT = VT.getVectorElementType().getSimpleVT();
|
||||
EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
|
||||
|
||||
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
|
||||
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
|
||||
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
|
||||
}
|
||||
|
||||
// LowerADD - Custom lowering hook for ADD nodes. Only 256-bit integer vector
// types are expected here; the actual splitting is done by Lower256IntArith.
SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType().isInteger() &&
         Op.getValueType().getSizeInBits() == 256 &&
         "Only handle AVX 256-bit vector integer operation");

  return Lower256IntArith(Op, DAG);
}
|
||||
|
||||
// LowerSUB - Custom lowering hook for SUB nodes. Only 256-bit integer vector
// types are expected here; the actual splitting is done by Lower256IntArith.
SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType().isInteger() &&
         Op.getValueType().getSizeInBits() == 256 &&
         "Only handle AVX 256-bit vector integer operation");

  return Lower256IntArith(Op, DAG);
}
|
||||
|
||||
SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
|
||||
EVT VT = Op.getValueType();
|
||||
|
||||
// Decompose 256-bit ops into smaller 128-bit ops.
|
||||
if (VT.getSizeInBits() == 256)
|
||||
return Lower256IntArith(Op, DAG);
|
||||
|
||||
assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
|
||||
DebugLoc dl = Op.getDebugLoc();
|
||||
|
||||
@ -10013,7 +10078,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
|
||||
case ISD::CTLZ: return LowerCTLZ(Op, DAG);
|
||||
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
|
||||
case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
|
||||
case ISD::MUL: return LowerMUL(Op, DAG);
|
||||
case ISD::SRA:
|
||||
case ISD::SRL:
|
||||
case ISD::SHL: return LowerShift(Op, DAG);
|
||||
@ -10029,6 +10094,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
case ISD::ADDE:
|
||||
case ISD::SUBC:
|
||||
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
|
||||
case ISD::ADD: return LowerADD(Op, DAG);
|
||||
case ISD::SUB: return LowerSUB(Op, DAG);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -819,7 +819,9 @@ namespace llvm {
|
||||
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
|
@ -131,3 +131,131 @@ entry:
|
||||
}
|
||||
|
||||
declare float @sqrtf(float) readnone
|
||||
|
||||
|
||||
; A 256-bit <4 x i64> add is expected to be split into two 128-bit vpaddq
; operations whose results are recombined with vinsertf128.
; CHECK: vextractf128 $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vpaddq %xmm
; CHECK-NEXT: vpaddq %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x i64> @vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
  %sum = add <4 x i64> %i, %j
  ret <4 x i64> %sum
}
|
||||
|
||||
; A 256-bit <8 x i32> add is expected to be split into two 128-bit vpaddd
; operations whose results are recombined with vinsertf128.
; CHECK: vextractf128 $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vpaddd %xmm
; CHECK-NEXT: vpaddd %xmm
; CHECK-NEXT: vinsertf128 $1
define <8 x i32> @vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %sum = add <8 x i32> %i, %j
  ret <8 x i32> %sum
}
|
||||
|
||||
; A 256-bit <16 x i16> add is expected to be split into two 128-bit vpaddw
; operations whose results are recombined with vinsertf128.
; CHECK: vextractf128 $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vpaddw %xmm
; CHECK-NEXT: vpaddw %xmm
; CHECK-NEXT: vinsertf128 $1
define <16 x i16> @vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %sum = add <16 x i16> %i, %j
  ret <16 x i16> %sum
}
|
||||
|
||||
; A 256-bit <32 x i8> add is expected to be split into two 128-bit vpaddb
; operations whose results are recombined with vinsertf128.
; CHECK: vextractf128 $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vpaddb %xmm
; CHECK-NEXT: vpaddb %xmm
; CHECK-NEXT: vinsertf128 $1
define <32 x i8> @vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
  %sum = add <32 x i8> %i, %j
  ret <32 x i8> %sum
}
|
||||
|
||||
; A 256-bit <4 x i64> sub is expected to be split into two 128-bit vpsubq
; operations whose results are recombined with vinsertf128.
; CHECK: vextractf128 $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vpsubq %xmm
; CHECK-NEXT: vpsubq %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x i64> @vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
  %diff = sub <4 x i64> %i, %j
  ret <4 x i64> %diff
}
|
||||
|
||||
; A 256-bit <8 x i32> sub is expected to be split into two 128-bit vpsubd
; operations whose results are recombined with vinsertf128.
; CHECK: vextractf128 $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vpsubd %xmm
; CHECK-NEXT: vpsubd %xmm
; CHECK-NEXT: vinsertf128 $1
define <8 x i32> @vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %diff = sub <8 x i32> %i, %j
  ret <8 x i32> %diff
}
|
||||
|
||||
; A 256-bit <16 x i16> sub is expected to be split into two 128-bit vpsubw
; operations whose results are recombined with vinsertf128.
; CHECK: vextractf128 $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vpsubw %xmm
; CHECK-NEXT: vpsubw %xmm
; CHECK-NEXT: vinsertf128 $1
define <16 x i16> @vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %diff = sub <16 x i16> %i, %j
  ret <16 x i16> %diff
}
|
||||
|
||||
; A 256-bit <32 x i8> sub is expected to be split into two 128-bit vpsubb
; operations whose results are recombined with vinsertf128.
; CHECK: vextractf128 $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vpsubb %xmm
; CHECK-NEXT: vpsubb %xmm
; CHECK-NEXT: vinsertf128 $1
define <32 x i8> @vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
  %diff = sub <32 x i8> %i, %j
  ret <32 x i8> %diff
}
|
||||
|
||||
; A 256-bit <8 x i32> mul is expected to be split into two 128-bit vpmulld
; operations whose results are recombined with vinsertf128.
; CHECK: vextractf128 $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vpmulld %xmm
; CHECK-NEXT: vpmulld %xmm
; CHECK-NEXT: vinsertf128 $1
define <8 x i32> @vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
  %prod = mul <8 x i32> %i, %j
  ret <8 x i32> %prod
}
|
||||
|
||||
; A 256-bit <16 x i16> mul is expected to be split into two 128-bit vpmullw
; operations whose results are recombined with vinsertf128.
; CHECK: vextractf128 $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vpmullw %xmm
; CHECK-NEXT: vpmullw %xmm
; CHECK-NEXT: vinsertf128 $1
define <16 x i16> @vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
  %prod = mul <16 x i16> %i, %j
  ret <16 x i16> %prod
}
|
||||
|
||||
; A 256-bit <4 x i64> mul is also split into 128-bit halves. Between the two
; extracts and the final vinsertf128, the expected output is a sequence of
; vpmuludq, vpsrlq/vpsllq by 32 and vpaddq instructions on xmm registers.
; CHECK: vextractf128 $1
; CHECK-NEXT: vextractf128 $1
; CHECK-NEXT: vpmuludq %xmm
; CHECK-NEXT: vpsrlq $32, %xmm
; CHECK-NEXT: vpmuludq %xmm
; CHECK-NEXT: vpsllq $32, %xmm
; CHECK-NEXT: vpaddq %xmm
; CHECK-NEXT: vpmuludq %xmm
; CHECK-NEXT: vpsrlq $32, %xmm
; CHECK-NEXT: vpmuludq %xmm
; CHECK-NEXT: vpsllq $32, %xmm
; CHECK-NEXT: vpsrlq $32, %xmm
; CHECK-NEXT: vpmuludq %xmm
; CHECK-NEXT: vpsllq $32, %xmm
; CHECK-NEXT: vpaddq %xmm
; CHECK-NEXT: vpaddq %xmm
; CHECK-NEXT: vpsrlq $32, %xmm
; CHECK-NEXT: vpmuludq %xmm
; CHECK-NEXT: vpsllq $32, %xmm
; CHECK-NEXT: vpaddq %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x i64> @mul-v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
  %prod = mul <4 x i64> %i, %j
  ret <4 x i64> %prod
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user