ARM64: use regalloc-friendly COPY_TO_REGCLASS for bitcasts

The previous patterns directly inserted FMOV or INS instructions into
the DAG when selecting scalar_to_vector & bitconvert nodes. This is
horribly inefficient and can generate far more GPR <-> FPR register
traffic than necessary.

It's much better to emit instructions the register allocator
understands so it can coalesce the copies when appropriate.
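
As a minimal sketch of the kind of IR affected (the function name here is
made up, not part of this commit):

define double @int_to_fp_bits(i64 %x) {
  ; The i64 -> f64 bitcast needs a GPR -> FPR transfer. Selecting it as
  ; COPY_TO_REGCLASS yields a generic COPY after isel, which the register
  ; allocator can coalesce away whenever the value can be placed in an FPR,
  ; rather than a hard-coded FMOVXDr that always survives into the output.
  %f = bitcast i64 %x to double
  ret double %f
}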

The old approach also led to at least one ISelLowering hack
(LowerSCALAR_TO_VECTOR) to work around the problem, and that hack was
itself incorrect for v1i64 (FPR64 has no dsub subregister). It can now
be removed entirely.
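
The broken case is essentially the new test added below: building a v1i64
from the i64 result of an across-vector intrinsic. A sketch of that IR
(the function name is made up; the intrinsic is the one from the test):

define <1 x i64> @v1i64_from_uaddv(<2 x i64> %a1) {
entry:
  ; uaddv already leaves its i64 result in a d-register.
  %sum = tail call i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
  ; The old LowerSCALAR_TO_VECTOR wrapped this in INSERT_SUBREG with
  ; ARM64::dsub, but v1i64 lives in FPR64, which has no dsub subregister.
  %vec = insertelement <1 x i64> undef, i64 %sum, i32 0
  ret <1 x i64> %vec
}

declare i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64>)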

This should also fix PR19331.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@205616 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Tim Northover
Date:   2014-04-04 09:03:09 +0000
Parent: 604dff27c9
Commit: 0eb313be18
3 changed files with 50 additions and 78 deletions


@@ -480,7 +480,6 @@ void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
   setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SCALAR_TO_VECTOR, VT.getSimpleVT(), Custom);
   setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
@@ -1973,8 +1972,6 @@ SDValue ARM64TargetLowering::LowerOperation(SDValue Op,
     return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT:
     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
-  case ISD::SCALAR_TO_VECTOR:
-    return LowerSCALAR_TO_VECTOR(Op, DAG);
   case ISD::BUILD_VECTOR:
     return LowerBUILD_VECTOR(Op, DAG);
   case ISD::VECTOR_SHUFFLE:
@@ -5577,53 +5574,6 @@ SDValue ARM64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                      Op.getOperand(1));
 }
 
-SDValue ARM64TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
-                                                   SelectionDAG &DAG) const {
-  assert(Op.getOpcode() == ISD::SCALAR_TO_VECTOR && "Unknown opcode!");
-  // Some AdvSIMD intrinsics leave their results in the scalar B/H/S/D
-  // registers. The default lowering will copy those to a GPR then back
-  // to a vector register. Instead, just recognize those cases and reference
-  // the vector register they're already a subreg of.
-  SDValue Op0 = Op->getOperand(0);
-  if (Op0->getOpcode() != ISD::INTRINSIC_WO_CHAIN)
-    return Op;
-  unsigned IID = getIntrinsicID(Op0.getNode());
-  // The below list of intrinsics isn't exhaustive. Add cases as-needed.
-  // FIXME: Even better would be if there were an attribute on the node
-  // that we could query and set in the intrinsics definition or something.
-  unsigned SubIdx;
-  switch (IID) {
-  default:
-    // Early exit if this isn't one of the intrinsics we handle.
-    return Op;
-  case Intrinsic::arm64_neon_uaddv:
-  case Intrinsic::arm64_neon_saddv:
-  case Intrinsic::arm64_neon_uaddlv:
-  case Intrinsic::arm64_neon_saddlv:
-    switch (Op0.getValueType().getSizeInBits()) {
-    default:
-      llvm_unreachable("Illegal result size from ARM64 vector intrinsic!");
-    case 8:
-      SubIdx = ARM64::bsub;
-      break;
-    case 16:
-      SubIdx = ARM64::hsub;
-      break;
-    case 32:
-      SubIdx = ARM64::ssub;
-      break;
-    case 64:
-      SubIdx = ARM64::dsub;
-      break;
-    }
-  }
-  MachineSDNode *N =
-      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(Op),
-                         Op.getValueType(), DAG.getUNDEF(Op.getValueType()),
-                         Op0, DAG.getTargetConstant(SubIdx, MVT::i32));
-  return SDValue(N, 0);
-}
-
 SDValue ARM64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
   EVT VT = Op.getOperand(0).getValueType();


@@ -1890,31 +1890,39 @@ defm FMOV : UnscaledConversion<"fmov">;
 def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>;
 def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>;
 
-def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
+def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
 
 def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
 
-def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), (FMOVDXr V64:$Vn)>;
+def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
-def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), (COPY_TO_REGCLASS GPR32:$Xn,
-                                                                FPR32)>;
-def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), (COPY_TO_REGCLASS FPR32:$Xn,
-                                                                GPR32)>;
-def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), (COPY_TO_REGCLASS GPR64:$Xn,
-                                                                FPR64)>;
-def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), (COPY_TO_REGCLASS FPR64:$Xn,
-                                                                GPR64)>;
+def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
+          (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
+def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))),
+          (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>;
+def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))),
+          (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>;
 
 //===----------------------------------------------------------------------===//
 // Floating point conversion instruction.
@@ -2971,16 +2979,18 @@ def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
 
 defm INS : SIMDIns;
 
 def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
-          (INSvi8gpr (v16i8 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn)>;
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
 def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
-          (EXTRACT_SUBREG
-              (INSvi8gpr (v16i8 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn), dsub)>;
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
 def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
-          (INSvi16gpr (v8i16 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn)>;
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
 def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
-          (EXTRACT_SUBREG
-              (INSvi16gpr (v8i16 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn), dsub)>;
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
 
 def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
           (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),


@@ -136,6 +136,18 @@ entry:
   ret i64 %vaddv.i
 }
 
+define <1 x i64> @test_vaddv_u64_to_vec(<2 x i64> %a1) {
+; CHECK-LABEL: test_vaddv_u64_to_vec:
+; CHECK: addp.2d d0, v0
+; CHECK-NOT: fmov
+; CHECK-NOT: ins
+; CHECK: ret
+entry:
+  %vaddv.i = tail call i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
+  %vec = insertelement <1 x i64> undef, i64 %vaddv.i, i32 0
+  ret <1 x i64> %vec
+}
+
 define signext i8 @test_vaddvq_s8(<16 x i8> %a1) {
 ; CHECK-LABEL: test_vaddvq_s8:
 ; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0