ARM64: use regalloc-friendly COPY_TO_REGCLASS for bitcasts
The previous patterns directly inserted FMOV or INS instructions into the DAG for scalar_to_vector & bitconvert patterns. This is horribly inefficient and can generate a lot more GPR <-> FPR register traffic than necessary. It's much better to emit instructions the register allocator understands, so that it can coalesce the copies when appropriate (see the illustrative IR sketch below).

The old approach also required at least one ISelLowering hack to work around the problems, and that hack was incorrect for v1i64 (FPR64 has no dsub subregister); it can now be removed entirely.

This should also fix PR19331.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@205616 91177308-0d34-0410-b5e6-96231b3b80d8
parent 604dff27c9
commit 0eb313be18
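As a hypothetical illustration of the copy traffic the message describes (this function is not part of the commit or its tests; the name is made up): the low lane of a v2i64 argument already lives in the right D register, so both cross-bank moves below are pure copies. The old FMOVDXr/FMOVXDr patterns pinned them as real instructions; emitting COPY_TO_REGCLASS instead produces generic COPYs that the register allocator can coalesce away.

; Illustrative IR only -- not one of this commit's tests.
define <1 x i64> @low_lane(<2 x i64> %v) {
entry:
  ; vector lane -> i64: with the old patterns, a hard FPR -> GPR move
  %lane0 = extractelement <2 x i64> %v, i32 0
  ; scalar_to_vector: with the old patterns, a hard GPR -> FPR move back
  %vec = insertelement <1 x i64> undef, i64 %lane0, i32 0
  ret <1 x i64> %vec
}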
@@ -480,7 +480,6 @@ void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
   setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SCALAR_TO_VECTOR, VT.getSimpleVT(), Custom);
   setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
@@ -1973,8 +1972,6 @@ SDValue ARM64TargetLowering::LowerOperation(SDValue Op,
     return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT:
     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
-  case ISD::SCALAR_TO_VECTOR:
-    return LowerSCALAR_TO_VECTOR(Op, DAG);
   case ISD::BUILD_VECTOR:
     return LowerBUILD_VECTOR(Op, DAG);
   case ISD::VECTOR_SHUFFLE:
@@ -5577,53 +5574,6 @@ SDValue ARM64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                      Op.getOperand(1));
 }
 
-SDValue ARM64TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
-                                                   SelectionDAG &DAG) const {
-  assert(Op.getOpcode() == ISD::SCALAR_TO_VECTOR && "Unknown opcode!");
-  // Some AdvSIMD intrinsics leave their results in the scalar B/H/S/D
-  // registers. The default lowering will copy those to a GPR then back
-  // to a vector register. Instead, just recognize those cases and reference
-  // the vector register they're already a subreg of.
-  SDValue Op0 = Op->getOperand(0);
-  if (Op0->getOpcode() != ISD::INTRINSIC_WO_CHAIN)
-    return Op;
-  unsigned IID = getIntrinsicID(Op0.getNode());
-  // The below list of intrinsics isn't exhaustive. Add cases as-needed.
-  // FIXME: Even better would be if there were an attribute on the node
-  // that we could query and set in the intrinsics definition or something.
-  unsigned SubIdx;
-  switch (IID) {
-  default:
-    // Early exit if this isn't one of the intrinsics we handle.
-    return Op;
-  case Intrinsic::arm64_neon_uaddv:
-  case Intrinsic::arm64_neon_saddv:
-  case Intrinsic::arm64_neon_uaddlv:
-  case Intrinsic::arm64_neon_saddlv:
-    switch (Op0.getValueType().getSizeInBits()) {
-    default:
-      llvm_unreachable("Illegal result size from ARM64 vector intrinsic!");
-    case 8:
-      SubIdx = ARM64::bsub;
-      break;
-    case 16:
-      SubIdx = ARM64::hsub;
-      break;
-    case 32:
-      SubIdx = ARM64::ssub;
-      break;
-    case 64:
-      SubIdx = ARM64::dsub;
-      break;
-    }
-  }
-  MachineSDNode *N =
-      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(Op),
-                         Op.getValueType(), DAG.getUNDEF(Op.getValueType()),
-                         Op0, DAG.getTargetConstant(SubIdx, MVT::i32));
-  return SDValue(N, 0);
-}
-
 SDValue ARM64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
   EVT VT = Op.getOperand(0).getValueType();
@@ -1890,31 +1890,39 @@ defm FMOV : UnscaledConversion<"fmov">;
 def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>;
 def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>;
 
-def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
+def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
 
-def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), (FMOVDXr V64:$Vn)>;
+def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
 
-def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), (COPY_TO_REGCLASS GPR32:$Xn,
-                                               FPR32)>;
-def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), (COPY_TO_REGCLASS FPR32:$Xn,
-                                               GPR32)>;
-def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), (COPY_TO_REGCLASS GPR64:$Xn,
-                                               FPR64)>;
-def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), (COPY_TO_REGCLASS FPR64:$Xn,
-                                               GPR64)>;
+def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
+          (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
+def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))),
+          (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>;
+def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))),
+          (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>;
 
 //===----------------------------------------------------------------------===//
 // Floating point conversion instruction.
@@ -2971,16 +2979,18 @@ def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
 defm INS : SIMDIns;
 
 def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
-          (INSvi8gpr (v16i8 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn)>;
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
 def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
-          (EXTRACT_SUBREG
-           (INSvi8gpr (v16i8 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn), dsub)>;
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
 
 def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
-          (INSvi16gpr (v8i16 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn)>;
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
 def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
-          (EXTRACT_SUBREG
-           (INSvi16gpr (v8i16 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn), dsub)>;
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
 
 def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
           (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
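A hypothetical sketch of the IR shape those GPR32 scalar_to_vector patterns match (the function name is assumed, not taken from the commit): inserting into undef at lane 0 becomes a scalar_to_vector node. Only lane 0 is defined, so a single GPR32 -> FPR32 copy into ssub suffices; an S-register write zeroes the rest of the vector register, which is presumably why SUBREG_TO_REG is safe here, where the old form inserted into a full 128-bit IMPLICIT_DEF with INSvi16gpr and then extracted dsub.

; Illustrative IR only -- not one of this commit's tests.
define <4 x i16> @set_lane0(i32 %w) {
entry:
  %t = trunc i32 %w to i16
  ; insertelement into undef at lane 0 lowers to scalar_to_vector
  %vec = insertelement <4 x i16> undef, i16 %t, i32 0
  ret <4 x i16> %vec
}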
@@ -136,6 +136,18 @@ entry:
   ret i64 %vaddv.i
 }
 
+define <1 x i64> @test_vaddv_u64_to_vec(<2 x i64> %a1) {
+; CHECK-LABEL: test_vaddv_u64_to_vec:
+; CHECK: addp.2d d0, v0
+; CHECK-NOT: fmov
+; CHECK-NOT: ins
+; CHECK: ret
+entry:
+  %vaddv.i = tail call i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
+  %vec = insertelement <1 x i64> undef, i64 %vaddv.i, i32 0
+  ret <1 x i64> %vec
+}
+
 define signext i8 @test_vaddvq_s8(<16 x i8> %a1) {
 ; CHECK-LABEL: test_vaddvq_s8:
 ; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0