diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index be6bec75928..d4caf5ca6e1 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -46,6 +46,10 @@ public: virtual ARMJITInfo *getJITInfo() { return &JITInfo; } virtual const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; } + virtual const ARMTargetLowering *getTargetLowering() const { + // Must be overridden by derived target machines; reaching this base version is a bug + llvm_unreachable("getTargetLowering not implemented"); + } virtual const InstrItineraryData *getInstrItineraryData() const { return &InstrItins; } diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 404a6fff117..61cb1f6b9a3 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/CostTable.h" using namespace llvm; // Declare the pass initialization routine locally as target-specific passes @@ -34,18 +35,20 @@ namespace { class ARMTTI : public ImmutablePass, public TargetTransformInfo { const ARMBaseTargetMachine *TM; const ARMSubtarget *ST; + const ARMTargetLowering *TLI; /// Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the result needs to be inserted and/or extracted from vectors. 
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; public: - ARMTTI() : ImmutablePass(ID), TM(0), ST(0) { + ARMTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { llvm_unreachable("This pass cannot be directly constructed"); } ARMTTI(const ARMBaseTargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()) { + : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), + TLI(TM->getTargetLowering()) { initializeARMTTIPass(*PassRegistry::getPassRegistry()); } @@ -111,6 +114,9 @@ public: return 1; } + unsigned getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const; + /// @} }; @@ -157,3 +163,37 @@ unsigned ARMTTI::getIntImmCost(const APInt &Imm, Type *Ty) const { } return 2; } + +unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const { + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + EVT SrcTy = TLI->getValueType(Src); + EVT DstTy = TLI->getValueType(Dst); + + if (!SrcTy.isSimple() || !DstTy.isSimple()) + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); + + // Some arithmetic, load and store instructions extend or truncate their + // operands as part of the operation, so such casts are costed as free (0). + // TODO: Teach these tables which related operations make the cast free. + static const TypeConversionCostTblEntry NEONConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + }; + + if (ST->hasNEON()) { + int Idx = ConvertCostTableLookup(NEONConversionTbl, + array_lengthof(NEONConversionTbl), + ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()); + if (Idx != -1) + return NEONConversionTbl[Idx].Cost; + } + + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); +} diff --git 
a/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll b/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll new file mode 100644 index 00000000000..d2e3de279f7 --- /dev/null +++ b/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll @@ -0,0 +1,114 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s +; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s +; ASM lines below are only for reference, tests in that direction should go to test/CodeGen/ARM + +; ModuleID = 'arm.ll' +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64" +target triple = "armv7--linux-gnueabihf" + +%T216 = type <2 x i16> +%T232 = type <2 x i32> +%T264 = type <2 x i64> + +%T416 = type <4 x i16> +%T432 = type <4 x i32> +%T464 = type <4 x i64> + +define void @direct(%T432* %loadaddr, %T432* %loadaddr2, %T432* %storeaddr) { +; COST: function 'direct': + %v0 = load %T432* %loadaddr +; ASM: vld1.64 + %v1 = load %T432* %loadaddr2 +; ASM: vld1.64 + %r3 = mul %T432 %v0, %v1 +; COST: cost of 2 for instruction: {{.*}} mul <4 x i32> +; ASM: vmul.i32 + store %T432 %r3, %T432* %storeaddr +; ASM: vst1.64 + ret void +} + +define void @ups1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) { +; COST: function 'ups1632': + %v0 = load %T416* %loadaddr +; ASM: vldr + %v1 = load %T416* %loadaddr2 +; ASM: vldr + %r1 = sext %T416 %v0 to %T432 + %r2 = sext %T416 %v1 to %T432 +; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32> + %r3 = mul %T432 %r1, %r2 +; COST: cost of 2 for instruction: {{.*}} mul <4 x i32> +; ASM: vmull.s16 + store %T432 %r3, %T432* %storeaddr +; ASM: vst1.64 + ret void +} + +define void @upu1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) { +; COST: function 'upu1632': + %v0 = load %T416* %loadaddr +; ASM: vldr + %v1 = load %T416* %loadaddr2 +; ASM: vldr + %r1 = zext 
%T416 %v0 to %T432 + %r2 = zext %T416 %v1 to %T432 +; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32> + %r3 = mul %T432 %r1, %r2 +; COST: cost of 2 for instruction: {{.*}} mul <4 x i32> +; ASM: vmull.u16 + store %T432 %r3, %T432* %storeaddr +; ASM: vst1.64 + ret void +} + +define void @ups3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) { +; COST: function 'ups3264': + %v0 = load %T232* %loadaddr +; ASM: vldr + %v1 = load %T232* %loadaddr2 +; ASM: vldr + %r3 = mul %T232 %v0, %v1 +; ASM: vmul.i32 +; COST: cost of 1 for instruction: {{.*}} mul <2 x i32> + %st = sext %T232 %r3 to %T264 +; ASM: vmovl.s32 +; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64> + store %T264 %st, %T264* %storeaddr +; ASM: vst1.64 + ret void +} + +define void @upu3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) { +; COST: function 'upu3264': + %v0 = load %T232* %loadaddr +; ASM: vldr + %v1 = load %T232* %loadaddr2 +; ASM: vldr + %r3 = mul %T232 %v0, %v1 +; ASM: vmul.i32 +; COST: cost of 1 for instruction: {{.*}} mul <2 x i32> + %st = zext %T232 %r3 to %T264 +; ASM: vmovl.u32 +; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64> + store %T264 %st, %T264* %storeaddr +; ASM: vst1.64 + ret void +} + +define void @dn3216(%T432* %loadaddr, %T432* %loadaddr2, %T416* %storeaddr) { +; COST: function 'dn3216': + %v0 = load %T432* %loadaddr +; ASM: vld1.64 + %v1 = load %T432* %loadaddr2 +; ASM: vld1.64 + %r3 = mul %T432 %v0, %v1 +; ASM: vmul.i32 +; COST: cost of 2 for instruction: {{.*}} mul <4 x i32> + %st = trunc %T432 %r3 to %T416 +; ASM: vmovn.i32 +; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16> + store %T416 %st, %T416* %storeaddr +; ASM: vstr + ret void +}