llvm-6502/lib/Target/ARM/ARMTargetMachine.cpp
Evan Cheng 48575f6ea7 Making use of VFP / NEON floating point multiply-accumulate / subtraction is
difficult on current ARM implementations for a few reasons.
1. Even though a single vmla has latency that is one cycle shorter than a pair
   of vmul + vadd, a RAW hazard during the first (4? on Cortex-a8) can cause
   additional pipeline stall. So it's frequently better to single codegen
   vmul + vadd.
2. A vmla folowed by a vmul, vmadd, or vsub causes the second fp instruction to
   stall for 4 cycles. We need to schedule them apart.
3. A vmla followed vmla is a special case. Obvious issuing back to back RAW
   vmla + vmla is very bad. But this isn't ideal either:
     vmul
     vadd
     vmla
   Instead, we want to expand the second vmla:
     vmla
     vmul
     vadd
   Even with the 4 cycle vmul stall, the second sequence is still 2 cycles
   faster.

Up to now, isel simply avoid codegen'ing fp vmla / vmls. This works well enough
but it isn't the optimial solution. This patch attempts to make it possible to
use vmla / vmls in cases where it is profitable.

A. Add missing isel predicates which cause vmla to be codegen'ed.
B. Make sure the fmul in (fadd (fmul)) has a single use. We don't want to
   compute a fmul and a fmla.
C. Add additional isel checks for vmla, avoid cases where vmla is feeding into
   fp instructions (except for the #3 exceptional case).
D. Add ARM hazard recognizer to model the vmla / vmls hazards.
E. Add a special pre-regalloc case to expand vmla / vmls when it's likely the
   vmla / vmls will trigger one of the special hazards.

Work in progress, only A+B are enabled.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@120960 91177308-0d34-0410-b5e6-96231b3b80d8
2010-12-05 22:04:16 +00:00

203 lines
7.1 KiB
C++

//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#include "ARMTargetMachine.h"
#include "ARMMCAsmInfo.h"
#include "ARMFrameInfo.h"
#include "ARM.h"
#include "llvm/PassManager.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
static cl::opt<bool>ExpandMLx("expand-fp-mlx", cl::init(false), cl::Hidden);
static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
Triple TheTriple(TT);
switch (TheTriple.getOS()) {
case Triple::Darwin:
return new ARMMCAsmInfoDarwin();
default:
return new ARMELFMCAsmInfo();
}
}
// This is duplicated code. Refactor this.
static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
MCContext &Ctx, TargetAsmBackend &TAB,
raw_ostream &OS,
MCCodeEmitter *Emitter,
bool RelaxAll) {
switch (Triple(TT).getOS()) {
case Triple::Darwin:
return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
case Triple::MinGW32:
case Triple::MinGW64:
case Triple::Cygwin:
case Triple::Win32:
llvm_unreachable("ARM does not support Windows COFF format");
return NULL;
default:
return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
}
}
extern "C" void LLVMInitializeARMTarget() {
// Register the target.
RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget);
RegisterTargetMachine<ThumbTargetMachine> Y(TheThumbTarget);
// Register the target asm info.
RegisterAsmInfoFn A(TheARMTarget, createMCAsmInfo);
RegisterAsmInfoFn B(TheThumbTarget, createMCAsmInfo);
// Register the MC Code Emitter
TargetRegistry::RegisterCodeEmitter(TheARMTarget, createARMMCCodeEmitter);
TargetRegistry::RegisterCodeEmitter(TheThumbTarget, createARMMCCodeEmitter);
// Register the asm backend.
TargetRegistry::RegisterAsmBackend(TheARMTarget, createARMAsmBackend);
TargetRegistry::RegisterAsmBackend(TheThumbTarget, createARMAsmBackend);
// Register the object streamer.
TargetRegistry::RegisterObjectStreamer(TheARMTarget, createMCStreamer);
TargetRegistry::RegisterObjectStreamer(TheThumbTarget, createMCStreamer);
}
/// TargetMachine ctor - Create an ARM architecture model.
///
ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T,
const std::string &TT,
const std::string &FS,
bool isThumb)
: LLVMTargetMachine(T, TT),
Subtarget(TT, FS, isThumb),
JITInfo(),
InstrItins(Subtarget.getInstrItineraryData())
{
DefRelocModel = getRelocationModel();
}
ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT,
const std::string &FS)
: ARMBaseTargetMachine(T, TT, FS, false), InstrInfo(Subtarget),
DataLayout(Subtarget.isAPCS_ABI() ?
std::string("e-p:32:32-f64:32:64-i64:32:64-"
"v128:32:128-v64:32:64-n32") :
std::string("e-p:32:32-f64:64:64-i64:64:64-"
"v128:64:128-v64:64:64-n32")),
ELFWriterInfo(*this),
TLInfo(*this),
TSInfo(*this),
FrameInfo(Subtarget) {
if (!Subtarget.hasARMOps())
report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
"support ARM mode execution!");
}
ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT,
const std::string &FS)
: ARMBaseTargetMachine(T, TT, FS, true),
InstrInfo(Subtarget.hasThumb2()
? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget))
: ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))),
DataLayout(Subtarget.isAPCS_ABI() ?
std::string("e-p:32:32-f64:32:64-i64:32:64-"
"i16:16:32-i8:8:32-i1:8:32-"
"v128:32:128-v64:32:64-a:0:32-n32") :
std::string("e-p:32:32-f64:64:64-i64:64:64-"
"i16:16:32-i8:8:32-i1:8:32-"
"v128:64:128-v64:64:64-a:0:32-n32")),
ELFWriterInfo(*this),
TLInfo(*this),
TSInfo(*this),
FrameInfo(Subtarget.hasThumb2()
? new ARMFrameInfo(Subtarget)
: (ARMFrameInfo*)new Thumb1FrameInfo(Subtarget)) {
}
// Pass Pipeline Configuration
bool ARMBaseTargetMachine::addPreISel(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
if (OptLevel != CodeGenOpt::None)
PM.add(createARMGlobalMergePass(getTargetLowering()));
return false;
}
bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
PM.add(createARMISelDag(*this, OptLevel));
return false;
}
bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
// FIXME: temporarily disabling load / store optimization pass for Thumb1.
if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
PM.add(createARMLoadStoreOptimizationPass(true));
if (ExpandMLx &&
OptLevel != CodeGenOpt::None && Subtarget.hasVFP2())
PM.add(createMLxExpansionPass());
return true;
}
bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
// FIXME: temporarily disabling load / store optimization pass for Thumb1.
if (OptLevel != CodeGenOpt::None) {
if (!Subtarget.isThumb1Only())
PM.add(createARMLoadStoreOptimizationPass());
if (Subtarget.hasNEON())
PM.add(createNEONMoveFixPass());
}
// Expand some pseudo instructions into multiple instructions to allow
// proper scheduling.
PM.add(createARMExpandPseudoPass());
if (OptLevel != CodeGenOpt::None) {
if (!Subtarget.isThumb1Only())
PM.add(createIfConverterPass());
}
if (Subtarget.isThumb2())
PM.add(createThumb2ITBlockPass());
return true;
}
bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
if (Subtarget.isThumb2() && !Subtarget.prefers32BitThumb())
PM.add(createThumb2SizeReductionPass());
PM.add(createARMConstantIslandPass());
return true;
}
bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
JITCodeEmitter &JCE) {
// FIXME: Move this to TargetJITInfo!
if (DefRelocModel == Reloc::Default)
setRelocationModel(Reloc::Static);
// Machine code emitter pass for ARM.
PM.add(createARMJITCodeEmitterPass(*this, JCE));
return false;
}