llvm-6502/lib/MC/MCObjectDisassembler.cpp

//===- lib/MC/MCObjectDisassembler.cpp ------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "llvm/MC/MCObjectDisassembler.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAtom.h"
#include "llvm/MC/MCDisassembler.h"
#include "llvm/MC/MCFunction.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCModule.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/StringRefMemoryObject.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
#include <set>

using namespace llvm;
using namespace object;

/// Construct a disassembler over \p Obj.
///
/// The members Obj, Dis and MIA are initialized from the arguments of the
/// same name; the constructor does no other work.
MCObjectDisassembler::MCObjectDisassembler(const ObjectFile &Obj,
                                           const MCDisassembler &Dis,
                                           const MCInstrAnalysis &MIA)
    : Obj(Obj),
      Dis(Dis),
      MIA(MIA) {
}

/// Build a fresh MCModule describing the object file.
///
/// The module is always populated with one atom per section (see
/// buildSectionAtoms). When \p withCFG is true the text atoms are further
/// split into basic blocks and functions are created (see buildCFG).
///
/// \returns a module allocated with `new`; this function never frees it.
/// NOTE(review): presumably the caller takes ownership - confirm at call sites.
MCModule *MCObjectDisassembler::buildModule(bool withCFG) {
  MCModule *const Result = new MCModule;

  // Atom formation is unconditional; CFG reconstruction is opt-in.
  buildSectionAtoms(Result);
  if (!withCFG)
    return Result;

  buildCFG(Result);
  return Result;
}

/// Create one atom in \p Module for every text or data section of the object.
///
/// Sections that are neither text nor data, whose address or size is unknown,
/// that are empty, or whose contents do not match their reported size are
/// skipped. A section is also skipped when querying its address, size or
/// contents reports an error. Text sections are disassembled linearly from
/// their first byte; data sections are copied byte by byte. A section that is
/// reported as both text and data is treated as text.
void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) {
  error_code ec;
  for (section_iterator SI = Obj.begin_sections(),
                        SE = Obj.end_sections();
                        SI != SE;
                        SI.increment(ec)) {
    if (ec) break;

    // Every out-parameter below is given a defined value and the returned
    // error_code is honored, so a failed query never leaves us reading an
    // indeterminate value.
    bool isText = false;
    if (SI->isText(isText))
      isText = false;
    bool isData = false;
    if (SI->isData(isData))
      isData = false;
    if (!isData && !isText)
      continue;

    uint64_t StartAddr = UnknownAddressOrSize;
    if (SI->getAddress(StartAddr))
      continue;
    uint64_t SecSize = UnknownAddressOrSize;
    if (SI->getSize(SecSize))
      continue;
    if (StartAddr == UnknownAddressOrSize || SecSize == UnknownAddressOrSize)
      continue;

    StringRef Contents;
    if (SI->getContents(Contents))
      continue;

    // We don't care about things like non-file-backed sections yet.
    if (Contents.size() != SecSize || !SecSize)
      continue;
    // Atom ranges are inclusive, hence the -1 (SecSize is non-zero here).
    uint64_t EndAddr = StartAddr + SecSize - 1;

    // An unreadable name is not fatal: the atom is still created, unnamed.
    StringRef SecName;
    if (SI->getName(SecName))
      SecName = StringRef();

    if (isText) {
      // Only the disassembler needs the memory-object view of the contents.
      StringRefMemoryObject memoryObject(Contents);
      MCTextAtom *Text = Module->createTextAtom(StartAddr, EndAddr);
      Text->setName(SecName);
      uint64_t InstSize = 0;
      // Index is an offset into the section contents, not an address.
      for (uint64_t Index = 0; Index < SecSize; Index += InstSize) {
        MCInst Inst;
        if (Dis.getInstruction(Inst, InstSize, memoryObject, Index,
                               nulls(), nulls()))
          Text->addInst(Inst, InstSize);
        else
          // We don't care about splitting mixed atoms either.
          llvm_unreachable("Couldn't disassemble instruction in atom.");
      }

    } else {
      MCDataAtom *Data = Module->createDataAtom(StartAddr, EndAddr);
      Data->setName(SecName);
      for (uint64_t Index = 0; Index < SecSize; ++Index)
        Data->addData(Contents[Index]);
    }
  }
}

namespace {
  struct BBInfo;
  typedef std::set<BBInfo*> BBInfoSetTy;

  struct BBInfo {
    MCTextAtom *Atom;
    MCBasicBlock *BB;
    BBInfoSetTy Succs;
    BBInfoSetTy Preds;

    void addSucc(BBInfo &Succ) {
      Succs.insert(&Succ);
      Succ.Preds.insert(this);
    }
  };
}

/// Reconstruct functions and basic blocks for \p Module.
///
/// Works in four passes over the module's text atoms:
///  1. Collect split points (the address after every terminator and every
///     evaluable branch target) and function entries (the start of every
///     text atom and every evaluable call target).
///  2. Split text atoms at the collected addresses so that each atom is one
///     basic block.
///  3. Derive successor/predecessor edges from the last instruction of each
///     text atom.
///  4. For every function entry that has an atom, create an MCFunction and
///     one MCBasicBlock per atom reachable through successor or predecessor
///     edges, then mirror the edges onto the blocks.
///
/// Addresses that are referenced but not covered by a text atom (for example
/// a branch out of the object, or fall-through past the end of a section)
/// get a BBInfo record with no atom; no block is created for them and no
/// edge to or from them is added to the blocks.
///
/// The module must not already contain functions.
void MCObjectDisassembler::buildCFG(MCModule *Module) {
  typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
  BBInfoByAddrTy BBInfos;
  typedef std::set<uint64_t> AddressSetTy;
  AddressSetTy Splits;
  AddressSetTy Calls;

  assert(Module->func_begin() == Module->func_end()
         && "Module already has a CFG!");

  // First, determine the basic block boundaries and call targets.
  for (MCModule::atom_iterator AI = Module->atom_begin(),
                               AE = Module->atom_end();
       AI != AE; ++AI) {
    MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
    if (!TA) continue;
    // The start of every text atom is treated as a function entry.
    Calls.insert(TA->getBeginAddr());
    BBInfos[TA->getBeginAddr()].Atom = TA;
    for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end();
         II != IE; ++II) {
      // A terminator ends its block, so the next instruction starts one.
      if (MIA.isTerminator(II->Inst))
        Splits.insert(II->Address + II->Size);
      uint64_t Target;
      if (MIA.evaluateBranch(II->Inst, II->Address, II->Size, Target)) {
        if (MIA.isCall(II->Inst))
          Calls.insert(Target);
        Splits.insert(Target);
      }
    }
  }

  // Split text atoms into basic block atoms.
  for (AddressSetTy::const_iterator SI = Splits.begin(), SE = Splits.end();
       SI != SE; ++SI) {
    MCAtom *A = Module->findAtomContaining(*SI);
    if (!A) continue;
    // A computed target may land in a data atom; only text atoms are split,
    // so such an address is simply left without a block.
    MCTextAtom *TA = dyn_cast<MCTextAtom>(A);
    if (!TA) continue;
    if (TA->getBeginAddr() == *SI)
      continue;
    MCTextAtom *NewAtom = TA->split(*SI);
    BBInfos[NewAtom->getBeginAddr()].Atom = NewAtom;
    // Name the new atom "<base>:<hex address>", where <base> is the parent's
    // name with any previous ":<suffix>" stripped.
    StringRef BBName = TA->getName();
    BBName = BBName.substr(0, BBName.find_last_of(':'));
    NewAtom->setName((BBName + ":" + utohexstr(*SI)).str());
  }

  // Compute succs/preds.
  for (MCModule::atom_iterator AI = Module->atom_begin(),
                               AE = Module->atom_end();
                               AI != AE; ++AI) {
    MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
    if (!TA) continue;
    BBInfo &CurBB = BBInfos[TA->getBeginAddr()];
    // Only the last instruction of a block decides where control goes next.
    const MCDecodedInst &LI = TA->back();
    if (MIA.isBranch(LI.Inst)) {
      uint64_t Target;
      if (MIA.evaluateBranch(LI.Inst, LI.Address, LI.Size, Target))
        CurBB.addSucc(BBInfos[Target]);
      // A conditional branch can also fall through.
      if (MIA.isConditionalBranch(LI.Inst))
        CurBB.addSucc(BBInfos[LI.Address + LI.Size]);
    } else if (!MIA.isTerminator(LI.Inst))
      CurBB.addSucc(BBInfos[LI.Address + LI.Size]);
  }


  // Create functions and basic blocks.
  for (AddressSetTy::const_iterator CI = Calls.begin(), CE = Calls.end();
       CI != CE; ++CI) {
    BBInfo &EntryBBI = BBInfos[*CI];
    if (!EntryBBI.Atom) continue;

    MCFunction &MCFN = *Module->createFunction(EntryBBI.Atom->getName());

    // Create MCBBs.
    // The worklist grows while it is being walked, hence the index loop.
    SmallSetVector<BBInfo*, 16> Worklist;
    Worklist.insert(&EntryBBI);
    for (size_t WI = 0; WI < Worklist.size(); ++WI) {
      BBInfo *WorkBBI = Worklist[WI];
      if (!WorkBBI->Atom)
        continue;
      WorkBBI->BB = &MCFN.createBlock(*WorkBBI->Atom);
      // Add all predecessors and successors to the worklist.
      for (BBInfoSetTy::iterator SI = WorkBBI->Succs.begin(),
                                 SE = WorkBBI->Succs.end();
                                 SI != SE; ++SI)
        Worklist.insert(*SI);
      for (BBInfoSetTy::iterator PI = WorkBBI->Preds.begin(),
                                 PE = WorkBBI->Preds.end();
                                 PI != PE; ++PI)
        Worklist.insert(*PI);
    }

    // Set preds/succs.
    for (size_t WI = 0; WI < Worklist.size(); ++WI) {
      BBInfo *WorkBBI = Worklist[WI];
      MCBasicBlock *MCBB = WorkBBI->BB;
      if (!MCBB)
        continue;
      // Neighbours without an atom never received a block; skip them rather
      // than recording a null successor or predecessor.
      for (BBInfoSetTy::iterator SI = WorkBBI->Succs.begin(),
                                 SE = WorkBBI->Succs.end();
                                 SI != SE; ++SI)
        if ((*SI)->BB)
          MCBB->addSuccessor((*SI)->BB);
      for (BBInfoSetTy::iterator PI = WorkBBI->Preds.begin(),
                                 PE = WorkBBI->Preds.end();
                                 PI != PE; ++PI)
        if ((*PI)->BB)
          MCBB->addPredecessor((*PI)->BB);
    }
  }
}
MC: Disassembled CFG reconstruction. This patch builds on some existing code to do CFG reconstruction from a disassembled binary: - MCModule represents the binary, and has a list of MCAtoms. - MCAtom represents either disassembled instructions (MCTextAtom), or contiguous data (MCDataAtom), and covers a specific range of addresses. - MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is backed by an MCTextAtom, and has the usual successors/predecessors. - MCObjectDisassembler creates a module from an ObjectFile using a disassembler. It first builds an atom for each section. It can also construct the CFG, and this splits the text atoms into basic blocks. MCModule and MCAtom were only sketched out; MCFunction and MCBB were implemented under the experimental "-cfg" llvm-objdump -macho option. This cleans them up for further use; llvm-objdump -d -cfg now generates graphviz files for each function found in the binary. In the future, MCObjectDisassembler may be the right place to do "intelligent" disassembly: for example, handling constant islands is just a matter of splitting the atom, using information that may be available in the ObjectFile. Also, better initial atom formation than just using sections is possible using symbols (and things like Mach-O's function_starts load command). This brings two minor regressions in llvm-objdump -macho -cfg: - The printing of a relocation's referenced symbol. - An annotation on loop BBs, i.e., which are their own successor. Relocation printing is replaced by the MCSymbolizer; the basic CFG annotation will be superseded by more related functionality. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8 2013-05-24 01:07:04 +00:00			`//===- lib/MC/MCObjectDisassembler.cpp ------------------------------------===//`
			`//`
			`// The LLVM Compiler Infrastructure`
			`//`
			`// This file is distributed under the University of Illinois Open Source`
			`// License. See LICENSE.TXT for details.`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "llvm/MC/MCObjectDisassembler.h"`
			`#include "llvm/ADT/STLExtras.h"`
			`#include "llvm/ADT/SetVector.h"`
			`#include "llvm/ADT/StringExtras.h"`
			`#include "llvm/ADT/StringRef.h"`
			`#include "llvm/ADT/Twine.h"`
			`#include "llvm/MC/MCAtom.h"`
			`#include "llvm/MC/MCDisassembler.h"`
			`#include "llvm/MC/MCFunction.h"`
			`#include "llvm/MC/MCInstrAnalysis.h"`
			`#include "llvm/MC/MCModule.h"`
			`#include "llvm/Object/ObjectFile.h"`
			`#include "llvm/Support/MemoryObject.h"`
			`#include "llvm/Support/StringRefMemoryObject.h"`
			`#include "llvm/Support/raw_ostream.h"`
			`#include <map>`
			`#include <set>`

			`using namespace llvm;`
			`using namespace object;`

			`MCObjectDisassembler::MCObjectDisassembler(const ObjectFile &Obj,`
			`const MCDisassembler &Dis,`
			`const MCInstrAnalysis &MIA)`
			`: Obj(Obj), Dis(Dis), MIA(MIA) {}`

			`MCModule *MCObjectDisassembler::buildModule(bool withCFG) {`
			`MCModule *Module = new MCModule;`
			`buildSectionAtoms(Module);`
			`if (withCFG)`
			`buildCFG(Module);`
			`return Module;`
			`}`

			`void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) {`
			`error_code ec;`
			`for (section_iterator SI = Obj.begin_sections(),`
			`SE = Obj.end_sections();`
			`SI != SE;`
			`SI.increment(ec)) {`
			`if (ec) break;`

			`bool isText; SI->isText(isText);`
			`bool isData; SI->isData(isData);`
			`if (!isData && !isText)`
			`continue;`

			`uint64_t StartAddr; SI->getAddress(StartAddr);`
			`uint64_t SecSize; SI->getSize(SecSize);`
			`if (StartAddr == UnknownAddressOrSize \|\| SecSize == UnknownAddressOrSize)`
			`continue;`

			`StringRef Contents; SI->getContents(Contents);`
			`StringRefMemoryObject memoryObject(Contents);`

			`// We don't care about things like non-file-backed sections yet.`
			`if (Contents.size() != SecSize \|\| !SecSize)`
			`continue;`
			`uint64_t EndAddr = StartAddr + SecSize - 1;`

			`StringRef SecName; SI->getName(SecName);`

			`if (isText) {`
			`MCTextAtom *Text = Module->createTextAtom(StartAddr, EndAddr);`
			`Text->setName(SecName);`
			`uint64_t InstSize;`
			`for (uint64_t Index = 0; Index < SecSize; Index += InstSize) {`
			`MCInst Inst;`
			`if (Dis.getInstruction(Inst, InstSize, memoryObject, Index,`
			`nulls(), nulls()))`
			`Text->addInst(Inst, InstSize);`
			`else`
			`// We don't care about splitting mixed atoms either.`
			`llvm_unreachable("Couldn't disassemble instruction in atom.");`
			`}`

			`} else {`
			`MCDataAtom *Data = Module->createDataAtom(StartAddr, EndAddr);`
			`Data->setName(SecName);`
			`for (uint64_t Index = 0; Index < SecSize; ++Index)`
			`Data->addData(Contents[Index]);`
			`}`
			`}`
			`}`

			`namespace {`
			`struct BBInfo;`
			`typedef std::set<BBInfo*> BBInfoSetTy;`

			`struct BBInfo {`
			`MCTextAtom *Atom;`
			`MCBasicBlock *BB;`
			`BBInfoSetTy Succs;`
			`BBInfoSetTy Preds;`

			`void addSucc(BBInfo &Succ) {`
			`Succs.insert(&Succ);`
			`Succ.Preds.insert(this);`
			`}`
			`};`
			`}`

			`void MCObjectDisassembler::buildCFG(MCModule *Module) {`
			`typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;`
			`BBInfoByAddrTy BBInfos;`
			`typedef std::set<uint64_t> AddressSetTy;`
			`AddressSetTy Splits;`
			`AddressSetTy Calls;`

			`assert(Module->func_begin() == Module->func_end()`
			`&& "Module already has a CFG!");`

			`// First, determine the basic block boundaries and call targets.`
			`for (MCModule::atom_iterator AI = Module->atom_begin(),`
			`AE = Module->atom_end();`
			`AI != AE; ++AI) {`
			`MCTextAtom TA = dyn_cast<MCTextAtom>(AI);`
			`if (!TA) continue;`
			`Calls.insert(TA->getBeginAddr());`
Allow creation of single-byte MCAtoms. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@184344 91177308-0d34-0410-b5e6-96231b3b80d8 2013-06-19 20:18:59 +00:00			`BBInfos[TA->getBeginAddr()].Atom = TA;`
MC: Disassembled CFG reconstruction. This patch builds on some existing code to do CFG reconstruction from a disassembled binary: - MCModule represents the binary, and has a list of MCAtoms. - MCAtom represents either disassembled instructions (MCTextAtom), or contiguous data (MCDataAtom), and covers a specific range of addresses. - MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is backed by an MCTextAtom, and has the usual successors/predecessors. - MCObjectDisassembler creates a module from an ObjectFile using a disassembler. It first builds an atom for each section. It can also construct the CFG, and this splits the text atoms into basic blocks. MCModule and MCAtom were only sketched out; MCFunction and MCBB were implemented under the experimental "-cfg" llvm-objdump -macho option. This cleans them up for further use; llvm-objdump -d -cfg now generates graphviz files for each function found in the binary. In the future, MCObjectDisassembler may be the right place to do "intelligent" disassembly: for example, handling constant islands is just a matter of splitting the atom, using information that may be available in the ObjectFile. Also, better initial atom formation than just using sections is possible using symbols (and things like Mach-O's function_starts load command). This brings two minor regressions in llvm-objdump -macho -cfg: - The printing of a relocation's referenced symbol. - An annotation on loop BBs, i.e., which are their own successor. Relocation printing is replaced by the MCSymbolizer; the basic CFG annotation will be superseded by more related functionality. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8 2013-05-24 01:07:04 +00:00			`for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end();`
			`II != IE; ++II) {`
			`if (MIA.isTerminator(II->Inst))`
			`Splits.insert(II->Address + II->Size);`
			`uint64_t Target;`
			`if (MIA.evaluateBranch(II->Inst, II->Address, II->Size, Target)) {`
			`if (MIA.isCall(II->Inst))`
			`Calls.insert(Target);`
			`Splits.insert(Target);`
			`}`
			`}`
			`}`

			`// Split text atoms into basic block atoms.`
			`for (AddressSetTy::const_iterator SI = Splits.begin(), SE = Splits.end();`
			`SI != SE; ++SI) {`
			`MCAtom A = Module->findAtomContaining(SI);`
			`if (!A) continue;`
			`MCTextAtom *TA = cast<MCTextAtom>(A);`
			`if (TA->getBeginAddr() == *SI)`
			`continue;`
			`MCTextAtom NewAtom = TA->split(SI);`
			`BBInfos[NewAtom->getBeginAddr()].Atom = NewAtom;`
			`StringRef BBName = TA->getName();`
			`BBName = BBName.substr(0, BBName.find_last_of(':'));`
			`NewAtom->setName((BBName + ":" + utohexstr(*SI)).str());`
			`}`

			`// Compute succs/preds.`
			`for (MCModule::atom_iterator AI = Module->atom_begin(),`
			`AE = Module->atom_end();`
			`AI != AE; ++AI) {`
			`MCTextAtom TA = dyn_cast<MCTextAtom>(AI);`
			`if (!TA) continue;`
			`BBInfo &CurBB = BBInfos[TA->getBeginAddr()];`
			`const MCDecodedInst &LI = TA->back();`
			`if (MIA.isBranch(LI.Inst)) {`
			`uint64_t Target;`
			`if (MIA.evaluateBranch(LI.Inst, LI.Address, LI.Size, Target))`
			`CurBB.addSucc(BBInfos[Target]);`
			`if (MIA.isConditionalBranch(LI.Inst))`
			`CurBB.addSucc(BBInfos[LI.Address + LI.Size]);`
			`} else if (!MIA.isTerminator(LI.Inst))`
			`CurBB.addSucc(BBInfos[LI.Address + LI.Size]);`
			`}`


			`// Create functions and basic blocks.`
			`for (AddressSetTy::const_iterator CI = Calls.begin(), CE = Calls.end();`
			`CI != CE; ++CI) {`
			`BBInfo &BBI = BBInfos[*CI];`
			`if (!BBI.Atom) continue;`

			`MCFunction &MCFN = *Module->createFunction(BBI.Atom->getName());`

			`// Create MCBBs.`
			`SmallSetVector<BBInfo*, 16> Worklist;`
			`Worklist.insert(&BBI);`
			`for (size_t WI = 0; WI < Worklist.size(); ++WI) {`
			`BBInfo *BBI = Worklist[WI];`
			`if (!BBI->Atom)`
			`continue;`
			`BBI->BB = &MCFN.createBlock(*BBI->Atom);`
			`// Add all predecessors and successors to the worklist.`
			`for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end();`
			`SI != SE; ++SI)`
			`Worklist.insert(*SI);`
			`for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end();`
			`PI != PE; ++PI)`
			`Worklist.insert(*PI);`
			`}`

			`// Set preds/succs.`
			`for (size_t WI = 0; WI < Worklist.size(); ++WI) {`
			`BBInfo *BBI = Worklist[WI];`
			`MCBasicBlock *MCBB = BBI->BB;`
			`if (!MCBB)`
			`continue;`
			`for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end();`
			`SI != SE; ++SI)`
			`MCBB->addSuccessor((*SI)->BB);`
			`for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end();`
			`PI != PE; ++PI)`
			`MCBB->addPredecessor((*PI)->BB);`
			`}`
			`}`
			`}`