MC: Disassembled CFG reconstruction.

This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
  contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
  backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
  disassembler. It first builds an atom for each section. It can also
  construct the CFG, and this splits the text atoms into basic blocks.

MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.

In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).

This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., BBs which are their own successor.

Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.

git-svn-id: 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Ahmed Bougacha 2013-05-24 01:07:04 +00:00
parent 2c94d0faa0
commit ef99356dfe
22 changed files with 1049 additions and 685 deletions

View File

@ -1,4 +1,4 @@
//===-- llvm/MC/MCAtom.h - MCAtom class ---------------------*- C++ -*-===//
//===-- llvm/MC/MCAtom.h ----------------------------------------*- C++ -*-===//
// The LLVM Compiler Infrastructure
@ -9,7 +9,7 @@
// This file contains the declaration of the MCAtom class, which is used to
// represent a contiguous region in a decoded object that is uniformly data or
// instructions;
// instructions.
@ -24,45 +24,169 @@ namespace llvm {
class MCModule;
/// MCData - An entry in a data MCAtom.
// NOTE: This may change to a more complex type in the future.
typedef uint8_t MCData;
class MCAtom;
class MCTextAtom;
class MCDataAtom;
/// MCAtom - Represents a contiguous range of either instructions (a TextAtom)
/// or data (a DataAtom). Address ranges are expressed as _closed_ intervals.
class MCAtom {
friend class MCModule;
typedef enum { TextAtom, DataAtom } AtomType;
virtual ~MCAtom() {}
AtomType Type;
enum AtomKind { TextAtom, DataAtom };
AtomKind getKind() const { return Kind; }
/// \brief Get the start address of the atom.
uint64_t getBeginAddr() const { return Begin; }
/// \brief Get the end address, i.e. the last one inside the atom.
uint64_t getEndAddr() const { return End; }
/// \name Atom modification methods:
/// When modifying a TextAtom, keep instruction boundaries in mind.
/// For instance, split must be given the start address of an instruction.
/// @{
/// \brief Splits the atom in two at a given address.
/// \param SplitPt Address at which to start a new atom, splitting this one.
/// \returns The newly created atom starting at \p SplitPt.
virtual MCAtom *split(uint64_t SplitPt) = 0;
/// \brief Truncates an atom, discarding everything after \p TruncPt.
/// \param TruncPt Last byte address to be contained in this atom.
virtual void truncate(uint64_t TruncPt) = 0;
/// @}
/// \name Naming:
/// This is mostly for display purposes, and may contain anything that hints
/// at what the atom contains: section or symbol name, BB start address, ..
/// @{
StringRef getName() const { return Name; }
void setName(StringRef NewName) { Name = NewName.str(); }
/// @}
const AtomKind Kind;
std::string Name;
MCModule *Parent;
uint64_t Begin, End;
std::vector<std::pair<uint64_t, MCInst> > Text;
friend class MCModule;
MCAtom(AtomKind K, MCModule *P, uint64_t B, uint64_t E)
: Kind(K), Name("(unknown)"), Parent(P), Begin(B), End(E) { }
/// \name Atom remapping helpers
/// @{
/// \brief Remap the atom, using the given range, updating Begin/End.
/// One or both of the bounds can remain the same, but overlapping with other
/// atoms in the module is still forbidden.
void remap(uint64_t NewBegin, uint64_t NewEnd);
/// \brief Remap the atom to prepare for a truncation at TruncPt.
/// Equivalent to:
/// \code
/// // Bound checks
/// remap(Begin, TruncPt);
/// \endcode
void remapForTruncate(uint64_t TruncPt);
/// \brief Remap the atom to prepare for a split at SplitPt.
/// The bounds for the resulting atoms are returned in {L,R}{Begin,End}.
/// The current atom is truncated to \p LEnd.
void remapForSplit(uint64_t SplitPt,
uint64_t &LBegin, uint64_t &LEnd,
uint64_t &RBegin, uint64_t &REnd);
/// @}
/// \name Text atom
/// @{
/// \brief An entry in an MCTextAtom: a disassembled instruction.
/// NOTE: Both the Address and Size field are actually redundant when taken in
/// the context of the text atom, and may better be exposed in an iterator
/// instead of stored in the atom, which would replace this class.
class MCDecodedInst {
MCInst Inst;
uint64_t Address;
uint64_t Size;
MCDecodedInst(const MCInst &Inst, uint64_t Address, uint64_t Size)
: Inst(Inst), Address(Address), Size(Size) {}
/// \brief An atom consisting of disassembled instructions.
class MCTextAtom : public MCAtom {
typedef std::vector<MCDecodedInst> InstListTy;
InstListTy Insts;
/// \brief The address of the next appended instruction, i.e., the
/// address immediately after the last instruction in the atom.
uint64_t NextInstAddress;
/// Append an instruction, expanding the atom if necessary.
void addInst(const MCInst &Inst, uint64_t Size);
/// \name Instruction list access
/// @{
typedef InstListTy::const_iterator const_iterator;
const_iterator begin() const { return Insts.begin(); }
const_iterator end() const { return Insts.end(); }
const MCDecodedInst &back() const { return Insts.back(); }
/// \brief Get the \p n-th decoded instruction of this atom (0-based).
/// Forwards to std::vector::at, so \p n is range-checked against size().
const MCDecodedInst &at(size_t n) const { return Insts.at(n); }
uint64_t size() const { return Insts.size(); }
/// @}
/// \name Atom type specific split/truncate logic.
/// @{
MCTextAtom *split(uint64_t SplitPt) LLVM_OVERRIDE;
void truncate(uint64_t TruncPt) LLVM_OVERRIDE;
/// @}
// Class hierarchy.
static bool classof(const MCAtom *A) { return A->getKind() == TextAtom; }
friend class MCModule;
// Private constructor - only callable by MCModule
MCTextAtom(MCModule *P, uint64_t Begin, uint64_t End)
: MCAtom(TextAtom, P, Begin, End), NextInstAddress(Begin) {}
/// @}
/// \name Data atom
/// @{
/// \brief An entry in an MCDataAtom.
// NOTE: This may change to a more complex type in the future.
typedef uint8_t MCData;
/// \brief An atom consisting of a sequence of bytes.
class MCDataAtom : public MCAtom {
std::vector<MCData> Data;
// Private constructor - only callable by MCModule
MCAtom(AtomType T, MCModule *P, uint64_t B, uint64_t E)
: Type(T), Parent(P), Begin(B), End(E) { }
bool isTextAtom() const { return Type == TextAtom; }
bool isDataAtom() const { return Type == DataAtom; }
void addInst(const MCInst &I, uint64_t Address, unsigned Size);
/// Append a data entry, expanding the atom if necessary.
void addData(const MCData &D);
/// split - Splits the atom in two at a given address, which must align with
/// and instruction boundary if this is a TextAtom. Returns the newly created
/// atom representing the high part of the split.
MCAtom *split(uint64_t SplitPt);
/// \name Atom type specific split/truncate logic.
/// @{
MCDataAtom *split(uint64_t SplitPt) LLVM_OVERRIDE;
void truncate(uint64_t TruncPt) LLVM_OVERRIDE;
/// @}
/// truncate - Truncates an atom so that TruncPt is the last byte address
/// contained in the atom.
void truncate(uint64_t TruncPt);
// Class hierarchy.
static bool classof(const MCAtom *A) { return A->getKind() == DataAtom; }
friend class MCModule;
// Private constructor - only callable by MCModule
// Creates an empty data atom covering the closed range [Begin, End].
// Storage is reserved, not pre-sized: addData and split append to Data, so
// pre-filling it would place zero entries ahead of the real contents. A
// closed interval spans End + 1 - Begin addresses, hence the "+ 1".
MCDataAtom(MCModule *P, uint64_t Begin, uint64_t End)
    : MCAtom(DataAtom, P, Begin, End) { Data.reserve(End + 1 - Begin); }

View File

@ -0,0 +1,122 @@
//===-- llvm/MC/MCFunction.h ------------------------------------*- C++ -*-===//
// The LLVM Compiler Infrastructure
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
// This file defines the data structures to hold a CFG reconstructed from
// machine code.
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInst.h"
#include <string>
#include <vector>
namespace llvm {
class MCFunction;
class MCModule;
class MCTextAtom;
/// \brief Basic block containing a sequence of disassembled instructions.
/// The basic block is backed by an MCTextAtom, which holds the instructions,
/// and the address range it covers.
/// Create a basic block using MCFunction::createBlock.
class MCBasicBlock {
const MCTextAtom *Insts;
// MCFunction owns the basic block.
MCFunction *Parent;
friend class MCFunction;
MCBasicBlock(const MCTextAtom &Insts, MCFunction *Parent);
/// \name Predecessors/Successors, to represent the CFG.
/// @{
typedef std::vector<const MCBasicBlock *> BasicBlockListTy;
BasicBlockListTy Successors;
BasicBlockListTy Predecessors;
/// @}
/// \brief Get the backing MCTextAtom, containing the instruction sequence.
const MCTextAtom *getInsts() const { return Insts; }
/// \name Get the owning MCFunction.
/// @{
const MCFunction *getParent() const { return Parent; }
MCFunction *getParent() { return Parent; }
/// @}
/// \name MC CFG access: Predecessors/Successors.
/// @{
typedef BasicBlockListTy::const_iterator succ_const_iterator;
succ_const_iterator succ_begin() const { return Successors.begin(); }
succ_const_iterator succ_end() const { return Successors.end(); }
typedef BasicBlockListTy::const_iterator pred_const_iterator;
pred_const_iterator pred_begin() const { return Predecessors.begin(); }
pred_const_iterator pred_end() const { return Predecessors.end(); }
void addSuccessor(const MCBasicBlock *MCBB);
bool isSuccessor(const MCBasicBlock *MCBB) const;
void addPredecessor(const MCBasicBlock *MCBB);
bool isPredecessor(const MCBasicBlock *MCBB) const;
/// @}
/// \brief Represents a function in machine code, containing MCBasicBlocks.
/// MCFunctions are created using MCModule::createFunction.
class MCFunction {
MCFunction (const MCFunction&) LLVM_DELETED_FUNCTION;
MCFunction& operator=(const MCFunction&) LLVM_DELETED_FUNCTION;
std::string Name;
typedef std::vector<MCBasicBlock*> BasicBlockListTy;
BasicBlockListTy Blocks;
// MCModule owns the function.
friend class MCModule;
MCFunction(StringRef Name);
/// \brief Create an MCBasicBlock backed by Insts and add it to this function.
/// \param Insts Sequence of straight-line code backing the basic block.
/// \returns The newly created basic block.
MCBasicBlock &createBlock(const MCTextAtom &Insts);
StringRef getName() const { return Name; }
/// \name Access to the function's basic blocks. No ordering is enforced.
/// @{
/// \brief Get the entry point basic block.
const MCBasicBlock *getEntryBlock() const { return front(); }
MCBasicBlock *getEntryBlock() { return front(); }
// NOTE: Dereferencing iterators gives pointers, so maybe a list is best here.
typedef BasicBlockListTy::const_iterator const_iterator;
typedef BasicBlockListTy:: iterator iterator;
const_iterator begin() const { return Blocks.begin(); }
iterator begin() { return Blocks.begin(); }
const_iterator end() const { return Blocks.end(); }
iterator end() { return Blocks.end(); }
const MCBasicBlock* front() const { return Blocks.front(); }
MCBasicBlock* front() { return Blocks.front(); }
const MCBasicBlock* back() const { return Blocks.back(); }
MCBasicBlock* back() { return Blocks.back(); }
/// @}

View File

@ -52,10 +52,15 @@ public:
return Info->get(Inst.getOpcode()).isReturn();
virtual bool isTerminator(const MCInst &Inst) const {
return Info->get(Inst.getOpcode()).isTerminator();
/// evaluateBranch - Given a branch instruction try to get the address the
/// branch targets. Otherwise return -1.
virtual uint64_t
evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size) const;
/// branch targets. Return true on success, and the address in Target.
virtual bool
evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
uint64_t &Target) const;

View File

@ -15,44 +15,93 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IntervalMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
#include <vector>
namespace llvm {
class MCAtom;
class MCDataAtom;
class MCFunction;
class MCObjectDisassembler;
class MCTextAtom;
/// MCModule - This class represent a completely disassembled object file or
/// executable. It comprises a list of MCAtom's, and a branch target table.
/// Each atom represents a contiguous range of either instructions or data.
/// \brief A completely disassembled object file or executable.
/// It comprises a list of MCAtom's, each representing a contiguous range of
/// either instructions or data.
/// An MCModule is created using MCObjectDisassembler::buildModule.
class MCModule {
/// AtomAllocationTracker - An MCModule owns its component MCAtom's, so it
/// must track them in order to ensure they are properly freed as atoms are
/// merged or otherwise manipulated.
SmallPtrSet<MCAtom*, 8> AtomAllocationTracker;
/// \name Atom tracking
/// @{
/// OffsetMap - Efficiently maps offset ranges to MCAtom's.
IntervalMap<uint64_t, MCAtom*> OffsetMap;
/// BranchTargetMap - Maps offsets that are determined to be branches and
/// can be statically resolved to their target offsets.
DenseMap<uint64_t, MCAtom*> BranchTargetMap;
/// \brief Atoms in this module, sorted by begin address.
/// FIXME: This doesn't handle overlapping atoms (which happen when a basic
/// block starts in the middle of an instruction of another basic block.)
typedef std::vector<MCAtom*> AtomListTy;
AtomListTy Atoms;
friend class MCAtom;
/// remap - Update the interval mapping for an MCAtom.
/// \brief Remap \p Atom to the given range, and update its Begin/End fields.
/// \param Atom An atom belonging to this module.
/// An atom should always use this method to update its bounds, because this
/// enables the owning MCModule to keep track of its atoms.
void remap(MCAtom *Atom, uint64_t NewBegin, uint64_t NewEnd);
MCModule(IntervalMap<uint64_t, MCAtom*>::Allocator &A) : OffsetMap(A) { }
/// \brief Insert an atom in the module, using its Begin and End addresses.
void map(MCAtom *NewAtom);
/// @}
/// createAtom - Creates a new MCAtom covering the specified offset range.
MCAtom *createAtom(MCAtom::AtomType Type, uint64_t Begin, uint64_t End);
/// \name Function tracking
/// @{
typedef std::vector<MCFunction*> FunctionListTy;
FunctionListTy Functions;
/// @}
MCModule (const MCModule &) LLVM_DELETED_FUNCTION;
MCModule& operator=(const MCModule &) LLVM_DELETED_FUNCTION;
// MCObjectDisassembler creates MCModules.
friend class MCObjectDisassembler;
MCModule() : Atoms() { }
/// \name Create a new MCAtom covering the specified offset range.
/// @{
MCTextAtom *createTextAtom(uint64_t Begin, uint64_t End);
MCDataAtom *createDataAtom(uint64_t Begin, uint64_t End);
/// @}
/// \name Access to the owned atom list, ordered by begin address.
/// @{
const MCAtom *findAtomContaining(uint64_t Addr) const;
MCAtom *findAtomContaining(uint64_t Addr);
typedef AtomListTy::const_iterator const_atom_iterator;
typedef AtomListTy:: iterator atom_iterator;
const_atom_iterator atom_begin() const { return Atoms.begin(); }
atom_iterator atom_begin() { return Atoms.begin(); }
const_atom_iterator atom_end() const { return Atoms.end(); }
atom_iterator atom_end() { return Atoms.end(); }
/// @}
/// \name Create a new MCFunction.
MCFunction *createFunction(const StringRef &Name);
/// \name Access to the owned function list.
/// @{
typedef FunctionListTy::const_iterator const_func_iterator;
typedef FunctionListTy:: iterator func_iterator;
const_func_iterator func_begin() const { return Functions.begin(); }
func_iterator func_begin() { return Functions.begin(); }
const_func_iterator func_end() const { return Functions.end(); }
func_iterator func_end() { return Functions.end(); }
/// @}

View File

@ -0,0 +1,69 @@
//===-- llvm/MC/MCObjectDisassembler.h --------------------------*- C++ -*-===//
// The LLVM Compiler Infrastructure
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
// This file contains the declaration of the MCObjectDisassembler class, which
// can be used to construct an MCModule and an MC CFG from an ObjectFile.
namespace llvm {
namespace object {
class ObjectFile;
class MCBasicBlock;
class MCDisassembler;
class MCFunction;
class MCInstrAnalysis;
class MCModule;
/// \brief Disassemble an ObjectFile to an MCModule and MCFunctions.
/// This class builds on MCDisassembler to disassemble whole sections, creating
/// MCAtom (MCTextAtom for disassembled sections and MCDataAtom for raw data).
/// It can also be used to create a control flow graph consisting of MCFunctions
/// and MCBasicBlocks.
class MCObjectDisassembler {
const object::ObjectFile &Obj;
const MCDisassembler &Dis;
const MCInstrAnalysis &MIA;
MCObjectDisassembler(const object::ObjectFile &Obj,
const MCDisassembler &Dis,
const MCInstrAnalysis &MIA);
/// \brief Build an MCModule, creating atoms and optionally functions.
/// \param withCFG Also build a CFG by adding MCFunctions to the Module.
/// If withCFG is false, the MCModule built only contains atoms, representing
/// what was found in the object file. If withCFG is true, MCFunctions are
/// created, containing MCBasicBlocks. All text atoms are split to form basic
/// block atoms, which then each back an MCBasicBlock.
MCModule *buildModule(bool withCFG = false);
/// \brief Fill \p Module by creating an atom for each section.
/// This could be made much smarter, using information like symbols, but also
/// format-specific features, like Mach-O function_starts or data_in_code LCs.
void buildSectionAtoms(MCModule *Module);
/// \brief Enrich \p Module with a CFG consisting of MCFunctions.
/// \param Module An MCModule returned by buildModule, with no CFG.
/// NOTE: Each MCBasicBlock in a MCFunction is backed by a single MCTextAtom.
/// When the CFG is built, contiguous instructions that were previously in a
/// single MCTextAtom will be split in multiple basic block atoms.
void buildCFG(MCModule *Module);

View File

@ -0,0 +1,42 @@
//===- llvm/Support/StringRefMemoryObject.h ---------------------*- C++ -*-===//
// The LLVM Compiler Infrastructure
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
// This file contains the declaration of the StringRefMemoryObject class, a simple
// wrapper around StringRef implementing the MemoryObject interface.
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MemoryObject.h"
namespace llvm {
/// StringRefMemoryObject - Simple StringRef-backed MemoryObject
class StringRefMemoryObject : public MemoryObject {
StringRef Bytes;
uint64_t Base;
StringRefMemoryObject(StringRef Bytes, uint64_t Base = 0)
: Bytes(Bytes), Base(Base) {}
uint64_t getBase() const { return Base; }
uint64_t getExtent() const { return Bytes.size(); }
int readByte(uint64_t Addr, uint8_t *Byte) const;
int readBytes(uint64_t Addr, uint64_t Size,
uint8_t *Buf, uint64_t *Copied) const;

View File

@ -15,6 +15,7 @@ add_llvm_library(LLVMMC
@ -26,6 +27,7 @@ add_llvm_library(LLVMMC

View File

@ -10,88 +10,101 @@
#include "llvm/MC/MCAtom.h"
#include "llvm/MC/MCModule.h"
#include "llvm/Support/ErrorHandling.h"
#include <iterator>
using namespace llvm;
void MCAtom::addInst(const MCInst &I, uint64_t Address, unsigned Size) {
assert(Type == TextAtom && "Trying to add MCInst to a non-text atom!");
assert(Address < End+Size &&
"Instruction not contiguous with end of atom!");
if (Address > End)
Parent->remap(this, Begin, End+Size);
Text.push_back(std::make_pair(Address, I));
void MCAtom::remap(uint64_t NewBegin, uint64_t NewEnd) {
Parent->remap(this, NewBegin, NewEnd);
void MCAtom::addData(const MCData &D) {
assert(Type == DataAtom && "Trying to add MCData to a non-data atom!");
Parent->remap(this, Begin, End+1);
void MCAtom::remapForTruncate(uint64_t TruncPt) {
assert((TruncPt >= Begin && TruncPt < End) &&
"Truncation point not contained in atom!");
remap(Begin, TruncPt);
MCAtom *MCAtom::split(uint64_t SplitPt) {
void MCAtom::remapForSplit(uint64_t SplitPt,
uint64_t &LBegin, uint64_t &LEnd,
uint64_t &RBegin, uint64_t &REnd) {
assert((SplitPt > Begin && SplitPt <= End) &&
"Splitting at point not contained in atom!");
// Compute the new begin/end points.
uint64_t LeftBegin = Begin;
uint64_t LeftEnd = SplitPt - 1;
uint64_t RightBegin = SplitPt;
uint64_t RightEnd = End;
LBegin = Begin;
LEnd = SplitPt - 1;
RBegin = SplitPt;
REnd = End;
// Remap this atom to become the lower of the two new ones.
Parent->remap(this, LeftBegin, LeftEnd);
remap(LBegin, LEnd);
// Create a new atom for the higher atom.
MCAtom *RightAtom = Parent->createAtom(Type, RightBegin, RightEnd);
// MCDataAtom
// Split the contents of the original atom between it and the new one. The
// precise method depends on whether this is a data or a text atom.
if (isDataAtom()) {
std::vector<MCData>::iterator I = Data.begin() + (RightBegin - LeftBegin);
// Data-atom append path. The lines visible here are the range bookkeeping:
// when the stored data exceeds the bound below, the atom is remapped so that
// its end address moves up by one (End + 1).
// NOTE(review): Begin and End are uint64_t and atoms are closed intervals, so
// for any atom with Begin < End the subtraction `Begin - End` wraps around to
// a huge value and this branch is never taken. The intended bound looks like
// `End + 1 - Begin` (the number of addresses the atom covers) - confirm, and
// review it together with the initial size given to Data by the constructor,
// since Data.size() is what is being compared.
void MCDataAtom::addData(const MCData &D) {
if (Data.size() > Begin - End)
remap(Begin, End + 1);
assert(I != Data.end() && "Split point not found in range!");
void MCDataAtom::truncate(uint64_t TruncPt) {
std::copy(I, Data.end(), RightAtom->Data.end());
Data.erase(I, Data.end());
} else if (isTextAtom()) {
std::vector<std::pair<uint64_t, MCInst> >::iterator I = Text.begin();
Data.resize(TruncPt - Begin + 1);
while (I != Text.end() && I->first < SplitPt) ++I;
MCDataAtom *MCDataAtom::split(uint64_t SplitPt) {
uint64_t LBegin, LEnd, RBegin, REnd;
remapForSplit(SplitPt, LBegin, LEnd, RBegin, REnd);
assert(I != Text.end() && "Split point not found in disassembly!");
assert(I->first == SplitPt &&
"Split point does not fall on instruction boundary!");
MCDataAtom *RightAtom = Parent->createDataAtom(RBegin, REnd);
std::copy(I, Text.end(), RightAtom->Text.end());
Text.erase(I, Text.end());
} else
llvm_unreachable("Unknown atom type!");
std::vector<MCData>::iterator I = Data.begin() + (RBegin - LBegin);
assert(I != Data.end() && "Split point not found in range!");
std::copy(I, Data.end(), std::back_inserter(RightAtom->Data));
Data.erase(I, Data.end());
return RightAtom;
void MCAtom::truncate(uint64_t TruncPt) {
assert((TruncPt >= Begin && TruncPt < End) &&
"Truncation point not contained in atom!");
// MCTextAtom
Parent->remap(this, Begin, TruncPt);
if (isDataAtom()) {
Data.resize(TruncPt - Begin + 1);
} else if (isTextAtom()) {
std::vector<std::pair<uint64_t, MCInst> >::iterator I = Text.begin();
while (I != Text.end() && I->first <= TruncPt) ++I;
assert(I != Text.end() && "Truncation point not found in disassembly!");
assert(I->first == TruncPt+1 &&
"Truncation point does not fall on instruction boundary");
Text.erase(I, Text.end());
} else
llvm_unreachable("Unknown atom type!");
// Appends instruction I, occupying Size bytes, at NextInstAddress and advances
// NextInstAddress past it. The atom is expanded through remap when the new
// instruction does not fit in the current [Begin, End] range.
void MCTextAtom::addInst(const MCInst &I, uint64_t Size) {
  // Atom ranges are closed intervals, so what has to fit is the last byte of
  // the instruction, not its first one. Checking only the start address would
  // leave the tail of an instruction that straddles End outside the atom, and
  // would grow the atom by a single byte instead of the instruction's size.
  const uint64_t LastByte = NextInstAddress + Size - 1;
  if (LastByte > End)
    remap(Begin, LastByte);
  Insts.push_back(MCDecodedInst(I, NextInstAddress, Size));
  NextInstAddress += Size;
void MCTextAtom::truncate(uint64_t TruncPt) {
InstListTy::iterator I = Insts.begin();
while (I != Insts.end() && I->Address <= TruncPt) ++I;
assert(I != Insts.end() && "Truncation point not found in disassembly!");
assert(I->Address == TruncPt + 1 &&
"Truncation point does not fall on instruction boundary");
Insts.erase(I, Insts.end());
// Splits this text atom at SplitPt, which must be the address of one of its
// instructions. This atom keeps the instructions below SplitPt; the returned
// atom, created through the parent module, receives the remaining ones.
MCTextAtom *MCTextAtom::split(uint64_t SplitPt) {
  // Remap this atom to the lower range and fetch the bounds of both halves.
  uint64_t LoBegin, LoEnd, HiBegin, HiEnd;
  remapForSplit(SplitPt, LoBegin, LoEnd, HiBegin, HiEnd);

  MCTextAtom *Upper = Parent->createTextAtom(HiBegin, HiEnd);

  // Walk to the first instruction that does not start below the split point.
  InstListTy::iterator Cut = Insts.begin();
  for (; Cut != Insts.end(); ++Cut)
    if (!(Cut->Address < SplitPt))
      break;
  assert(Cut != Insts.end() && "Split point not found in disassembly!");
  assert(Cut->Address == SplitPt &&
         "Split point does not fall on instruction boundary!");

  // Append the upper instructions to the new atom, then drop them from here.
  Upper->Insts.insert(Upper->Insts.end(), Cut, Insts.end());
  Insts.erase(Cut, Insts.end());
  return Upper;

lib/MC/MCFunction.cpp Normal file
View File

@ -0,0 +1,55 @@
//===-- lib/MC/MCFunction.cpp -----------------------------------*- C++ -*-===//
// The LLVM Compiler Infrastructure
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
#include "llvm/MC/MCFunction.h"
#include "llvm/MC/MCAtom.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
using namespace llvm;
// MCFunction
MCFunction::MCFunction(StringRef Name)
: Name(Name)
// Frees every basic block stored in this function's block list.
MCFunction::~MCFunction() {
  for (BasicBlockListTy::size_type Idx = 0, N = Blocks.size(); Idx != N; ++Idx)
    delete Blocks[Idx];
// Allocates a basic block backed by TA with this function as its parent,
// records it at the end of the block list, and returns a reference to it.
MCBasicBlock &MCFunction::createBlock(const MCTextAtom &TA) {
  MCBasicBlock *NewBlock = new MCBasicBlock(TA, this);
  Blocks.push_back(NewBlock);
  return *NewBlock;
// MCBasicBlock
MCBasicBlock::MCBasicBlock(const MCTextAtom &Insts, MCFunction *Parent)
: Insts(&Insts), Parent(Parent)
void MCBasicBlock::addSuccessor(const MCBasicBlock *MCBB) {
bool MCBasicBlock::isSuccessor(const MCBasicBlock *MCBB) const {
return std::find(Successors.begin(), Successors.end(),
MCBB) != Successors.end();
void MCBasicBlock::addPredecessor(const MCBasicBlock *MCBB) {
bool MCBasicBlock::isPredecessor(const MCBasicBlock *MCBB) const {
return std::find(Predecessors.begin(), Predecessors.end(),
MCBB) != Predecessors.end();

View File

@ -10,12 +10,13 @@
#include "llvm/MC/MCInstrAnalysis.h"
using namespace llvm;
uint64_t MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
uint64_t Size) const {
bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
uint64_t Size, uint64_t &Target) const {
if (Inst.getNumOperands() == 0 ||
Info->get(Inst.getOpcode()).OpInfo[0].OperandType != MCOI::OPERAND_PCREL)
return -1ULL;
return false;
int64_t Imm = Inst.getOperand(0).getImm();
return Addr+Size+Imm;
Target = Addr+Size+Imm;
return true;

View File

@ -7,39 +7,92 @@
#include "llvm/MC/MCAtom.h"
#include "llvm/MC/MCModule.h"
#include "llvm/MC/MCAtom.h"
#include "llvm/MC/MCFunction.h"
#include <algorithm>
using namespace llvm;
MCAtom *MCModule::createAtom(MCAtom::AtomType Type,
uint64_t Begin, uint64_t End) {
// Ordering predicate passed to std::lower_bound over the atom list: an atom
// sorts before Addr when its (inclusive) end address lies below Addr.
static bool AtomComp(const MCAtom *L, uint64_t Addr) {
  return Addr > L->getEndAddr();
void MCModule::map(MCAtom *NewAtom) {
uint64_t Begin = NewAtom->Begin,
End = NewAtom->End;
assert(Begin < End && "Creating MCAtom with endpoints reversed?");
// Check for atoms already covering this range.
IntervalMap<uint64_t, MCAtom*>::iterator I = OffsetMap.find(Begin);
assert((!I.valid() || I.start() < End) && "Offset range already occupied!");
AtomListTy::iterator I = std::lower_bound(atom_begin(), atom_end(),
Begin, AtomComp);
assert((I == atom_end() || (*I)->getBeginAddr() > End)
&& "Offset range already occupied!");
// Create the new atom and add it to our maps.
MCAtom *NewAtom = new MCAtom(Type, this, Begin, End);
OffsetMap.insert(Begin, End, NewAtom);
// Insert the new atom to the list.
Atoms.insert(I, NewAtom);
MCTextAtom *MCModule::createTextAtom(uint64_t Begin, uint64_t End) {
MCTextAtom *NewAtom = new MCTextAtom(this, Begin, End);
return NewAtom;
MCDataAtom *MCModule::createDataAtom(uint64_t Begin, uint64_t End) {
MCDataAtom *NewAtom = new MCDataAtom(this, Begin, End);
return NewAtom;
// remap - Update the interval mapping for an atom.
void MCModule::remap(MCAtom *Atom, uint64_t NewBegin, uint64_t NewEnd) {
// Find and erase the old mapping.
IntervalMap<uint64_t, MCAtom*>::iterator I = OffsetMap.find(Atom->Begin);
assert(I.valid() && "Atom offset not found in module!");
AtomListTy::iterator I = std::lower_bound(atom_begin(), atom_end(),
Atom->Begin, AtomComp);
assert(I != atom_end() && "Atom offset not found in module!");
assert(*I == Atom && "Previous atom mapping was invalid!");
// Insert the new mapping.
OffsetMap.insert(NewBegin, NewEnd, Atom);
AtomListTy::iterator NewI = std::lower_bound(atom_begin(), atom_end(),
NewBegin, AtomComp);
Atoms.insert(NewI, Atom);
// Update the atom internal bounds.
Atom->Begin = NewBegin;
Atom->End = NewEnd;
// Looks up the atom whose closed range [Begin, End] contains Addr.
// Returns null when no atom in the list covers that address.
const MCAtom *MCModule::findAtomContaining(uint64_t Addr) const {
  // lower_bound with AtomComp gives the first atom ending at or after Addr;
  // that atom is a hit only if it also begins at or before Addr.
  AtomListTy::const_iterator Candidate =
      std::lower_bound(atom_begin(), atom_end(), Addr, AtomComp);
  if (Candidate == atom_end() || Addr < (*Candidate)->getBeginAddr())
    return 0;
  return *Candidate;
// Non-const lookup of the atom whose closed range [Begin, End] contains Addr.
// Returns null when no atom in the list covers that address.
MCAtom *MCModule::findAtomContaining(uint64_t Addr) {
  // lower_bound with AtomComp gives the first atom ending at or after Addr;
  // that atom is a hit only if it also begins at or before Addr.
  AtomListTy::iterator Candidate =
      std::lower_bound(atom_begin(), atom_end(), Addr, AtomComp);
  if (Candidate == atom_end() || Addr < (*Candidate)->getBeginAddr())
    return 0;
  return *Candidate;
// Allocates a new MCFunction called Name, appends it to the module's function
// list, and returns it.
MCFunction *MCModule::createFunction(const StringRef &Name) {
  MCFunction *NewFunction = new MCFunction(Name);
  Functions.push_back(NewFunction);
  return NewFunction;
// Frees everything held in the module's lists: first all atoms, then all
// functions.
MCModule::~MCModule() {
  for (AtomListTy::size_type Idx = 0, N = Atoms.size(); Idx != N; ++Idx)
    delete Atoms[Idx];
  for (FunctionListTy::size_type Idx = 0, N = Functions.size(); Idx != N; ++Idx)
    delete Functions[Idx];

View File

@ -0,0 +1,216 @@
//===- lib/MC/MCObjectDisassembler.cpp ------------------------------------===//
// The LLVM Compiler Infrastructure
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
#include "llvm/MC/MCObjectDisassembler.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAtom.h"
#include "llvm/MC/MCDisassembler.h"
#include "llvm/MC/MCFunction.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCModule.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/StringRefMemoryObject.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
#include <set>
using namespace llvm;
using namespace object;
MCObjectDisassembler::MCObjectDisassembler(const ObjectFile &Obj,
const MCDisassembler &Dis,
const MCInstrAnalysis &MIA)
: Obj(Obj), Dis(Dis), MIA(MIA) {}
MCModule *MCObjectDisassembler::buildModule(bool withCFG) {
MCModule *Module = new MCModule;
if (withCFG)
return Module;
void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) {
error_code ec;
for (section_iterator SI = Obj.begin_sections(),
SE = Obj.end_sections();
SI != SE;
SI.increment(ec)) {
if (ec) break;
bool isText; SI->isText(isText);
bool isData; SI->isData(isData);
if (!isData && !isText)
uint64_t StartAddr; SI->getAddress(StartAddr);
uint64_t SecSize; SI->getSize(SecSize);
if (StartAddr == UnknownAddressOrSize || SecSize == UnknownAddressOrSize)
StringRef Contents; SI->getContents(Contents);
StringRefMemoryObject memoryObject(Contents);
// We don't care about things like non-file-backed sections yet.
if (Contents.size() != SecSize || !SecSize)
uint64_t EndAddr = StartAddr + SecSize - 1;
StringRef SecName; SI->getName(SecName);
if (isText) {
MCTextAtom *Text = Module->createTextAtom(StartAddr, EndAddr);
uint64_t InstSize;
for (uint64_t Index = 0; Index < SecSize; Index += InstSize) {
MCInst Inst;
if (Dis.getInstruction(Inst, InstSize, memoryObject, Index,
nulls(), nulls()))
Text->addInst(Inst, InstSize);
// We don't care about splitting mixed atoms either.
llvm_unreachable("Couldn't disassemble instruction in atom.");
} else {
MCDataAtom *Data = Module->createDataAtom(StartAddr, EndAddr);
for (uint64_t Index = 0; Index < SecSize; ++Index)
namespace {
struct BBInfo;
typedef std::set<BBInfo*> BBInfoSetTy;
struct BBInfo {
MCTextAtom *Atom;
MCBasicBlock *BB;
BBInfoSetTy Succs;
BBInfoSetTy Preds;
void addSucc(BBInfo &Succ) {
/// Reconstruct functions and basic blocks in \p Module from its text atoms.
///
/// Visible phases: (1) scan every instruction to collect split addresses and
/// call targets, (2) split text atoms at the collected addresses, (3) compute
/// successor/predecessor edges between the resulting atoms, (4) create an
/// MCFunction for each call target and give it MCBasicBlocks.
///
/// The module must not already contain functions (asserted below).
///
/// NOTE(review): this listing has lost a number of lines (guarded statements,
/// loop bodies and closing braces). The NOTE(review) comments below mark the
/// gaps; confirm each against the original file.
void MCObjectDisassembler::buildCFG(MCModule *Module) {
typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
// Per-block records, keyed by the begin address of the block's atom.
BBInfoByAddrTy BBInfos;
typedef std::set<uint64_t> AddressSetTy;
// Addresses at which text atoms get split in the second phase.
AddressSetTy Splits;
// Addresses used as function entry points in the last phase.
AddressSetTy Calls;
assert(Module->func_begin() == Module->func_end()
&& "Module already has a CFG!");
// First, determine the basic block boundaries and call targets.
for (MCModule::atom_iterator AI = Module->atom_begin(),
AE = Module->atom_end();
AI != AE; ++AI) {
MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
// Data atoms carry no instructions to analyse.
if (!TA) continue;
for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end();
II != IE; ++II) {
// A terminator ends its block, so the next address starts a new one.
if (MIA.isTerminator(II->Inst))
Splits.insert(II->Address + II->Size);
uint64_t Target;
if (MIA.evaluateBranch(II->Inst, II->Address, II->Size, Target)) {
// NOTE(review): the statements that consume Target (presumably inserting
// it into Calls and/or Splits) are missing from this listing.
if (MIA.isCall(II->Inst))
// Split text atoms into basic block atoms.
for (AddressSetTy::const_iterator SI = Splits.begin(), SE = Splits.end();
SI != SE; ++SI) {
MCAtom *A = Module->findAtomContaining(*SI);
// No atom covers this address; nothing to split.
if (!A) continue;
MCTextAtom *TA = cast<MCTextAtom>(A);
BBInfos[TA->getBeginAddr()].Atom = TA;
// NOTE(review): guarded statement (presumably `continue;`, since an atom
// that already begins at the split address needs no split) is missing.
if (TA->getBeginAddr() == *SI)
MCTextAtom *NewAtom = TA->split(*SI);
BBInfos[NewAtom->getBeginAddr()].Atom = NewAtom;
// Name the new atom "<prefix>:<hex split address>", where <prefix> is the
// original name up to (not including) its last ':'.
StringRef BBName = TA->getName();
BBName = BBName.substr(0, BBName.find_last_of(':'));
NewAtom->setName((BBName + ":" + utohexstr(*SI)).str());
// Compute succs/preds.
for (MCModule::atom_iterator AI = Module->atom_begin(),
AE = Module->atom_end();
AI != AE; ++AI) {
MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
if (!TA) continue;
BBInfo &CurBB = BBInfos[TA->getBeginAddr()];
// Only the last instruction of an atom decides where control goes next.
const MCDecodedInst &LI = TA->back();
if (MIA.isBranch(LI.Inst)) {
uint64_t Target;
// NOTE(review): the statement guarded by this check (presumably adding the
// block at Target as a successor) is missing from this listing.
if (MIA.evaluateBranch(LI.Inst, LI.Address, LI.Size, Target))
// A conditional branch can also fall through to the next address.
if (MIA.isConditionalBranch(LI.Inst))
CurBB.addSucc(BBInfos[LI.Address + LI.Size]);
} else if (!MIA.isTerminator(LI.Inst))
// Not a terminator: control falls through to the next address.
CurBB.addSucc(BBInfos[LI.Address + LI.Size]);
// Create functions and basic blocks.
for (AddressSetTy::const_iterator CI = Calls.begin(), CE = Calls.end();
CI != CE; ++CI) {
BBInfo &BBI = BBInfos[*CI];
// Skip call targets for which no atom was recorded.
if (!BBI.Atom) continue;
// The function takes the name of the atom at its entry address.
MCFunction &MCFN = *Module->createFunction(BBI.Atom->getName());
// Create MCBBs.
// NOTE(review): the statement seeding Worklist (presumably with &BBI) is
// missing; as listed the loops below never execute.
SmallSetVector<BBInfo*, 16> Worklist;
// Index-based loop because the worklist is meant to grow while iterating.
for (size_t WI = 0; WI < Worklist.size(); ++WI) {
BBInfo *BBI = Worklist[WI];
// NOTE(review): guarded statement (presumably `continue;`) is missing; as
// listed, createBlock would be reached only when Atom is null.
if (!BBI->Atom)
BBI->BB = &MCFN.createBlock(*BBI->Atom);
// Add all predecessors and successors to the worklist.
// NOTE(review): both loop bodies are missing from this listing.
for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end();
SI != SE; ++SI)
for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end();
PI != PE; ++PI)
// Set preds/succs.
for (size_t WI = 0; WI < Worklist.size(); ++WI) {
BBInfo *BBI = Worklist[WI];
MCBasicBlock *MCBB = BBI->BB;
// NOTE(review): guarded statement (presumably `continue;`) is missing.
if (!MCBB)
// NOTE(review): both loop bodies (presumably registering the matching
// MCBasicBlocks as successors/predecessors of MCBB) are missing.
for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end();
SI != SE; ++SI)
for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end();
PI != PE; ++PI)

View File

@ -48,6 +48,7 @@ add_llvm_library(LLVMSupport

View File

@ -0,0 +1,34 @@
//===- lib/Support/StringRefMemoryObject.cpp --------------------*- C++ -*-===//
// The LLVM Compiler Infrastructure
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
#include "llvm/Support/StringRefMemoryObject.h"
using namespace llvm;
/// Read the single byte at address \p Addr into \p *Byte.
///
/// \returns 0 on success, -1 if \p Addr is outside [Base, Base + getExtent()).
///
/// NOTE(review): the closing brace of this definition is missing from this
/// listing.
int StringRefMemoryObject::readByte(uint64_t Addr, uint8_t *Byte) const {
// Reject addresses outside the range covered by the backing string.
if (Addr >= Base + getExtent() || Addr < Base)
return -1;
// Bytes is indexed relative to Base, not by absolute address.
*Byte = Bytes[Addr - Base];
return 0;
int StringRefMemoryObject::readBytes(uint64_t Addr,
uint64_t Size,
uint8_t *Buf,
uint64_t *Copied) const {
if (Addr >= Base + getExtent() || Addr < Base)
return -1;
uint64_t Offset = Addr - Base;
if (Size > getExtent() - Offset)
Size = getExtent() - Offset;
memcpy(Buf, + Offset, Size);
if (Copied)
*Copied = Size;
return 0;

View File

@ -136,17 +136,17 @@ public:
return MCInstrAnalysis::isConditionalBranch(Inst);
uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr,
uint64_t Size) const {
bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
uint64_t Size, uint64_t &Target) const {
unsigned LblOperand = Inst.getOpcode() == AArch64::Bcc ? 1 : 0;
// FIXME: We only handle PCRel branches for now.
if (Info->get(Inst.getOpcode()).OpInfo[LblOperand].OperandType
return -1ULL;
return false;
int64_t Imm = Inst.getOperand(LblOperand).getImm();
return Addr + Imm;
Target = Addr + Imm;
return true;

View File

@ -240,15 +240,16 @@ public:
return MCInstrAnalysis::isConditionalBranch(Inst);
uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr,
uint64_t Size) const {
bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
uint64_t Size, uint64_t &Target) const {
// We only handle PCRel branches for now.
if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
return -1ULL;
return false;
int64_t Imm = Inst.getOperand(0).getImm();
// FIXME: This is not right for thumb.
return Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
Target = Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
return true;

View File

@ -12,5 +12,4 @@ add_llvm_tool(llvm-objdump

View File

@ -1,138 +0,0 @@
//===-- MCFunction.cpp ----------------------------------------------------===//
// The LLVM Compiler Infrastructure
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
// This file defines the algorithm to break down a region of machine code
// into basic blocks and try to reconstruct a CFG from it.
#include "MCFunction.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/system_error.h"
#include <set>
using namespace llvm;
MCFunction::createFunctionFromMC(StringRef Name, const MCDisassembler *DisAsm,
const MemoryObject &Region, uint64_t Start,
uint64_t End, const MCInstrAnalysis *Ana,
raw_ostream &DebugOut,
SmallVectorImpl<uint64_t> &Calls) {
std::vector<MCDecodedInst> Instructions;
std::set<uint64_t> Splits;
uint64_t Size;
MCFunction f(Name);
DenseSet<uint64_t> VisitedInsts;
SmallVector<uint64_t, 16> WorkList;
// Disassemble code and gather basic block split points.
while (!WorkList.empty()) {
uint64_t Index = WorkList.pop_back_val();
if (VisitedInsts.find(Index) != VisitedInsts.end())
continue; // Already visited this location.
for (;Index < End; Index += Size) {
MCInst Inst;
if (DisAsm->getInstruction(Inst, Size, Region, Index, DebugOut, nulls())){
Instructions.push_back(MCDecodedInst(Index, Size, Inst));
if (Ana->isBranch(Inst)) {
uint64_t targ = Ana->evaluateBranch(Inst, Index, Size);
if (targ != -1ULL && targ == Index+Size)
continue; // Skip nop jumps.
// If we could determine the branch target, make a note to start a
// new basic block there and add the target to the worklist.
if (targ != -1ULL) {
} else if (Ana->isReturn(Inst)) {
// Return instruction. This basic block ends here.
} else if (Ana->isCall(Inst)) {
uint64_t targ = Ana->evaluateBranch(Inst, Index, Size);
// Add the call to the call list if the destination is known.
if (targ != -1ULL && targ != Index+Size)
} else {
errs().write_hex(Index) << ": warning: invalid instruction encoding\n";
if (Size == 0)
Size = 1; // skip illegible bytes
// Make sure the instruction list is sorted.
std::sort(Instructions.begin(), Instructions.end());
// Create basic blocks.
unsigned ii = 0, ie = Instructions.size();
for (std::set<uint64_t>::iterator spi = Splits.begin(),
spe = llvm::prior(Splits.end()); spi != spe; ++spi) {
MCBasicBlock BB;
uint64_t BlockEnd = *llvm::next(spi);
// Add instructions to the BB.
for (; ii != ie; ++ii) {
if (Instructions[ii].Address < *spi ||
Instructions[ii].Address >= BlockEnd)
f.addBlock(*spi, BB);
std::sort(f.Blocks.begin(), f.Blocks.end());
// Calculate successors of each block.
for (MCFunction::iterator i = f.begin(), e = f.end(); i != e; ++i) {
MCBasicBlock &BB = const_cast<MCBasicBlock&>(i->second);
if (BB.getInsts().empty()) continue;
const MCDecodedInst &Inst = BB.getInsts().back();
if (Ana->isBranch(Inst.Inst)) {
uint64_t targ = Ana->evaluateBranch(Inst.Inst, Inst.Address, Inst.Size);
if (targ == -1ULL) {
// Indirect branch. Bail and add all blocks of the function as a
// successor.
for (MCFunction::iterator i = f.begin(), e = f.end(); i != e; ++i)
} else if (targ != Inst.Address+Inst.Size)
// Conditional branches can also fall through to the next block.
if (Ana->isConditionalBranch(Inst.Inst) && llvm::next(i) != e)
} else {
// No branch. Fall through to the next block.
if (!Ana->isReturn(Inst.Inst) && llvm::next(i) != e)
return f;

View File

@ -1,100 +0,0 @@
//===-- MCFunction.h ------------------------------------------------------===//
// The LLVM Compiler Infrastructure
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
// This file defines the data structures to hold a CFG reconstructed from
// machine code.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/MC/MCInst.h"
#include <map>
namespace llvm {
class MCDisassembler;
class MCInstrAnalysis;
class MemoryObject;
class raw_ostream;
/// MCDecodedInst - Small container to hold an MCInst and associated info like
/// address and size.
struct MCDecodedInst {
uint64_t Address;
uint64_t Size;
MCInst Inst;
MCDecodedInst() {}
MCDecodedInst(uint64_t Address, uint64_t Size, MCInst Inst)
: Address(Address), Size(Size), Inst(Inst) {}
bool operator<(const MCDecodedInst &RHS) const {
return Address < RHS.Address;
/// MCBasicBlock - Consists of multiple MCDecodedInsts and a list of successing
/// MCBasicBlocks.
class MCBasicBlock {
std::vector<MCDecodedInst> Insts;
typedef DenseSet<uint64_t> SetTy;
SetTy Succs;
ArrayRef<MCDecodedInst> getInsts() const { return Insts; }
typedef SetTy::const_iterator succ_iterator;
succ_iterator succ_begin() const { return Succs.begin(); }
succ_iterator succ_end() const { return Succs.end(); }
bool contains(uint64_t Addr) const { return Succs.count(Addr); }
void addInst(const MCDecodedInst &Inst) { Insts.push_back(Inst); }
void addSucc(uint64_t Addr) { Succs.insert(Addr); }
bool operator<(const MCBasicBlock &RHS) const {
return Insts.size() < RHS.Insts.size();
/// MCFunction - Represents a named function in machine code, containing
/// multiple MCBasicBlocks.
class MCFunction {
const StringRef Name;
// Keep BBs sorted by address.
typedef std::vector<std::pair<uint64_t, MCBasicBlock> > MapTy;
MapTy Blocks;
MCFunction(StringRef Name) : Name(Name) {}
// Create an MCFunction from a region of binary machine code.
static MCFunction
createFunctionFromMC(StringRef Name, const MCDisassembler *DisAsm,
const MemoryObject &Region, uint64_t Start, uint64_t End,
const MCInstrAnalysis *Ana, raw_ostream &DebugOut,
SmallVectorImpl<uint64_t> &Calls);
typedef MapTy::const_iterator iterator;
iterator begin() const { return Blocks.begin(); }
iterator end() const { return Blocks.end(); }
StringRef getName() const { return Name; }
MCBasicBlock &addBlock(uint64_t Address, const MCBasicBlock &BB) {
Blocks.push_back(std::make_pair(Address, BB));
return Blocks.back().second;

View File

@ -12,9 +12,9 @@
#include "llvm-objdump.h"
#include "MCFunction.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/DebugInfo/DIContext.h"
#include "llvm/MC/MCAsmInfo.h"
@ -43,10 +43,6 @@
using namespace llvm;
using namespace object;
static cl::opt<bool>
CFG("cfg", cl::desc("Create a CFG for every symbol in the object file and"
" write it to a graphviz file (MachO-only)"));
static cl::opt<bool>
UseDbg("g", cl::desc("Print line information from debug info if available"));
@ -91,99 +87,6 @@ struct SymbolSorter {
// Print additional information about an address, if available.
static void DumpAddress(uint64_t Address, ArrayRef<SectionRef> Sections,
const MachOObjectFile *MachOObj, raw_ostream &OS) {
for (unsigned i = 0; i != Sections.size(); ++i) {
uint64_t SectAddr = 0, SectSize = 0;
uint64_t addr = SectAddr;
if (SectAddr <= Address &&
SectAddr + SectSize > Address) {
StringRef bytes, name;
// Print constant strings.
if (!"__cstring"))
OS << '"' << bytes.substr(addr, bytes.find('\0', addr)) << '"';
// Print constant CFStrings.
if (!"__cfstring"))
OS << "@\"" << bytes.substr(addr, bytes.find('\0', addr)) << '"';
typedef std::map<uint64_t, MCFunction*> FunctionMapTy;
typedef SmallVector<MCFunction, 16> FunctionListTy;
static void createMCFunctionAndSaveCalls(StringRef Name,
const MCDisassembler *DisAsm,
MemoryObject &Object, uint64_t Start,
uint64_t End,
MCInstrAnalysis *InstrAnalysis,
uint64_t Address,
raw_ostream &DebugOut,
FunctionMapTy &FunctionMap,
FunctionListTy &Functions) {
SmallVector<uint64_t, 16> Calls;
MCFunction f =
MCFunction::createFunctionFromMC(Name, DisAsm, Object, Start, End,
InstrAnalysis, DebugOut, Calls);
FunctionMap[Address] = &Functions.back();
// Add the gathered callees to the map.
for (unsigned i = 0, e = Calls.size(); i != e; ++i)
FunctionMap.insert(std::make_pair(Calls[i], (MCFunction*)0));
// Write a graphviz file for the CFG inside an MCFunction.
static void emitDOTFile(const char *FileName, const MCFunction &f,
MCInstPrinter *IP) {
// Start a new dot file.
std::string Error;
raw_fd_ostream Out(FileName, Error);
if (!Error.empty()) {
errs() << "llvm-objdump: warning: " << Error << '\n';
Out << "digraph " << f.getName() << " {\n";
Out << "graph [ rankdir = \"LR\" ];\n";
for (MCFunction::iterator i = f.begin(), e = f.end(); i != e; ++i) {
bool hasPreds = false;
// Only print blocks that have predecessors.
// FIXME: Slow.
for (MCFunction::iterator pi = f.begin(), pe = f.end(); pi != pe;
if (pi->second.contains(i->first)) {
hasPreds = true;
if (!hasPreds && i != f.begin())
Out << '"' << i->first << "\" [ label=\"<a>";
// Print instructions.
for (unsigned ii = 0, ie = i->second.getInsts().size(); ii != ie;
++ii) {
// Escape special chars and print the instruction in mnemonic form.
std::string Str;
raw_string_ostream OS(Str);
IP->printInst(&i->second.getInsts()[ii].Inst, OS, "");
Out << DOT::EscapeString(OS.str()) << '|';
Out << "<o>\" shape=\"record\" ];\n";
// Add edges.
for (MCBasicBlock::succ_iterator si = i->second.succ_begin(),
se = i->second.succ_end(); si != se; ++si)
Out << i->first << ":o -> " << *si <<":a\n";
Out << "}\n";
static void
getSectionsAndSymbols(const macho::Header Header,
MachOObjectFile *MachOObj,
@ -272,6 +175,12 @@ static void DisassembleInputMachO2(StringRef Filename,
macho::Header Header = MachOOF->getHeader();
// FIXME: FoundFns isn't used anymore. Using symbols/LC_FUNCTION_STARTS to
// determine function locations will eventually go in MCObjectDisassembler.
// FIXME: Using the -cfg command line option, this code used to be able to
// annotate relocations with the referenced symbol's name, and if this was
// inside a __[cf]string section, the data it points to. This is now replaced
// by the upcoming MCSymbolizer, which needs the appropriate setup done above.
std::vector<SectionRef> Sections;
std::vector<SymbolRef> Symbols;
SmallVector<uint64_t, 8> FoundFns;
@ -308,31 +217,24 @@ static void DisassembleInputMachO2(StringRef Filename,
FunctionMapTy FunctionMap;
FunctionListTy Functions;
for (unsigned SectIdx = 0; SectIdx != Sections.size(); SectIdx++) {
bool SectIsText = false;
if (SectIsText == false)
StringRef SectName;
if (Sections[SectIdx].getName(SectName) ||
SectName != "__text")
continue; // Skip non-text sections
DataRefImpl DR = Sections[SectIdx].getRawDataRefImpl();
StringRef SegmentName = MachOOF->getSectionFinalSegmentName(DR);
if (SegmentName != "__TEXT")
// Insert the functions from the function starts segment into our map.
uint64_t VMAddr;
for (unsigned i = 0, e = FoundFns.size(); i != e; ++i) {
StringRef SectBegin;
uint64_t Offset = (uint64_t);
FunctionMap.insert(std::make_pair(VMAddr + FoundFns[i]-Offset,
StringRef Bytes;
StringRefMemoryObject memoryObject(Bytes);
@ -403,52 +305,39 @@ static void DisassembleInputMachO2(StringRef Filename,
symbolTableWorked = true;
if (!CFG) {
// Normal disassembly, print addresses, bytes and mnemonic form.
StringRef SymName;
outs() << SymName << ":\n";
DILineInfo lastLine;
for (uint64_t Index = Start; Index < End; Index += Size) {
MCInst Inst;
outs() << SymName << ":\n";
DILineInfo lastLine;
for (uint64_t Index = Start; Index < End; Index += Size) {
MCInst Inst;
if (DisAsm->getInstruction(Inst, Size, memoryObject, Index,
DebugOut, nulls())) {
uint64_t SectAddress = 0;
outs() << format("%8" PRIx64 ":\t", SectAddress + Index);
if (DisAsm->getInstruction(Inst, Size, memoryObject, Index,
DebugOut, nulls())) {
uint64_t SectAddress = 0;
outs() << format("%8" PRIx64 ":\t", SectAddress + Index);
DumpBytes(StringRef( + Index, Size));
IP->printInst(&Inst, outs(), "");
DumpBytes(StringRef( + Index, Size));
IP->printInst(&Inst, outs(), "");
// Print debug info.
if (diContext) {
DILineInfo dli =
diContext->getLineInfoForAddress(SectAddress + Index);
// Print valid line info if it changed.
if (dli != lastLine && dli.getLine() != 0)
outs() << "\t## " << dli.getFileName() << ':'
<< dli.getLine() << ':' << dli.getColumn();
lastLine = dli;
outs() << "\n";
} else {
errs() << "llvm-objdump: warning: invalid instruction encoding\n";
if (Size == 0)
Size = 1; // skip illegible bytes
// Print debug info.
if (diContext) {
DILineInfo dli =
diContext->getLineInfoForAddress(SectAddress + Index);
// Print valid line info if it changed.
if (dli != lastLine && dli.getLine() != 0)
outs() << "\t## " << dli.getFileName() << ':'
<< dli.getLine() << ':' << dli.getColumn();
lastLine = dli;
outs() << "\n";
} else {
errs() << "llvm-objdump: warning: invalid instruction encoding\n";
if (Size == 0)
Size = 1; // skip illegible bytes
} else {
// Create CFG and use it for disassembly.
StringRef SymName;
SymName, DisAsm.get(), memoryObject, Start, End,
InstrAnalysis.get(), Start, DebugOut, FunctionMap, Functions);
if (!CFG && !symbolTableWorked) {
if (!symbolTableWorked) {
// Reading the symbol table didn't work, disassemble the whole section.
uint64_t SectAddress;
@ -471,142 +360,5 @@ static void DisassembleInputMachO2(StringRef Filename,
if (CFG) {
if (!symbolTableWorked) {
// Reading the symbol table didn't work, create a big __TEXT symbol.
uint64_t SectSize = 0, SectAddress = 0;
createMCFunctionAndSaveCalls("__TEXT", DisAsm.get(), memoryObject,
0, SectSize,
SectAddress, DebugOut,
FunctionMap, Functions);
for (std::map<uint64_t, MCFunction*>::iterator mi = FunctionMap.begin(),
me = FunctionMap.end(); mi != me; ++mi)
if (mi->second == 0) {
// Create functions for the remaining callees we have gathered,
// but we didn't find a name for them.
uint64_t SectSize = 0;
SmallVector<uint64_t, 16> Calls;
MCFunction f =
MCFunction::createFunctionFromMC("unknown", DisAsm.get(),
memoryObject, mi->first,
InstrAnalysis.get(), DebugOut,
mi->second = &Functions.back();
for (unsigned i = 0, e = Calls.size(); i != e; ++i) {
std::pair<uint64_t, MCFunction*> p(Calls[i], (MCFunction*)0);
if (FunctionMap.insert(p).second)
mi = FunctionMap.begin();
DenseSet<uint64_t> PrintedBlocks;
for (unsigned ffi = 0, ffe = Functions.size(); ffi != ffe; ++ffi) {
MCFunction &f = Functions[ffi];
for (MCFunction::iterator fi = f.begin(), fe = f.end(); fi != fe; ++fi){
if (!PrintedBlocks.insert(fi->first).second)
continue; // We already printed this block.
// We assume a block has predecessors when it's the first block after
// a symbol.
bool hasPreds = FunctionMap.find(fi->first) != FunctionMap.end();
// See if this block has predecessors.
// FIXME: Slow.
for (MCFunction::iterator pi = f.begin(), pe = f.end(); pi != pe;
if (pi->second.contains(fi->first)) {
hasPreds = true;
uint64_t SectSize = 0, SectAddress;
// No predecessors, this is a data block. Print as .byte directives.
if (!hasPreds) {
uint64_t End = llvm::next(fi) == fe ? SectSize :
outs() << "# " << End-fi->first << " bytes of data:\n";
for (unsigned pos = fi->first; pos != End; ++pos) {
outs() << format("%8x:\t", SectAddress + pos);
DumpBytes(StringRef( + pos, 1));
outs() << format("\t.byte 0x%02x\n", (uint8_t)Bytes[pos]);
if (fi->second.contains(fi->first)) // Print a header for simple loops
outs() << "# Loop begin:\n";
DILineInfo lastLine;
// Walk over the instructions and print them.
for (unsigned ii = 0, ie = fi->second.getInsts().size(); ii != ie;
++ii) {
const MCDecodedInst &Inst = fi->second.getInsts()[ii];
// If there's a symbol at this address, print its name.
if (FunctionMap.find(SectAddress + Inst.Address) !=
outs() << FunctionMap[SectAddress + Inst.Address]-> getName()
<< ":\n";
outs() << format("%8" PRIx64 ":\t", SectAddress + Inst.Address);
DumpBytes(StringRef( + Inst.Address, Inst.Size));
if (fi->second.contains(fi->first)) // Indent simple loops.
outs() << '\t';
IP->printInst(&Inst.Inst, outs(), "");
// Look for relocations inside this instructions, if there is one
// print its target and additional information if available.
for (unsigned j = 0; j != Relocs.size(); ++j)
if (Relocs[j].first >= SectAddress + Inst.Address &&
Relocs[j].first < SectAddress + Inst.Address + Inst.Size) {
StringRef SymName;
uint64_t Addr;
outs() << "\t# " << SymName << ' ';
DumpAddress(Addr, Sections, MachOOF, outs());
// If this instructions contains an address, see if we can evaluate
// it and print additional information.
uint64_t targ = InstrAnalysis->evaluateBranch(Inst.Inst,
if (targ != -1ULL)
DumpAddress(targ, Sections, MachOOF, outs());
// Print debug info.
if (diContext) {
DILineInfo dli =
diContext->getLineInfoForAddress(SectAddress + Inst.Address);
// Print valid line info if it changed.
if (dli != lastLine && dli.getLine() != 0)
outs() << "\t## " << dli.getFileName() << ':'
<< dli.getLine() << ':' << dli.getColumn();
lastLine = dli;
outs() << '\n';
emitDOTFile((f.getName().str() + ".dot").c_str(), f, IP.get());

View File

@ -17,22 +17,26 @@
#include "llvm-objdump.h"
#include "MCFunction.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAtom.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler.h"
#include "llvm/MC/MCFunction.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCModule.h"
#include "llvm/MC/MCObjectDisassembler.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectSymbolizer.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCRelocationInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/COFF.h"
#include "llvm/Object/MachO.h"
@ -131,6 +135,10 @@ static cl::opt<bool>
Symbolize("symbolize", cl::desc("When disassembling instructions, "
"try to symbolize operands."));
static cl::opt<bool>
CFG("cfg", cl::desc("Create a CFG for every function found in the object"
" and write it to a graphviz file"));
static StringRef ToolName;
bool llvm::error(error_code ec) {
@ -169,7 +177,51 @@ static const Target *getTarget(const ObjectFile *Obj = NULL) {
return TheTarget;
void llvm::StringRefMemoryObject::anchor() { }
// Write a graphviz file for the CFG inside an MCFunction.
static void emitDOTFile(const char *FileName, const MCFunction &f,
MCInstPrinter *IP) {
// Start a new dot file.
std::string Error;
raw_fd_ostream Out(FileName, Error);
if (!Error.empty()) {
errs() << "llvm-objdump: warning: " << Error << '\n';
Out << "digraph \"" << f.getName() << "\" {\n";
Out << "graph [ rankdir = \"LR\" ];\n";
for (MCFunction::const_iterator i = f.begin(), e = f.end(); i != e; ++i) {
// Only print blocks that have predecessors.
bool hasPreds = (*i)->pred_begin() != (*i)->pred_end();
if (!hasPreds && i != f.begin())
Out << '"' << (*i)->getInsts()->getBeginAddr() << "\" [ label=\"<a>";
// Print instructions.
for (unsigned ii = 0, ie = (*i)->getInsts()->size(); ii != ie;
++ii) {
if (ii != 0) // Not the first line, start a new row.
Out << '|';
if (ii + 1 == ie) // Last line, add an end id.
Out << "<o>";
// Escape special chars and print the instruction in mnemonic form.
std::string Str;
raw_string_ostream OS(Str);
IP->printInst(&(*i)->getInsts()->at(ii).Inst, OS, "");
Out << DOT::EscapeString(OS.str());
Out << "\" shape=\"record\" ];\n";
// Add edges.
for (MCBasicBlock::succ_const_iterator si = (*i)->succ_begin(),
se = (*i)->succ_end(); si != se; ++si)
Out << (*i)->getInsts()->getBeginAddr() << ":o -> "
<< (*si)->getInsts()->getBeginAddr() << ":a\n";
Out << "}\n";
void llvm::DumpBytes(StringRef bytes) {
static const char hex_rep[] = "0123456789abcdef";
@ -269,6 +321,9 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
OwningPtr<const MCInstrAnalysis>
int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
OwningPtr<MCInstPrinter> IP(TheTarget->createMCInstPrinter(
AsmPrinterVariant, *AsmInfo, *MII, *MRI, *STI));
@ -278,6 +333,34 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
if (CFG) {
OwningPtr<MCObjectDisassembler> OD(
new MCObjectDisassembler(*Obj, *DisAsm, *MIA));
OwningPtr<MCModule> Mod(OD->buildModule(/* withCFG */ true));
for (MCModule::const_atom_iterator AI = Mod->atom_begin(),
AE = Mod->atom_end();
AI != AE; ++AI) {
outs() << "Atom " << (*AI)->getName() << ": \n";
if (const MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI)) {
for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end();
II != IE;
++II) {
IP->printInst(&II->Inst, outs(), "");
outs() << "\n";
for (MCModule::const_func_iterator FI = Mod->func_begin(),
FE = Mod->func_end();
FI != FE; ++FI) {
static int filenum = 0;
emitDOTFile((Twine((*FI)->getName()) + "_" +
utostr(filenum++) + ".dot").str().c_str(),
**FI, IP.get());
error_code ec;
for (section_iterator i = Obj->begin_sections(),
e = Obj->end_sections();

View File

@ -13,7 +13,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/StringRefMemoryObject.h"
namespace llvm {
@ -35,25 +35,6 @@ void DisassembleInputMachO(StringRef Filename);
void printCOFFUnwindInfo(const object::COFFObjectFile* o);
void printELFFileHeader(const object::ObjectFile *o);
class StringRefMemoryObject : public MemoryObject {
virtual void anchor();
StringRef Bytes;
uint64_t Base;
StringRefMemoryObject(StringRef bytes, uint64_t Base = 0)
: Bytes(bytes), Base(Base) {}
uint64_t getBase() const { return Base; }
uint64_t getExtent() const { return Bytes.size(); }
int readByte(uint64_t Addr, uint8_t *Byte) const {
if (Addr >= Base + getExtent() || Addr < Base)
return -1;
*Byte = Bytes[Addr - Base];
return 0;