2011-01-20 06:39:06 +00:00
|
|
|
//===-- llvm-objdump.cpp - Object file dumping utility for llvm -----------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This program is a utility that works like binutils "objdump", that is, it
|
|
|
|
// dumps out a plethora of information about an object file depending on the
|
|
|
|
// flags.
|
|
|
|
//
|
2013-02-05 20:27:22 +00:00
|
|
|
// The flags and output of this program should be near identical to those of
|
|
|
|
// binutils objdump.
|
|
|
|
//
|
2011-01-20 06:39:06 +00:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2011-09-19 17:56:04 +00:00
|
|
|
#include "llvm-objdump.h"
|
2012-12-04 10:44:52 +00:00
|
|
|
#include "llvm/ADT/STLExtras.h"
|
2011-10-17 17:13:22 +00:00
|
|
|
#include "llvm/ADT/StringExtras.h"
|
2011-01-20 06:39:06 +00:00
|
|
|
#include "llvm/ADT/Triple.h"
|
|
|
|
#include "llvm/MC/MCAsmInfo.h"
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
#include "llvm/MC/MCAtom.h"
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
#include "llvm/MC/MCContext.h"
|
2011-01-20 06:39:06 +00:00
|
|
|
#include "llvm/MC/MCDisassembler.h"
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
#include "llvm/MC/MCFunction.h"
|
2011-01-20 06:39:06 +00:00
|
|
|
#include "llvm/MC/MCInst.h"
|
|
|
|
#include "llvm/MC/MCInstPrinter.h"
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
#include "llvm/MC/MCInstrAnalysis.h"
|
2012-04-02 06:09:36 +00:00
|
|
|
#include "llvm/MC/MCInstrInfo.h"
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
#include "llvm/MC/MCModule.h"
|
MC CFG: Add YAML MCModule representation to enable MC CFG testing.
Like yaml ObjectFiles, this will be very useful for testing the MC CFG
implementation (mostly MCObjectDisassembler), by matching the output
with YAML, and for potential users of the MC CFG, by using it as an input.
There isn't much to the actual format, it is just a serialization of the
MCModule class. Of note:
- Basic block references (pred/succ, ..) are represented by the BB's
start address.
- Just as in the MC CFG, instructions are MCInsts with a size.
- Operands have a prefix representing the type (only register and
immediate supported here).
- Instruction opcodes are represented by their names; enum values aren't
stable, enum names mostly are: usually, a change to a name would need
lots of changes in the backend anyway.
Same with registers.
All in all, an example is better than 1000 words, here goes:
A simple binary:
Disassembly of section __TEXT,__text:
_main:
100000f9c: 48 8b 46 08 movq 8(%rsi), %rax
100000fa0: 0f be 00 movsbl (%rax), %eax
100000fa3: 3b 04 25 48 00 00 00 cmpl 72, %eax
100000faa: 0f 8c 07 00 00 00 jl 7 <.Lend>
100000fb0: 2b 04 25 48 00 00 00 subl 72, %eax
.Lend:
100000fb7: c3 ret
And the (pretty verbose) generated YAML:
---
Atoms:
- StartAddress: 0x0000000100000F9C
Size: 20
Type: Text
Content:
- Inst: MOV64rm
Size: 4
Ops: [ RRAX, RRSI, I1, R, I8, R ]
- Inst: MOVSX32rm8
Size: 3
Ops: [ REAX, RRAX, I1, R, I0, R ]
- Inst: CMP32rm
Size: 7
Ops: [ REAX, R, I1, R, I72, R ]
- Inst: JL_4
Size: 6
Ops: [ I7 ]
- StartAddress: 0x0000000100000FB0
Size: 7
Type: Text
Content:
- Inst: SUB32rm
Size: 7
Ops: [ REAX, REAX, R, I1, R, I72, R ]
- StartAddress: 0x0000000100000FB7
Size: 1
Type: Text
Content:
- Inst: RET
Size: 1
Ops: [ ]
Functions:
- Name: __text
BasicBlocks:
- Address: 0x0000000100000F9C
Preds: [ ]
Succs: [ 0x0000000100000FB7, 0x0000000100000FB0 ]
<snip>
...
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188890 91177308-0d34-0410-b5e6-96231b3b80d8
2013-08-21 07:29:02 +00:00
|
|
|
#include "llvm/MC/MCModuleYAML.h"
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
#include "llvm/MC/MCObjectDisassembler.h"
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
#include "llvm/MC/MCObjectFileInfo.h"
|
|
|
|
#include "llvm/MC/MCObjectSymbolizer.h"
|
2012-03-05 19:33:20 +00:00
|
|
|
#include "llvm/MC/MCRegisterInfo.h"
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
#include "llvm/MC/MCRelocationInfo.h"
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
#include "llvm/MC/MCSubtargetInfo.h"
|
2012-12-04 10:44:52 +00:00
|
|
|
#include "llvm/Object/Archive.h"
|
|
|
|
#include "llvm/Object/COFF.h"
|
Add a function to get the segment name of a section.
On MachO, sections also have segment names. When a tool looking at a .o file
prints a segment name, this is what they mean. In reality, a .o has only one
anonymous segment.
This patch adds a MachO only function to fetch that segment name. I named it
getSectionFinalSegmentName since the main use for the name seems to be to inform
the linker which segment this section should go to.
The patch also changes MachOObjectFile::getSectionName to return just the
section name instead of computing SegmentName,SectionName.
The main difference from the previous patch is that it doesn't use
InMemoryStruct. It is extremely dangerous: if the endians match it returns
a pointer to the file buffer, if not, it returns a pointer to an internal buffer
that is overwritten in the next API call.
We should change all of this code to use
support::detail::packed_endian_specific_integral like ELF, but since these
functions only handle strings, they work with big and little endian machines
as is.
I have tested this by installing ubuntu 12.10 ppc on qemu, that is why it took
so long :-)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170838 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-21 03:47:03 +00:00
|
|
|
#include "llvm/Object/MachO.h"
|
2012-12-04 10:44:52 +00:00
|
|
|
#include "llvm/Object/ObjectFile.h"
|
2011-10-08 00:18:30 +00:00
|
|
|
#include "llvm/Support/Casting.h"
|
2011-01-20 06:39:06 +00:00
|
|
|
#include "llvm/Support/CommandLine.h"
|
|
|
|
#include "llvm/Support/Debug.h"
|
2011-10-08 00:18:30 +00:00
|
|
|
#include "llvm/Support/FileSystem.h"
|
2011-01-20 06:39:06 +00:00
|
|
|
#include "llvm/Support/Format.h"
|
2011-07-25 23:04:36 +00:00
|
|
|
#include "llvm/Support/GraphWriter.h"
|
2011-01-20 06:39:06 +00:00
|
|
|
#include "llvm/Support/Host.h"
|
|
|
|
#include "llvm/Support/ManagedStatic.h"
|
|
|
|
#include "llvm/Support/MemoryBuffer.h"
|
|
|
|
#include "llvm/Support/MemoryObject.h"
|
|
|
|
#include "llvm/Support/PrettyStackTrace.h"
|
|
|
|
#include "llvm/Support/Signals.h"
|
|
|
|
#include "llvm/Support/SourceMgr.h"
|
2011-08-24 18:08:43 +00:00
|
|
|
#include "llvm/Support/TargetRegistry.h"
|
|
|
|
#include "llvm/Support/TargetSelect.h"
|
2011-01-20 06:39:06 +00:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
#include "llvm/Support/system_error.h"
|
|
|
|
#include <algorithm>
|
2012-03-23 11:49:32 +00:00
|
|
|
#include <cctype>
|
2011-01-20 06:39:06 +00:00
|
|
|
#include <cstring>
|
MC CFG: Add YAML MCModule representation to enable MC CFG testing.
Like yaml ObjectFiles, this will be very useful for testing the MC CFG
implementation (mostly MCObjectDisassembler), by matching the output
with YAML, and for potential users of the MC CFG, by using it as an input.
There isn't much to the actual format, it is just a serialization of the
MCModule class. Of note:
- Basic block references (pred/succ, ..) are represented by the BB's
start address.
- Just as in the MC CFG, instructions are MCInsts with a size.
- Operands have a prefix representing the type (only register and
immediate supported here).
- Instruction opcodes are represented by their names; enum values aren't
stable, enum names mostly are: usually, a change to a name would need
lots of changes in the backend anyway.
Same with registers.
All in all, an example is better than 1000 words, here goes:
A simple binary:
Disassembly of section __TEXT,__text:
_main:
100000f9c: 48 8b 46 08 movq 8(%rsi), %rax
100000fa0: 0f be 00 movsbl (%rax), %eax
100000fa3: 3b 04 25 48 00 00 00 cmpl 72, %eax
100000faa: 0f 8c 07 00 00 00 jl 7 <.Lend>
100000fb0: 2b 04 25 48 00 00 00 subl 72, %eax
.Lend:
100000fb7: c3 ret
And the (pretty verbose) generated YAML:
---
Atoms:
- StartAddress: 0x0000000100000F9C
Size: 20
Type: Text
Content:
- Inst: MOV64rm
Size: 4
Ops: [ RRAX, RRSI, I1, R, I8, R ]
- Inst: MOVSX32rm8
Size: 3
Ops: [ REAX, RRAX, I1, R, I0, R ]
- Inst: CMP32rm
Size: 7
Ops: [ REAX, R, I1, R, I72, R ]
- Inst: JL_4
Size: 6
Ops: [ I7 ]
- StartAddress: 0x0000000100000FB0
Size: 7
Type: Text
Content:
- Inst: SUB32rm
Size: 7
Ops: [ REAX, REAX, R, I1, R, I72, R ]
- StartAddress: 0x0000000100000FB7
Size: 1
Type: Text
Content:
- Inst: RET
Size: 1
Ops: [ ]
Functions:
- Name: __text
BasicBlocks:
- Address: 0x0000000100000F9C
Preds: [ ]
Succs: [ 0x0000000100000FB7, 0x0000000100000FB0 ]
<snip>
...
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188890 91177308-0d34-0410-b5e6-96231b3b80d8
2013-08-21 07:29:02 +00:00
|
|
|
|
2011-01-20 06:39:06 +00:00
|
|
|
using namespace llvm;
|
|
|
|
using namespace object;
|
|
|
|
|
2011-09-19 17:56:04 +00:00
|
|
|
// Positional command-line arguments naming the input object files.
// cl::ZeroOrMore allows the list to be empty.
static cl::list<std::string>
|
|
|
|
InputFilenames(cl::Positional, cl::desc("<input object files>"),cl::ZeroOrMore);
|
2011-01-20 06:39:06 +00:00
|
|
|
|
2011-09-19 17:56:04 +00:00
|
|
|
// Boolean flag "disassemble": per its description, requests display of the
// assembler mnemonics for the machine instructions.
static cl::opt<bool>
|
|
|
|
Disassemble("disassemble",
|
|
|
|
cl::desc("Display assembler mnemonics for the machine instructions"));
|
|
|
|
// Short spelling "d"; cl::aliasopt forwards it to Disassemble.
static cl::alias
|
|
|
|
Disassembled("d", cl::desc("Alias for --disassemble"),
|
|
|
|
cl::aliasopt(Disassemble));
|
2011-01-20 06:39:06 +00:00
|
|
|
|
2011-10-08 00:18:30 +00:00
|
|
|
// Boolean flag "r": per its description, requests display of the relocation
// entries in the file.
static cl::opt<bool>
|
|
|
|
Relocations("r", cl::desc("Display the relocation entries in the file"));
|
|
|
|
|
2011-10-17 17:13:22 +00:00
|
|
|
// Boolean flag "s": per its description, requests display of the content of
// each section.
static cl::opt<bool>
|
|
|
|
SectionContents("s", cl::desc("Display the content of each section"));
|
|
|
|
|
2011-10-18 19:32:17 +00:00
|
|
|
// Boolean flag "t": per its description, requests display of the symbol table.
static cl::opt<bool>
|
|
|
|
SymbolTable("t", cl::desc("Display the symbol table"));
|
|
|
|
|
2011-09-19 17:56:04 +00:00
|
|
|
// Boolean flag "macho": per its description, selects the MachO specific
// object file parser.
static cl::opt<bool>
|
Add a function to get the segment name of a section.
On MachO, sections also have segment names. When a tool looking at a .o file
prints a segment name, this is what they mean. In reality, a .o has only one
anonymous, segment.
This patch adds a MachO only function to fetch that segment name. I named it
getSectionFinalSegmentName since the main use for the name seems to be inform
the linker with segment this section should go to.
The patch also changes MachOObjectFile::getSectionName to return just the
section name instead of computing SegmentName,SectionName.
The main difference from the previous patch is that it doesn't use
InMemoryStruct. It is extremely dangerous: if the endians match it returns
a pointer to the file buffer, if not, it returns a pointer to an internal buffer
that is overwritten in the next API call.
We should change all of this code to use
support::detail::packed_endian_specific_integral like ELF, but since these
functions only handle strings, they work with big and little endian machines
as is.
I have tested this by installing ubuntu 12.10 ppc on qemu, that is why it took
so long :-)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170838 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-21 03:47:03 +00:00
|
|
|
MachOOpt("macho", cl::desc("Use MachO specific object file parser"));
|
2011-09-19 17:56:04 +00:00
|
|
|
// Short spelling "m"; cl::aliasopt forwards it to MachOOpt.
static cl::alias
|
Add a function to get the segment name of a section.
On MachO, sections also have segment names. When a tool looking at a .o file
prints a segment name, this is what they mean. In reality, a .o has only one
anonymous, segment.
This patch adds a MachO only function to fetch that segment name. I named it
getSectionFinalSegmentName since the main use for the name seems to be inform
the linker with segment this section should go to.
The patch also changes MachOObjectFile::getSectionName to return just the
section name instead of computing SegmentName,SectionName.
The main difference from the previous patch is that it doesn't use
InMemoryStruct. It is extremely dangerous: if the endians match it returns
a pointer to the file buffer, if not, it returns a pointer to an internal buffer
that is overwritten in the next API call.
We should change all of this code to use
support::detail::packed_endian_specific_integral like ELF, but since these
functions only handle strings, they work with big and little endian machines
as is.
I have tested this by installing ubuntu 12.10 ppc on qemu, that is why it took
so long :-)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170838 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-21 03:47:03 +00:00
|
|
|
MachOm("m", cl::desc("Alias for --macho"), cl::aliasopt(MachOOpt));
|
2011-07-20 19:37:35 +00:00
|
|
|
|
2011-09-19 17:56:04 +00:00
|
|
|
// String option "triple": the target triple to disassemble for.
// Unlike the static options in this file, this one is defined with the
// qualified name llvm::TripleName and is not static.
// NOTE(review): presumably declared in llvm-objdump.h for use by other
// translation units -- confirm.
cl::opt<std::string>
|
|
|
|
llvm::TripleName("triple", cl::desc("Target triple to disassemble for, "
|
|
|
|
"see -version for available targets"));
|
2011-01-20 06:39:06 +00:00
|
|
|
|
2011-09-19 17:56:04 +00:00
|
|
|
// String option "arch": the target architecture to disassemble for.
// Defined with the qualified name llvm::ArchName and is not static.
// NOTE(review): presumably declared in llvm-objdump.h for use by other
// translation units -- confirm.
cl::opt<std::string>
|
|
|
|
llvm::ArchName("arch", cl::desc("Target arch to disassemble for, "
|
|
|
|
"see -version for available targets"));
|
2011-01-20 06:39:06 +00:00
|
|
|
|
2011-10-10 21:21:34 +00:00
|
|
|
// Boolean flag "section-headers": per its description, requests a summary of
// the headers for each section.
static cl::opt<bool>
|
|
|
|
SectionHeaders("section-headers", cl::desc("Display summaries of the headers "
|
|
|
|
"for each section."));
|
|
|
|
// Alternate spelling "headers"; cl::aliasopt forwards it to SectionHeaders.
static cl::alias
|
|
|
|
SectionHeadersShort("headers", cl::desc("Alias for --section-headers"),
|
|
|
|
cl::aliasopt(SectionHeaders));
|
|
|
|
// Short spelling "h"; cl::aliasopt forwards it to SectionHeaders.
static cl::alias
|
|
|
|
SectionHeadersShorter("h", cl::desc("Alias for --section-headers"),
|
|
|
|
cl::aliasopt(SectionHeaders));
|
|
|
|
|
2012-08-28 19:24:49 +00:00
|
|
|
// List option "mattr": target specific attributes. cl::CommaSeparated lets a
// single argument carry several comma-separated values; the value description
// shows the expected form "a1,+a2,-a3,...".
static cl::list<std::string>
|
|
|
|
MAttrs("mattr",
|
|
|
|
cl::CommaSeparated,
|
|
|
|
cl::desc("Target specific attributes"),
|
|
|
|
cl::value_desc("a1,+a2,-a3,..."));
|
|
|
|
|
2012-11-20 22:57:02 +00:00
|
|
|
// Boolean flag "no-show-raw-insn": per its description, suppresses printing of
// the instruction bytes when disassembling.
static cl::opt<bool>
|
|
|
|
NoShowRawInsn("no-show-raw-insn", cl::desc("When disassembling instructions, "
|
|
|
|
"do not print the instruction bytes."));
|
|
|
|
|
2012-12-05 20:12:35 +00:00
|
|
|
// Boolean flag "unwind-info": per its description, requests display of unwind
// information.
static cl::opt<bool>
|
|
|
|
UnwindInfo("unwind-info", cl::desc("Display unwind information"));
|
|
|
|
|
|
|
|
// Short spelling "u"; cl::aliasopt forwards it to UnwindInfo.
static cl::alias
|
|
|
|
UnwindInfoShort("u", cl::desc("Alias for --unwind-info"),
|
|
|
|
cl::aliasopt(UnwindInfo));
|
|
|
|
|
2013-01-06 03:56:49 +00:00
|
|
|
// Boolean flag "private-headers": per its description, requests display of
// format specific file headers.
static cl::opt<bool>
|
|
|
|
PrivateHeaders("private-headers",
|
|
|
|
cl::desc("Display format specific file headers"));
|
|
|
|
|
|
|
|
// Short spelling "p"; cl::aliasopt forwards it to PrivateHeaders.
static cl::alias
|
|
|
|
PrivateHeadersShort("p", cl::desc("Alias for --private-headers"),
|
|
|
|
cl::aliasopt(PrivateHeaders));
|
|
|
|
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
// --symbolize: when set, DisassembleObject sets up an MCContext and an object
// symbolizer before disassembling.
static cl::opt<bool> Symbolize(
    "symbolize",
    cl::desc("When disassembling "
             "instructions, try to symbolize operands."));
|
|
|
|
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
// --cfg: boolean switch requesting a graphviz CFG dump for each function.
static cl::opt<bool> CFG(
    "cfg",
    cl::desc("Create a CFG for every function found in the "
             "object and write it to a graphviz file"));
|
|
|
|
|
MC CFG: Add YAML MCModule representation to enable MC CFG testing.
Like yaml ObjectFiles, this will be very useful for testing the MC CFG
implementation (mostly MCObjectDisassembler), by matching the output
with YAML, and for potential users of the MC CFG, by using it as an input.
There isn't much to the actual format, it is just a serialization of the
MCModule class. Of note:
- Basic block references (pred/succ, ..) are represented by the BB's
start address.
- Just as in the MC CFG, instructions are MCInsts with a size.
- Operands have a prefix representing the type (only register and
immediate supported here).
- Instruction opcodes are represented by their names; enum values aren't
stable, enum names mostly are: usually, a change to a name would need
lots of changes in the backend anyway.
Same with registers.
All in all, an example is better than 1000 words, here goes:
A simple binary:
Disassembly of section __TEXT,__text:
_main:
100000f9c: 48 8b 46 08 movq 8(%rsi), %rax
100000fa0: 0f be 00 movsbl (%rax), %eax
100000fa3: 3b 04 25 48 00 00 00 cmpl 72, %eax
100000faa: 0f 8c 07 00 00 00 jl 7 <.Lend>
100000fb0: 2b 04 25 48 00 00 00 subl 72, %eax
.Lend:
100000fb7: c3 ret
And the (pretty verbose) generated YAML:
---
Atoms:
- StartAddress: 0x0000000100000F9C
Size: 20
Type: Text
Content:
- Inst: MOV64rm
Size: 4
Ops: [ RRAX, RRSI, I1, R, I8, R ]
- Inst: MOVSX32rm8
Size: 3
Ops: [ REAX, RRAX, I1, R, I0, R ]
- Inst: CMP32rm
Size: 7
Ops: [ REAX, R, I1, R, I72, R ]
- Inst: JL_4
Size: 6
Ops: [ I7 ]
- StartAddress: 0x0000000100000FB0
Size: 7
Type: Text
Content:
- Inst: SUB32rm
Size: 7
Ops: [ REAX, REAX, R, I1, R, I72, R ]
- StartAddress: 0x0000000100000FB7
Size: 1
Type: Text
Content:
- Inst: RET
Size: 1
Ops: [ ]
Functions:
- Name: __text
BasicBlocks:
- Address: 0x0000000100000F9C
Preds: [ ]
Succs: [ 0x0000000100000FB7, 0x0000000100000FB0 ]
<snip>
...
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188890 91177308-0d34-0410-b5e6-96231b3b80d8
2013-08-21 07:29:02 +00:00
|
|
|
// FIXME: Does it make sense to have a dedicated tool for yaml cfg output?
|
|
|
|
static cl::opt<std::string>
|
|
|
|
YAMLCFG("yaml-cfg",
|
|
|
|
cl::desc("Create a CFG and write it as a YAML MCModule."),
|
|
|
|
cl::value_desc("yaml output file"));
|
|
|
|
|
2011-09-19 17:56:04 +00:00
|
|
|
// Name of this tool; used as the prefix of the diagnostics printed by
// llvm::error() and getTarget() below.
static StringRef ToolName;
|
2011-06-25 17:55:23 +00:00
|
|
|
|
2014-01-25 00:32:01 +00:00
|
|
|
/// Report \p EC if it represents a failure.
///
/// When \p EC is set, a "<tool>: error reading file: <message>." line is
/// written to outs() and the stream is flushed.
///
/// \returns true if \p EC held an error (and was reported), false otherwise.
bool llvm::error(error_code EC) {
  if (EC) {
    outs() << ToolName << ": error reading file: " << EC.message() << ".\n";
    outs().flush();
    return true;
  }
  return false;
}
|
|
|
|
|
2012-08-07 17:53:14 +00:00
|
|
|
/// Look up the Target to use for disassembly.
///
/// If no triple was given on the command line (TripleName is empty), the
/// triple starts as "unknown-unknown-unknown" and, when \p Obj is non-null,
/// its architecture is taken from the object file (and the object format is
/// set to Mach-O for Mach-O files). Otherwise the normalized command-line
/// triple is used.
///
/// On success TripleName is overwritten with the triple that was used.
///
/// \param Obj optional object file used to infer the architecture.
/// \returns the matching Target, or nullptr after printing the registry's
///          error message to errs() if the lookup failed.
static const Target *getTarget(const ObjectFile *Obj = nullptr) {
  // Figure out the target triple.
  llvm::Triple TheTriple("unknown-unknown-unknown");
  if (TripleName.empty()) {
    if (Obj) {
      TheTriple.setArch(Triple::ArchType(Obj->getArch()));
      // TheTriple defaults to ELF, and COFF doesn't have an environment:
      // the best we can do here is indicate that it is mach-o.
      if (Obj->isMachO())
        TheTriple.setObjectFormat(Triple::MachO);
    }
  } else
    TheTriple.setTriple(Triple::normalize(TripleName));

  // Get the target specific parser.
  std::string Error;
  const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, TheTriple,
                                                         Error);
  if (!TheTarget) {
    errs() << ToolName << ": " << Error;
    return nullptr;
  }

  // Update the triple name and return the found target.
  TripleName = TheTriple.getTriple();
  return TheTarget;
}
|
|
|
|
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
// Write a graphviz file for the CFG inside an MCFunction.
|
MC CFG: Add YAML MCModule representation to enable MC CFG testing.
Like yaml ObjectFiles, this will be very useful for testing the MC CFG
implementation (mostly MCObjectDisassembler), by matching the output
with YAML, and for potential users of the MC CFG, by using it as an input.
There isn't much to the actual format, it is just a serialization of the
MCModule class. Of note:
- Basic block references (pred/succ, ..) are represented by the BB's
start address.
- Just as in the MC CFG, instructions are MCInsts with a size.
- Operands have a prefix representing the type (only register and
immediate supported here).
- Instruction opcodes are represented by their names; enum values aren't
stable, enum names mostly are: usually, a change to a name would need
lots of changes in the backend anyway.
Same with registers.
All in all, an example is better than 1000 words, here goes:
A simple binary:
Disassembly of section __TEXT,__text:
_main:
100000f9c: 48 8b 46 08 movq 8(%rsi), %rax
100000fa0: 0f be 00 movsbl (%rax), %eax
100000fa3: 3b 04 25 48 00 00 00 cmpl 72, %eax
100000faa: 0f 8c 07 00 00 00 jl 7 <.Lend>
100000fb0: 2b 04 25 48 00 00 00 subl 72, %eax
.Lend:
100000fb7: c3 ret
And the (pretty verbose) generated YAML:
---
Atoms:
- StartAddress: 0x0000000100000F9C
Size: 20
Type: Text
Content:
- Inst: MOV64rm
Size: 4
Ops: [ RRAX, RRSI, I1, R, I8, R ]
- Inst: MOVSX32rm8
Size: 3
Ops: [ REAX, RRAX, I1, R, I0, R ]
- Inst: CMP32rm
Size: 7
Ops: [ REAX, R, I1, R, I72, R ]
- Inst: JL_4
Size: 6
Ops: [ I7 ]
- StartAddress: 0x0000000100000FB0
Size: 7
Type: Text
Content:
- Inst: SUB32rm
Size: 7
Ops: [ REAX, REAX, R, I1, R, I72, R ]
- StartAddress: 0x0000000100000FB7
Size: 1
Type: Text
Content:
- Inst: RET
Size: 1
Ops: [ ]
Functions:
- Name: __text
BasicBlocks:
- Address: 0x0000000100000F9C
Preds: [ ]
Succs: [ 0x0000000100000FB7, 0x0000000100000FB0 ]
<snip>
...
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188890 91177308-0d34-0410-b5e6-96231b3b80d8
2013-08-21 07:29:02 +00:00
|
|
|
// FIXME: Use GraphWriter
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
static void emitDOTFile(const char *FileName, const MCFunction &f,
|
|
|
|
MCInstPrinter *IP) {
|
|
|
|
// Start a new dot file.
|
|
|
|
std::string Error;
|
2014-02-24 18:20:12 +00:00
|
|
|
raw_fd_ostream Out(FileName, Error, sys::fs::F_Text);
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
if (!Error.empty()) {
|
|
|
|
errs() << "llvm-objdump: warning: " << Error << '\n';
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
Out << "digraph \"" << f.getName() << "\" {\n";
|
|
|
|
Out << "graph [ rankdir = \"LR\" ];\n";
|
|
|
|
for (MCFunction::const_iterator i = f.begin(), e = f.end(); i != e; ++i) {
|
|
|
|
// Only print blocks that have predecessors.
|
|
|
|
bool hasPreds = (*i)->pred_begin() != (*i)->pred_end();
|
|
|
|
|
|
|
|
if (!hasPreds && i != f.begin())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
Out << '"' << (*i)->getInsts()->getBeginAddr() << "\" [ label=\"<a>";
|
|
|
|
// Print instructions.
|
|
|
|
for (unsigned ii = 0, ie = (*i)->getInsts()->size(); ii != ie;
|
|
|
|
++ii) {
|
|
|
|
if (ii != 0) // Not the first line, start a new row.
|
|
|
|
Out << '|';
|
|
|
|
if (ii + 1 == ie) // Last line, add an end id.
|
|
|
|
Out << "<o>";
|
|
|
|
|
|
|
|
// Escape special chars and print the instruction in mnemonic form.
|
|
|
|
std::string Str;
|
|
|
|
raw_string_ostream OS(Str);
|
|
|
|
IP->printInst(&(*i)->getInsts()->at(ii).Inst, OS, "");
|
|
|
|
Out << DOT::EscapeString(OS.str());
|
|
|
|
}
|
|
|
|
Out << "\" shape=\"record\" ];\n";
|
|
|
|
|
|
|
|
// Add edges.
|
|
|
|
for (MCBasicBlock::succ_const_iterator si = (*i)->succ_begin(),
|
|
|
|
se = (*i)->succ_end(); si != se; ++si)
|
|
|
|
Out << (*i)->getInsts()->getBeginAddr() << ":o -> "
|
|
|
|
<< (*si)->getInsts()->getBeginAddr() << ":a\n";
|
|
|
|
}
|
|
|
|
Out << "}\n";
|
|
|
|
}
|
2011-12-20 02:50:00 +00:00
|
|
|
|
2011-09-19 17:56:04 +00:00
|
|
|
/// Print \p bytes to outs() as lowercase hex, each byte followed by a space,
/// and pad the result with spaces to a fixed column width.
///
/// The width is 15 * 3 characters: 15 is the longest x86 instruction and 3 is
/// the hex representation of a byte plus a space. Inputs longer than 15 bytes
/// are printed in full without padding; the previous fixed-size buffer was
/// only protected by an assert and would be overrun in release builds.
///
/// FIXME: The real way to do this is to figure out the longest instruction
///        and align to that size before printing.
void llvm::DumpBytes(StringRef bytes) {
  static const char hex_rep[] = "0123456789abcdef";
  enum { CharsPerByte = 3, MinColumns = 15 * CharsPerByte };

  raw_ostream &OS = outs();
  for (StringRef::iterator i = bytes.begin(),
                           e = bytes.end(); i != e; ++i) {
    // Go through unsigned char so the nibble extraction never sees a
    // sign-extended value.
    const unsigned char Byte = static_cast<unsigned char>(*i);
    OS << hex_rep[Byte >> 4] << hex_rep[Byte & 0xF] << ' ';
  }

  // Pad short inputs so whatever is printed next starts at a fixed column.
  const unsigned Printed = static_cast<unsigned>(bytes.size()) * CharsPerByte;
  if (Printed < MinColumns)
    OS.indent(MinColumns - Printed);
}
|
|
|
|
|
2012-12-05 20:12:35 +00:00
|
|
|
/// Ordering predicate on relocations: compares the offsets reported by
/// getOffset().
///
/// \returns true if \p a's offset is strictly less than \p b's. If either
///          offset cannot be read, the failure is reported through error()
///          and false is returned.
bool llvm::RelocAddressLess(RelocationRef a, RelocationRef b) {
  uint64_t LHSOffset, RHSOffset;
  // Short-circuit: b is not queried once reading a's offset has failed.
  if (error(a.getOffset(LHSOffset)) || error(b.getOffset(RHSOffset)))
    return false;
  return LHSOffset < RHSOffset;
}
|
|
|
|
|
|
|
|
static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
|
2012-08-07 17:53:14 +00:00
|
|
|
const Target *TheTarget = getTarget(Obj);
|
|
|
|
// getTarget() will have already issued a diagnostic if necessary, so
|
|
|
|
// just bail here if it failed.
|
|
|
|
if (!TheTarget)
|
2011-01-20 06:39:06 +00:00
|
|
|
return;
|
|
|
|
|
2012-08-28 19:24:49 +00:00
|
|
|
// Package up features to be passed to target/subtarget
|
|
|
|
std::string FeaturesStr;
|
|
|
|
if (MAttrs.size()) {
|
|
|
|
SubtargetFeatures Features;
|
|
|
|
for (unsigned i = 0; i != MAttrs.size(); ++i)
|
|
|
|
Features.AddFeature(MAttrs[i]);
|
|
|
|
FeaturesStr = Features.getString();
|
|
|
|
}
|
|
|
|
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<const MCRegisterInfo> MRI(
|
|
|
|
TheTarget->createMCRegInfo(TripleName));
|
2013-05-16 21:28:23 +00:00
|
|
|
if (!MRI) {
|
|
|
|
errs() << "error: no register info for target " << TripleName << "\n";
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set up disassembler.
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<const MCAsmInfo> AsmInfo(
|
|
|
|
TheTarget->createMCAsmInfo(*MRI, TripleName));
|
2013-05-16 21:28:23 +00:00
|
|
|
if (!AsmInfo) {
|
|
|
|
errs() << "error: no assembly info for target " << TripleName << "\n";
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<const MCSubtargetInfo> STI(
|
|
|
|
TheTarget->createMCSubtargetInfo(TripleName, "", FeaturesStr));
|
2013-05-16 21:28:23 +00:00
|
|
|
if (!STI) {
|
|
|
|
errs() << "error: no subtarget info for target " << TripleName << "\n";
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<const MCInstrInfo> MII(TheTarget->createMCInstrInfo());
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should have already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
if (!MII) {
|
|
|
|
errs() << "error: no instruction info for target " << TripleName << "\n";
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<MCDisassembler> DisAsm(TheTarget->createMCDisassembler(*STI));
|
2013-05-16 21:28:23 +00:00
|
|
|
if (!DisAsm) {
|
|
|
|
errs() << "error: no disassembler for target " << TripleName << "\n";
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<const MCObjectFileInfo> MOFI;
|
|
|
|
std::unique_ptr<MCContext> Ctx;
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should have already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
|
|
|
|
if (Symbolize) {
|
|
|
|
MOFI.reset(new MCObjectFileInfo);
|
2013-06-18 07:20:20 +00:00
|
|
|
Ctx.reset(new MCContext(AsmInfo.get(), MRI.get(), MOFI.get()));
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<MCRelocationInfo> RelInfo(
|
|
|
|
TheTarget->createMCRelocationInfo(TripleName, *Ctx.get()));
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
if (RelInfo) {
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<MCSymbolizer> Symzer(
|
|
|
|
MCObjectSymbolizer::createObjectSymbolizer(*Ctx.get(), RelInfo, Obj));
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
if (Symzer)
|
2014-03-07 09:38:02 +00:00
|
|
|
DisAsm->setSymbolizer(std::move(Symzer));
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
}
|
2013-05-16 21:28:23 +00:00
|
|
|
}
|
|
|
|
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<const MCInstrAnalysis> MIA(
|
|
|
|
TheTarget->createMCInstrAnalysis(MII.get()));
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
|
2013-05-16 21:28:23 +00:00
|
|
|
int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<MCInstPrinter> IP(TheTarget->createMCInstPrinter(
|
2013-05-16 21:28:23 +00:00
|
|
|
AsmPrinterVariant, *AsmInfo, *MII, *MRI, *STI));
|
|
|
|
if (!IP) {
|
|
|
|
errs() << "error: no instruction printer for target " << TripleName
|
|
|
|
<< '\n';
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
MC CFG: Add YAML MCModule representation to enable MC CFG testing.
Like yaml ObjectFiles, this will be very useful for testing the MC CFG
implementation (mostly MCObjectDisassembler), by matching the output
with YAML, and for potential users of the MC CFG, by using it as an input.
There isn't much to the actual format, it is just a serialization of the
MCModule class. Of note:
- Basic block references (pred/succ, ..) are represented by the BB's
start address.
- Just as in the MC CFG, instructions are MCInsts with a size.
- Operands have a prefix representing the type (only register and
immediate supported here).
- Instruction opcodes are represented by their names; enum values aren't
stable, enum names mostly are: usually, a change to a name would need
lots of changes in the backend anyway.
Same with registers.
All in all, an example is better than 1000 words, here goes:
A simple binary:
Disassembly of section __TEXT,__text:
_main:
100000f9c: 48 8b 46 08 movq 8(%rsi), %rax
100000fa0: 0f be 00 movsbl (%rax), %eax
100000fa3: 3b 04 25 48 00 00 00 cmpl 72, %eax
100000faa: 0f 8c 07 00 00 00 jl 7 <.Lend>
100000fb0: 2b 04 25 48 00 00 00 subl 72, %eax
.Lend:
100000fb7: c3 ret
And the (pretty verbose) generated YAML:
---
Atoms:
- StartAddress: 0x0000000100000F9C
Size: 20
Type: Text
Content:
- Inst: MOV64rm
Size: 4
Ops: [ RRAX, RRSI, I1, R, I8, R ]
- Inst: MOVSX32rm8
Size: 3
Ops: [ REAX, RRAX, I1, R, I0, R ]
- Inst: CMP32rm
Size: 7
Ops: [ REAX, R, I1, R, I72, R ]
- Inst: JL_4
Size: 6
Ops: [ I7 ]
- StartAddress: 0x0000000100000FB0
Size: 7
Type: Text
Content:
- Inst: SUB32rm
Size: 7
Ops: [ REAX, REAX, R, I1, R, I72, R ]
- StartAddress: 0x0000000100000FB7
Size: 1
Type: Text
Content:
- Inst: RET
Size: 1
Ops: [ ]
Functions:
- Name: __text
BasicBlocks:
- Address: 0x0000000100000F9C
Preds: [ ]
Succs: [ 0x0000000100000FB7, 0x0000000100000FB0 ]
<snip>
...
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188890 91177308-0d34-0410-b5e6-96231b3b80d8
2013-08-21 07:29:02 +00:00
|
|
|
if (CFG || !YAMLCFG.empty()) {
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<MCObjectDisassembler> OD(
|
|
|
|
new MCObjectDisassembler(*Obj, *DisAsm, *MIA));
|
|
|
|
std::unique_ptr<MCModule> Mod(OD->buildModule(/* withCFG */ true));
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
for (MCModule::const_atom_iterator AI = Mod->atom_begin(),
|
|
|
|
AE = Mod->atom_end();
|
|
|
|
AI != AE; ++AI) {
|
|
|
|
outs() << "Atom " << (*AI)->getName() << ": \n";
|
|
|
|
if (const MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI)) {
|
|
|
|
for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end();
|
|
|
|
II != IE;
|
|
|
|
++II) {
|
|
|
|
IP->printInst(&II->Inst, outs(), "");
|
|
|
|
outs() << "\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
MC CFG: Add YAML MCModule representation to enable MC CFG testing.
Like yaml ObjectFiles, this will be very useful for testing the MC CFG
implementation (mostly MCObjectDisassembler), by matching the output
with YAML, and for potential users of the MC CFG, by using it as an input.
There isn't much to the actual format, it is just a serialization of the
MCModule class. Of note:
- Basic block references (pred/succ, ..) are represented by the BB's
start address.
- Just as in the MC CFG, instructions are MCInsts with a size.
- Operands have a prefix representing the type (only register and
immediate supported here).
- Instruction opcodes are represented by their names; enum values aren't
stable, enum names mostly are: usually, a change to a name would need
lots of changes in the backend anyway.
Same with registers.
All in all, an example is better than 1000 words, here goes:
A simple binary:
Disassembly of section __TEXT,__text:
_main:
100000f9c: 48 8b 46 08 movq 8(%rsi), %rax
100000fa0: 0f be 00 movsbl (%rax), %eax
100000fa3: 3b 04 25 48 00 00 00 cmpl 72, %eax
100000faa: 0f 8c 07 00 00 00 jl 7 <.Lend>
100000fb0: 2b 04 25 48 00 00 00 subl 72, %eax
.Lend:
100000fb7: c3 ret
And the (pretty verbose) generated YAML:
---
Atoms:
- StartAddress: 0x0000000100000F9C
Size: 20
Type: Text
Content:
- Inst: MOV64rm
Size: 4
Ops: [ RRAX, RRSI, I1, R, I8, R ]
- Inst: MOVSX32rm8
Size: 3
Ops: [ REAX, RRAX, I1, R, I0, R ]
- Inst: CMP32rm
Size: 7
Ops: [ REAX, R, I1, R, I72, R ]
- Inst: JL_4
Size: 6
Ops: [ I7 ]
- StartAddress: 0x0000000100000FB0
Size: 7
Type: Text
Content:
- Inst: SUB32rm
Size: 7
Ops: [ REAX, REAX, R, I1, R, I72, R ]
- StartAddress: 0x0000000100000FB7
Size: 1
Type: Text
Content:
- Inst: RET
Size: 1
Ops: [ ]
Functions:
- Name: __text
BasicBlocks:
- Address: 0x0000000100000F9C
Preds: [ ]
Succs: [ 0x0000000100000FB7, 0x0000000100000FB0 ]
<snip>
...
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188890 91177308-0d34-0410-b5e6-96231b3b80d8
2013-08-21 07:29:02 +00:00
|
|
|
if (CFG) {
|
|
|
|
for (MCModule::const_func_iterator FI = Mod->func_begin(),
|
|
|
|
FE = Mod->func_end();
|
|
|
|
FI != FE; ++FI) {
|
|
|
|
static int filenum = 0;
|
|
|
|
emitDOTFile((Twine((*FI)->getName()) + "_" +
|
|
|
|
utostr(filenum) + ".dot").str().c_str(),
|
|
|
|
**FI, IP.get());
|
|
|
|
++filenum;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!YAMLCFG.empty()) {
|
|
|
|
std::string Error;
|
2014-02-24 18:20:12 +00:00
|
|
|
raw_fd_ostream YAMLOut(YAMLCFG.c_str(), Error, sys::fs::F_Text);
|
MC CFG: Add YAML MCModule representation to enable MC CFG testing.
Like yaml ObjectFiles, this will be very useful for testing the MC CFG
implementation (mostly MCObjectDisassembler), by matching the output
with YAML, and for potential users of the MC CFG, by using it as an input.
There isn't much to the actual format, it is just a serialization of the
MCModule class. Of note:
- Basic block references (pred/succ, ..) are represented by the BB's
start address.
- Just as in the MC CFG, instructions are MCInsts with a size.
- Operands have a prefix representing the type (only register and
immediate supported here).
- Instruction opcodes are represented by their names; enum values aren't
stable, enum names mostly are: usually, a change to a name would need
lots of changes in the backend anyway.
Same with registers.
All in all, an example is better than 1000 words, here goes:
A simple binary:
Disassembly of section __TEXT,__text:
_main:
100000f9c: 48 8b 46 08 movq 8(%rsi), %rax
100000fa0: 0f be 00 movsbl (%rax), %eax
100000fa3: 3b 04 25 48 00 00 00 cmpl 72, %eax
100000faa: 0f 8c 07 00 00 00 jl 7 <.Lend>
100000fb0: 2b 04 25 48 00 00 00 subl 72, %eax
.Lend:
100000fb7: c3 ret
And the (pretty verbose) generated YAML:
---
Atoms:
- StartAddress: 0x0000000100000F9C
Size: 20
Type: Text
Content:
- Inst: MOV64rm
Size: 4
Ops: [ RRAX, RRSI, I1, R, I8, R ]
- Inst: MOVSX32rm8
Size: 3
Ops: [ REAX, RRAX, I1, R, I0, R ]
- Inst: CMP32rm
Size: 7
Ops: [ REAX, R, I1, R, I72, R ]
- Inst: JL_4
Size: 6
Ops: [ I7 ]
- StartAddress: 0x0000000100000FB0
Size: 7
Type: Text
Content:
- Inst: SUB32rm
Size: 7
Ops: [ REAX, REAX, R, I1, R, I72, R ]
- StartAddress: 0x0000000100000FB7
Size: 1
Type: Text
Content:
- Inst: RET
Size: 1
Ops: [ ]
Functions:
- Name: __text
BasicBlocks:
- Address: 0x0000000100000F9C
Preds: [ ]
Succs: [ 0x0000000100000FB7, 0x0000000100000FB0 ]
<snip>
...
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188890 91177308-0d34-0410-b5e6-96231b3b80d8
2013-08-21 07:29:02 +00:00
|
|
|
if (!Error.empty()) {
|
2013-08-21 16:13:25 +00:00
|
|
|
errs() << ToolName << ": warning: " << Error << '\n';
|
MC CFG: Add YAML MCModule representation to enable MC CFG testing.
Like yaml ObjectFiles, this will be very useful for testing the MC CFG
implementation (mostly MCObjectDisassembler), by matching the output
with YAML, and for potential users of the MC CFG, by using it as an input.
There isn't much to the actual format, it is just a serialization of the
MCModule class. Of note:
- Basic block references (pred/succ, ..) are represented by the BB's
start address.
- Just as in the MC CFG, instructions are MCInsts with a size.
- Operands have a prefix representing the type (only register and
immediate supported here).
- Instruction opcodes are represented by their names; enum values aren't
stable, enum names mostly are: usually, a change to a name would need
lots of changes in the backend anyway.
Same with registers.
All in all, an example is better than 1000 words, here goes:
A simple binary:
Disassembly of section __TEXT,__text:
_main:
100000f9c: 48 8b 46 08 movq 8(%rsi), %rax
100000fa0: 0f be 00 movsbl (%rax), %eax
100000fa3: 3b 04 25 48 00 00 00 cmpl 72, %eax
100000faa: 0f 8c 07 00 00 00 jl 7 <.Lend>
100000fb0: 2b 04 25 48 00 00 00 subl 72, %eax
.Lend:
100000fb7: c3 ret
And the (pretty verbose) generated YAML:
---
Atoms:
- StartAddress: 0x0000000100000F9C
Size: 20
Type: Text
Content:
- Inst: MOV64rm
Size: 4
Ops: [ RRAX, RRSI, I1, R, I8, R ]
- Inst: MOVSX32rm8
Size: 3
Ops: [ REAX, RRAX, I1, R, I0, R ]
- Inst: CMP32rm
Size: 7
Ops: [ REAX, R, I1, R, I72, R ]
- Inst: JL_4
Size: 6
Ops: [ I7 ]
- StartAddress: 0x0000000100000FB0
Size: 7
Type: Text
Content:
- Inst: SUB32rm
Size: 7
Ops: [ REAX, REAX, R, I1, R, I72, R ]
- StartAddress: 0x0000000100000FB7
Size: 1
Type: Text
Content:
- Inst: RET
Size: 1
Ops: [ ]
Functions:
- Name: __text
BasicBlocks:
- Address: 0x0000000100000F9C
Preds: [ ]
Succs: [ 0x0000000100000FB7, 0x0000000100000FB0 ]
<snip>
...
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188890 91177308-0d34-0410-b5e6-96231b3b80d8
2013-08-21 07:29:02 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
mcmodule2yaml(YAMLOut, *Mod, *MII, *MRI);
|
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 01:07:04 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-01-25 17:38:19 +00:00
|
|
|
// Create a mapping, RelocSecs = SectionRelocMap[S], where sections
|
|
|
|
// in RelocSecs contain the relocations for section S.
|
2014-01-25 00:32:01 +00:00
|
|
|
error_code EC;
|
2014-03-13 14:37:36 +00:00
|
|
|
std::map<SectionRef, SmallVector<SectionRef, 1>> SectionRelocMap;
|
|
|
|
for (const SectionRef &Section : Obj->sections()) {
|
|
|
|
section_iterator Sec2 = Section.getRelocatedSection();
|
2014-02-10 20:24:04 +00:00
|
|
|
if (Sec2 != Obj->section_end())
|
2014-03-13 14:37:36 +00:00
|
|
|
SectionRelocMap[*Sec2].push_back(Section);
|
2014-01-25 17:38:19 +00:00
|
|
|
}
|
|
|
|
|
2014-03-13 14:37:36 +00:00
|
|
|
for (const SectionRef &Section : Obj->sections()) {
|
2014-01-25 00:32:01 +00:00
|
|
|
bool Text;
|
2014-03-13 14:37:36 +00:00
|
|
|
if (error(Section.isText(Text)))
|
2014-01-25 00:32:01 +00:00
|
|
|
break;
|
|
|
|
if (!Text)
|
|
|
|
continue;
|
2011-06-25 17:55:23 +00:00
|
|
|
|
2011-10-13 22:17:18 +00:00
|
|
|
uint64_t SectionAddr;
|
2014-03-13 14:37:36 +00:00
|
|
|
if (error(Section.getAddress(SectionAddr)))
|
2014-01-25 00:32:01 +00:00
|
|
|
break;
|
2011-10-13 22:17:18 +00:00
|
|
|
|
2014-02-24 22:12:11 +00:00
|
|
|
uint64_t SectSize;
|
2014-03-13 14:37:36 +00:00
|
|
|
if (error(Section.getSize(SectSize)))
|
2014-02-24 22:12:11 +00:00
|
|
|
break;
|
|
|
|
|
2011-07-15 18:39:24 +00:00
|
|
|
// Make a list of all the symbols in this section.
|
|
|
|
std::vector<std::pair<uint64_t, StringRef> > Symbols;
|
2014-02-10 20:24:04 +00:00
|
|
|
for (symbol_iterator SI = Obj->symbol_begin(), SE = Obj->symbol_end();
|
2014-01-30 02:49:50 +00:00
|
|
|
SI != SE; ++SI) {
|
2011-07-15 18:39:24 +00:00
|
|
|
bool contains;
|
2014-03-13 14:37:36 +00:00
|
|
|
if (!error(Section.containsSymbol(*SI, contains)) && contains) {
|
2011-07-15 18:39:24 +00:00
|
|
|
uint64_t Address;
|
2014-01-25 00:32:01 +00:00
|
|
|
if (error(SI->getAddress(Address)))
|
|
|
|
break;
|
|
|
|
if (Address == UnknownAddressOrSize)
|
|
|
|
continue;
|
2012-02-03 04:13:37 +00:00
|
|
|
Address -= SectionAddr;
|
2014-02-24 22:12:11 +00:00
|
|
|
if (Address >= SectSize)
|
|
|
|
continue;
|
2012-02-03 04:13:37 +00:00
|
|
|
|
2011-07-15 18:39:24 +00:00
|
|
|
StringRef Name;
|
2014-01-25 00:32:01 +00:00
|
|
|
if (error(SI->getName(Name)))
|
|
|
|
break;
|
2011-07-15 18:39:24 +00:00
|
|
|
Symbols.push_back(std::make_pair(Address, Name));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sort the symbols by address, just in case they didn't come in that way.
|
|
|
|
array_pod_sort(Symbols.begin(), Symbols.end());
|
|
|
|
|
2011-10-13 22:17:18 +00:00
|
|
|
// Make a list of all the relocations for this section.
|
|
|
|
std::vector<RelocationRef> Rels;
|
|
|
|
if (InlineRelocs) {
|
2014-03-13 14:37:36 +00:00
|
|
|
SmallVectorImpl<SectionRef> *RelocSecs = &SectionRelocMap[Section];
|
2014-01-25 17:38:19 +00:00
|
|
|
for (SmallVectorImpl<SectionRef>::iterator RelocSec = RelocSecs->begin(),
|
|
|
|
E = RelocSecs->end();
|
|
|
|
RelocSec != E; ++RelocSec) {
|
2014-02-10 20:24:04 +00:00
|
|
|
for (relocation_iterator RI = RelocSec->relocation_begin(),
|
|
|
|
RE = RelocSec->relocation_end();
|
2014-01-30 02:49:50 +00:00
|
|
|
RI != RE; ++RI)
|
2014-01-25 17:38:19 +00:00
|
|
|
Rels.push_back(*RI);
|
2011-10-13 22:17:18 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sort relocations by address.
|
|
|
|
std::sort(Rels.begin(), Rels.end(), RelocAddressLess);
|
|
|
|
|
Add a function to get the segment name of a section.
On MachO, sections also have segment names. When a tool looking at a .o file
prints a segment name, this is what they mean. In reality, a .o has only one
anonymous segment.
This patch adds a MachO-only function to fetch that segment name. I named it
getSectionFinalSegmentName since the main use for the name seems to be to inform
the linker which segment this section should go to.
The patch also changes MachOObjectFile::getSectionName to return just the
section name instead of computing SegmentName,SectionName.
The main difference from the previous patch is that it doesn't use
InMemoryStruct. It is extremely dangerous: if the endians match it returns
a pointer to the file buffer, if not, it returns a pointer to an internal buffer
that is overwritten in the next API call.
We should change all of this code to use
support::detail::packed_endian_specific_integral like ELF, but since these
functions only handle strings, they work with big and little endian machines
as is.
I have tested this by installing ubuntu 12.10 ppc on qemu, that is why it took
so long :-)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170838 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-21 03:47:03 +00:00
|
|
|
StringRef SegmentName = "";
|
2014-01-25 00:32:01 +00:00
|
|
|
if (const MachOObjectFile *MachO = dyn_cast<const MachOObjectFile>(Obj)) {
|
2014-03-13 14:37:36 +00:00
|
|
|
DataRefImpl DR = Section.getRawDataRefImpl();
|
2013-04-18 18:08:55 +00:00
|
|
|
SegmentName = MachO->getSectionFinalSegmentName(DR);
|
Add a function to get the segment name of a section.
On MachO, sections also have segment names. When a tool looking at a .o file
prints a segment name, this is what they mean. In reality, a .o has only one
anonymous segment.
This patch adds a MachO-only function to fetch that segment name. I named it
getSectionFinalSegmentName since the main use for the name seems to be to inform
the linker which segment this section should go to.
The patch also changes MachOObjectFile::getSectionName to return just the
section name instead of computing SegmentName,SectionName.
The main difference from the previous patch is that it doesn't use
InMemoryStruct. It is extremely dangerous: if the endians match it returns
a pointer to the file buffer, if not, it returns a pointer to an internal buffer
that is overwritten in the next API call.
We should change all of this code to use
support::detail::packed_endian_specific_integral like ELF, but since these
functions only handle strings, they work with big and little endian machines
as is.
I have tested this by installing ubuntu 12.10 ppc on qemu, that is why it took
so long :-)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170838 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-21 03:47:03 +00:00
|
|
|
}
|
2011-06-25 17:55:23 +00:00
|
|
|
StringRef name;
|
2014-03-13 14:37:36 +00:00
|
|
|
if (error(Section.getName(name)))
|
2014-01-25 00:32:01 +00:00
|
|
|
break;
|
Add a function to get the segment name of a section.
On MachO, sections also have segment names. When a tool looking at a .o file
prints a segment name, this is what they mean. In reality, a .o has only one
anonymous segment.
This patch adds a MachO-only function to fetch that segment name. I named it
getSectionFinalSegmentName since the main use for the name seems to be to inform
the linker which segment this section should go to.
The patch also changes MachOObjectFile::getSectionName to return just the
section name instead of computing SegmentName,SectionName.
The main difference from the previous patch is that it doesn't use
InMemoryStruct. It is extremely dangerous: if the endians match it returns
a pointer to the file buffer, if not, it returns a pointer to an internal buffer
that is overwritten in the next API call.
We should change all of this code to use
support::detail::packed_endian_specific_integral like ELF, but since these
functions only handle strings, they work with big and little endian machines
as is.
I have tested this by installing ubuntu 12.10 ppc on qemu, that is why it took
so long :-)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170838 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-21 03:47:03 +00:00
|
|
|
outs() << "Disassembly of section ";
|
|
|
|
if (!SegmentName.empty())
|
|
|
|
outs() << SegmentName << ",";
|
|
|
|
outs() << name << ':';
|
2011-07-15 18:39:24 +00:00
|
|
|
|
|
|
|
// If the section has no symbols just insert a dummy one and disassemble
|
|
|
|
// the whole section.
|
|
|
|
if (Symbols.empty())
|
|
|
|
Symbols.push_back(std::make_pair(0, name));
|
2011-01-20 06:39:06 +00:00
|
|
|
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
|
|
|
|
SmallString<40> Comments;
|
|
|
|
raw_svector_ostream CommentStream(Comments);
|
|
|
|
|
2011-06-25 17:55:23 +00:00
|
|
|
StringRef Bytes;
|
2014-03-13 14:37:36 +00:00
|
|
|
if (error(Section.getContents(Bytes)))
|
2014-01-25 00:32:01 +00:00
|
|
|
break;
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should have already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
StringRefMemoryObject memoryObject(Bytes, SectionAddr);
|
2011-01-20 06:39:06 +00:00
|
|
|
uint64_t Size;
|
|
|
|
uint64_t Index;
|
2011-07-15 18:39:24 +00:00
|
|
|
|
2011-10-13 22:17:18 +00:00
|
|
|
std::vector<RelocationRef>::const_iterator rel_cur = Rels.begin();
|
|
|
|
std::vector<RelocationRef>::const_iterator rel_end = Rels.end();
|
2011-07-15 18:39:24 +00:00
|
|
|
// Disassemble symbol by symbol.
|
|
|
|
for (unsigned si = 0, se = Symbols.size(); si != se; ++si) {
|
|
|
|
uint64_t Start = Symbols[si].first;
|
2011-10-13 20:37:08 +00:00
|
|
|
uint64_t End;
|
|
|
|
// The end is either the size of the section or the beginning of the next
|
|
|
|
// symbol.
|
|
|
|
if (si == se - 1)
|
|
|
|
End = SectSize;
|
|
|
|
// Make sure this symbol takes up space.
|
|
|
|
else if (Symbols[si + 1].first != Start)
|
|
|
|
End = Symbols[si + 1].first - 1;
|
|
|
|
else
|
|
|
|
// This symbol has the same address as the next symbol. Skip it.
|
|
|
|
continue;
|
|
|
|
|
2011-07-15 18:39:24 +00:00
|
|
|
outs() << '\n' << Symbols[si].second << ":\n";
|
|
|
|
|
|
|
|
#ifndef NDEBUG
|
2014-01-25 00:32:01 +00:00
|
|
|
raw_ostream &DebugOut = DebugFlag ? dbgs() : nulls();
|
2011-07-15 18:39:24 +00:00
|
|
|
#else
|
2014-01-25 00:32:01 +00:00
|
|
|
raw_ostream &DebugOut = nulls();
|
2011-07-15 18:39:24 +00:00
|
|
|
#endif
|
|
|
|
|
2011-09-19 17:56:04 +00:00
|
|
|
for (Index = Start; Index < End; Index += Size) {
|
|
|
|
MCInst Inst;
|
|
|
|
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should have already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
if (DisAsm->getInstruction(Inst, Size, memoryObject,
|
|
|
|
SectionAddr + Index,
|
|
|
|
DebugOut, CommentStream)) {
|
2012-11-20 22:57:02 +00:00
|
|
|
outs() << format("%8" PRIx64 ":", SectionAddr + Index);
|
|
|
|
if (!NoShowRawInsn) {
|
|
|
|
outs() << "\t";
|
|
|
|
DumpBytes(StringRef(Bytes.data() + Index, Size));
|
|
|
|
}
|
2011-09-19 17:56:04 +00:00
|
|
|
IP->printInst(&Inst, outs(), "");
|
Add MCSymbolizer for symbolic/annotated disassembly.
This is a basic first step towards symbolization of disassembled
instructions. This used to be done using externally provided (C API)
callbacks. This patch introduces:
- the MCSymbolizer class, that mimics the same functions that were used
in the X86 and ARM disassemblers to symbolize immediate operands and
to annotate loads based off PC (for things like c string literals).
- the MCExternalSymbolizer class, which implements the old C API.
- the MCRelocationInfo class, which provides a way for targets to
translate relocations (either object::RelocationRef, or disassembler
C API VariantKinds) to MCExprs.
- the MCObjectSymbolizer class, which does symbolization using what it
finds in an object::ObjectFile. This makes simple symbolization (with
no fancy relocation stuff) work for all object formats!
- x86-64 Mach-O and ELF MCRelocationInfos.
- A basic ARM Mach-O MCRelocationInfo, that provides just enough to
support the C API VariantKinds.
Most of what works in otool (the only user of the old symbolization API
that I know of) for x86-64 symbolic disassembly (-tvV) works, namely:
- symbol references: call _foo; jmp 15 <_foo+50>
- relocations: call _foo-_bar; call _foo-4
- __cf?string: leaq 193(%rip), %rax ## literal pool for "hello"
Stub support is the main missing part (because libObject doesn't know,
among other things, about mach-o indirect symbols).
As for the MCSymbolizer API, instead of relying on the disassemblers
to call the tryAdding* methods, maybe this could be done automagically
using InstrInfo? For instance, even though PC-relative LEAs are used
to get the address of string literals in a typical Mach-O file, a MOV
would be used in an ELF file. And right now, the explicit symbolization
only recognizes PC-relative LEAs. InstrInfo should have already have
most of what is needed to know what to symbolize, so this can
definitely be improved.
I'd also like to remove object::RelocationRef::getValueString (it seems
only used by relocation printing in objdump), as simply printing the
created MCExpr is definitely enough (and cleaner than string concats).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182625 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-24 00:39:57 +00:00
|
|
|
outs() << CommentStream.str();
|
|
|
|
Comments.clear();
|
2011-09-19 17:56:04 +00:00
|
|
|
outs() << "\n";
|
|
|
|
} else {
|
|
|
|
errs() << ToolName << ": warning: invalid instruction encoding\n";
|
|
|
|
if (Size == 0)
|
|
|
|
Size = 1; // skip illegible bytes
|
2011-07-20 19:37:35 +00:00
|
|
|
}
|
2011-10-13 22:17:18 +00:00
|
|
|
|
|
|
|
// Print relocation for instruction.
|
|
|
|
while (rel_cur != rel_end) {
|
2011-10-25 20:35:53 +00:00
|
|
|
bool hidden = false;
|
2011-10-13 22:17:18 +00:00
|
|
|
uint64_t addr;
|
|
|
|
SmallString<16> name;
|
|
|
|
SmallString<32> val;
|
2011-10-25 20:35:53 +00:00
|
|
|
|
|
|
|
// If this relocation is hidden, skip it.
|
|
|
|
if (error(rel_cur->getHidden(hidden))) goto skip_print_rel;
|
|
|
|
if (hidden) goto skip_print_rel;
|
|
|
|
|
2013-04-25 12:28:45 +00:00
|
|
|
if (error(rel_cur->getOffset(addr))) goto skip_print_rel;
|
2011-10-13 22:17:18 +00:00
|
|
|
// Stop when rel_cur's address is past the current instruction.
|
2011-10-25 20:15:39 +00:00
|
|
|
if (addr >= Index + Size) break;
|
2011-10-13 22:17:18 +00:00
|
|
|
if (error(rel_cur->getTypeName(name))) goto skip_print_rel;
|
|
|
|
if (error(rel_cur->getValueString(val))) goto skip_print_rel;
|
|
|
|
|
2012-03-10 02:04:38 +00:00
|
|
|
outs() << format("\t\t\t%8" PRIx64 ": ", SectionAddr + addr) << name
|
|
|
|
<< "\t" << val << "\n";
|
2011-10-13 22:17:18 +00:00
|
|
|
|
|
|
|
skip_print_rel:
|
|
|
|
++rel_cur;
|
|
|
|
}
|
2011-07-20 19:37:35 +00:00
|
|
|
}
|
2011-01-20 06:39:06 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-03-13 14:37:36 +00:00
|
|
|
static void PrintRelocations(const ObjectFile *Obj) {
|
|
|
|
for (const SectionRef &Section : Obj->sections()) {
|
|
|
|
if (Section.relocation_begin() == Section.relocation_end())
|
2011-10-08 00:18:30 +00:00
|
|
|
continue;
|
|
|
|
StringRef secname;
|
2014-03-13 14:37:36 +00:00
|
|
|
if (error(Section.getName(secname)))
|
|
|
|
continue;
|
2011-10-08 00:18:30 +00:00
|
|
|
outs() << "RELOCATION RECORDS FOR [" << secname << "]:\n";
|
2014-03-13 14:37:36 +00:00
|
|
|
for (relocation_iterator ri = Section.relocation_begin(),
|
|
|
|
re = Section.relocation_end();
|
2014-01-30 02:49:50 +00:00
|
|
|
ri != re; ++ri) {
|
2011-10-25 20:35:53 +00:00
|
|
|
bool hidden;
|
2011-10-08 00:18:30 +00:00
|
|
|
uint64_t address;
|
|
|
|
SmallString<32> relocname;
|
|
|
|
SmallString<32> valuestr;
|
2011-10-25 20:35:53 +00:00
|
|
|
if (error(ri->getHidden(hidden))) continue;
|
|
|
|
if (hidden) continue;
|
2011-10-08 00:18:30 +00:00
|
|
|
if (error(ri->getTypeName(relocname))) continue;
|
2013-04-25 12:28:45 +00:00
|
|
|
if (error(ri->getOffset(address))) continue;
|
2011-10-08 00:18:30 +00:00
|
|
|
if (error(ri->getValueString(valuestr))) continue;
|
|
|
|
outs() << address << " " << relocname << " " << valuestr << "\n";
|
|
|
|
}
|
|
|
|
outs() << "\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-03-13 14:37:36 +00:00
|
|
|
static void PrintSectionHeaders(const ObjectFile *Obj) {
|
2011-10-10 21:21:34 +00:00
|
|
|
outs() << "Sections:\n"
|
|
|
|
"Idx Name Size Address Type\n";
|
|
|
|
unsigned i = 0;
|
2014-03-13 14:37:36 +00:00
|
|
|
for (const SectionRef &Section : Obj->sections()) {
|
2011-10-10 21:21:34 +00:00
|
|
|
StringRef Name;
|
2014-03-13 14:37:36 +00:00
|
|
|
if (error(Section.getName(Name)))
|
2014-01-25 00:32:01 +00:00
|
|
|
return;
|
2011-10-10 21:21:34 +00:00
|
|
|
uint64_t Address;
|
2014-03-13 14:37:36 +00:00
|
|
|
if (error(Section.getAddress(Address)))
|
|
|
|
return;
|
2011-10-10 21:21:34 +00:00
|
|
|
uint64_t Size;
|
2014-03-13 14:37:36 +00:00
|
|
|
if (error(Section.getSize(Size)))
|
|
|
|
return;
|
2011-10-10 21:21:34 +00:00
|
|
|
bool Text, Data, BSS;
|
2014-03-13 14:37:36 +00:00
|
|
|
if (error(Section.isText(Text)))
|
|
|
|
return;
|
|
|
|
if (error(Section.isData(Data)))
|
|
|
|
return;
|
|
|
|
if (error(Section.isBSS(BSS)))
|
|
|
|
return;
|
2011-10-10 21:21:34 +00:00
|
|
|
std::string Type = (std::string(Text ? "TEXT " : "") +
|
2011-10-13 20:37:20 +00:00
|
|
|
(Data ? "DATA " : "") + (BSS ? "BSS" : ""));
|
2014-03-13 14:37:36 +00:00
|
|
|
outs() << format("%3d %-13s %08" PRIx64 " %016" PRIx64 " %s\n", i,
|
|
|
|
Name.str().c_str(), Size, Address, Type.c_str());
|
2011-10-10 21:21:34 +00:00
|
|
|
++i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-03-13 14:37:36 +00:00
|
|
|
static void PrintSectionContents(const ObjectFile *Obj) {
|
2014-01-25 00:32:01 +00:00
|
|
|
error_code EC;
|
2014-03-13 14:37:36 +00:00
|
|
|
for (const SectionRef &Section : Obj->sections()) {
|
2011-10-17 17:13:22 +00:00
|
|
|
StringRef Name;
|
|
|
|
StringRef Contents;
|
|
|
|
uint64_t BaseAddr;
|
2013-04-16 10:53:11 +00:00
|
|
|
bool BSS;
|
2014-03-13 14:37:36 +00:00
|
|
|
if (error(Section.getName(Name)))
|
|
|
|
continue;
|
|
|
|
if (error(Section.getContents(Contents)))
|
|
|
|
continue;
|
|
|
|
if (error(Section.getAddress(BaseAddr)))
|
|
|
|
continue;
|
|
|
|
if (error(Section.isBSS(BSS)))
|
|
|
|
continue;
|
2011-10-17 17:13:22 +00:00
|
|
|
|
|
|
|
outs() << "Contents of section " << Name << ":\n";
|
2013-04-16 10:53:11 +00:00
|
|
|
if (BSS) {
|
|
|
|
outs() << format("<skipping contents of bss section at [%04" PRIx64
|
|
|
|
", %04" PRIx64 ")>\n", BaseAddr,
|
|
|
|
BaseAddr + Contents.size());
|
|
|
|
continue;
|
|
|
|
}
|
2011-10-17 17:13:22 +00:00
|
|
|
|
|
|
|
// Dump out the content as hex and printable ascii characters.
|
|
|
|
for (std::size_t addr = 0, end = Contents.size(); addr < end; addr += 16) {
|
2012-03-10 02:04:38 +00:00
|
|
|
outs() << format(" %04" PRIx64 " ", BaseAddr + addr);
|
2011-10-17 17:13:22 +00:00
|
|
|
// Dump line of hex.
|
|
|
|
for (std::size_t i = 0; i < 16; ++i) {
|
|
|
|
if (i != 0 && i % 4 == 0)
|
|
|
|
outs() << ' ';
|
|
|
|
if (addr + i < end)
|
|
|
|
outs() << hexdigit((Contents[addr + i] >> 4) & 0xF, true)
|
|
|
|
<< hexdigit(Contents[addr + i] & 0xF, true);
|
|
|
|
else
|
|
|
|
outs() << " ";
|
|
|
|
}
|
|
|
|
// Print ascii.
|
|
|
|
outs() << " ";
|
|
|
|
for (std::size_t i = 0; i < 16 && addr + i < end; ++i) {
|
2013-02-12 21:21:59 +00:00
|
|
|
if (std::isprint(static_cast<unsigned char>(Contents[addr + i]) & 0xFF))
|
2011-10-17 17:13:22 +00:00
|
|
|
outs() << Contents[addr + i];
|
|
|
|
else
|
|
|
|
outs() << ".";
|
|
|
|
}
|
|
|
|
outs() << "\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-10-18 19:32:17 +00:00
|
|
|
static void PrintCOFFSymbolTable(const COFFObjectFile *coff) {
|
|
|
|
const coff_file_header *header;
|
|
|
|
if (error(coff->getHeader(header))) return;
|
|
|
|
int aux_count = 0;
|
|
|
|
const coff_symbol *symbol = 0;
|
|
|
|
for (int i = 0, e = header->NumberOfSymbols; i != e; ++i) {
|
|
|
|
if (aux_count--) {
|
|
|
|
// Figure out which type of aux this is.
|
|
|
|
if (symbol->StorageClass == COFF::IMAGE_SYM_CLASS_STATIC
|
|
|
|
&& symbol->Value == 0) { // Section definition.
|
|
|
|
const coff_aux_section_definition *asd;
|
|
|
|
if (error(coff->getAuxSymbol<coff_aux_section_definition>(i, asd)))
|
|
|
|
return;
|
|
|
|
outs() << "AUX "
|
|
|
|
<< format("scnlen 0x%x nreloc %d nlnno %d checksum 0x%x "
|
|
|
|
, unsigned(asd->Length)
|
|
|
|
, unsigned(asd->NumberOfRelocations)
|
|
|
|
, unsigned(asd->NumberOfLinenumbers)
|
|
|
|
, unsigned(asd->CheckSum))
|
|
|
|
<< format("assoc %d comdat %d\n"
|
|
|
|
, unsigned(asd->Number)
|
|
|
|
, unsigned(asd->Selection));
|
2012-08-07 17:53:14 +00:00
|
|
|
} else
|
2011-10-18 19:32:17 +00:00
|
|
|
outs() << "AUX Unknown\n";
|
|
|
|
} else {
|
|
|
|
StringRef name;
|
|
|
|
if (error(coff->getSymbol(i, symbol))) return;
|
|
|
|
if (error(coff->getSymbolName(symbol, name))) return;
|
|
|
|
outs() << "[" << format("%2d", i) << "]"
|
|
|
|
<< "(sec " << format("%2d", int(symbol->SectionNumber)) << ")"
|
|
|
|
<< "(fl 0x00)" // Flag bits, which COFF doesn't have.
|
|
|
|
<< "(ty " << format("%3x", unsigned(symbol->Type)) << ")"
|
|
|
|
<< "(scl " << format("%3x", unsigned(symbol->StorageClass)) << ") "
|
|
|
|
<< "(nx " << unsigned(symbol->NumberOfAuxSymbols) << ") "
|
|
|
|
<< "0x" << format("%08x", unsigned(symbol->Value)) << " "
|
|
|
|
<< name << "\n";
|
|
|
|
aux_count = symbol->NumberOfAuxSymbols;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void PrintSymbolTable(const ObjectFile *o) {
|
|
|
|
outs() << "SYMBOL TABLE:\n";
|
|
|
|
|
|
|
|
if (const COFFObjectFile *coff = dyn_cast<const COFFObjectFile>(o))
|
|
|
|
PrintCOFFSymbolTable(coff);
|
|
|
|
else {
|
2014-02-10 20:24:04 +00:00
|
|
|
for (symbol_iterator si = o->symbol_begin(), se = o->symbol_end();
|
2014-01-30 02:49:50 +00:00
|
|
|
si != se; ++si) {
|
2011-10-18 19:32:17 +00:00
|
|
|
StringRef Name;
|
2011-11-29 17:40:10 +00:00
|
|
|
uint64_t Address;
|
2011-10-18 19:32:17 +00:00
|
|
|
SymbolRef::Type Type;
|
|
|
|
uint64_t Size;
|
2014-01-31 20:57:12 +00:00
|
|
|
uint32_t Flags = si->getFlags();
|
2014-02-10 20:24:04 +00:00
|
|
|
section_iterator Section = o->section_end();
|
2011-10-18 19:32:17 +00:00
|
|
|
if (error(si->getName(Name))) continue;
|
2011-11-29 17:40:10 +00:00
|
|
|
if (error(si->getAddress(Address))) continue;
|
2011-10-18 19:32:17 +00:00
|
|
|
if (error(si->getType(Type))) continue;
|
|
|
|
if (error(si->getSize(Size))) continue;
|
|
|
|
if (error(si->getSection(Section))) continue;
|
|
|
|
|
2012-02-28 23:47:53 +00:00
|
|
|
bool Global = Flags & SymbolRef::SF_Global;
|
|
|
|
bool Weak = Flags & SymbolRef::SF_Weak;
|
|
|
|
bool Absolute = Flags & SymbolRef::SF_Absolute;
|
|
|
|
|
2011-11-29 17:40:10 +00:00
|
|
|
if (Address == UnknownAddressOrSize)
|
|
|
|
Address = 0;
|
|
|
|
if (Size == UnknownAddressOrSize)
|
|
|
|
Size = 0;
|
2011-10-18 19:32:17 +00:00
|
|
|
char GlobLoc = ' ';
|
2012-02-29 02:11:55 +00:00
|
|
|
if (Type != SymbolRef::ST_Unknown)
|
2011-10-18 19:32:17 +00:00
|
|
|
GlobLoc = Global ? 'g' : 'l';
|
|
|
|
char Debug = (Type == SymbolRef::ST_Debug || Type == SymbolRef::ST_File)
|
|
|
|
? 'd' : ' ';
|
|
|
|
char FileFunc = ' ';
|
|
|
|
if (Type == SymbolRef::ST_File)
|
|
|
|
FileFunc = 'f';
|
|
|
|
else if (Type == SymbolRef::ST_Function)
|
|
|
|
FileFunc = 'F';
|
|
|
|
|
2013-01-10 22:40:50 +00:00
|
|
|
const char *Fmt = o->getBytesInAddress() > 4 ? "%016" PRIx64 :
|
|
|
|
"%08" PRIx64;
|
|
|
|
|
|
|
|
outs() << format(Fmt, Address) << " "
|
2011-10-18 19:32:17 +00:00
|
|
|
<< GlobLoc // Local -> 'l', Global -> 'g', Neither -> ' '
|
|
|
|
<< (Weak ? 'w' : ' ') // Weak?
|
|
|
|
<< ' ' // Constructor. Not supported yet.
|
|
|
|
<< ' ' // Warning. Not supported yet.
|
|
|
|
<< ' ' // Indirect reference to another symbol.
|
|
|
|
<< Debug // Debugging (d) or dynamic (D) symbol.
|
|
|
|
<< FileFunc // Name of function (F), file (f) or object (O).
|
|
|
|
<< ' ';
|
|
|
|
if (Absolute)
|
|
|
|
outs() << "*ABS*";
|
2014-02-10 20:24:04 +00:00
|
|
|
else if (Section == o->section_end())
|
2011-10-18 19:32:17 +00:00
|
|
|
outs() << "*UND*";
|
|
|
|
else {
|
2013-04-18 18:08:55 +00:00
|
|
|
if (const MachOObjectFile *MachO =
|
|
|
|
dyn_cast<const MachOObjectFile>(o)) {
|
Add a function to get the segment name of a section.
On MachO, sections also have segment names. When a tool looking at a .o file
prints a segment name, this is what they mean. In reality, a .o has only one
anonymous, segment.
This patch adds a MachO only function to fetch that segment name. I named it
getSectionFinalSegmentName since the main use for the name seems to be inform
the linker with segment this section should go to.
The patch also changes MachOObjectFile::getSectionName to return just the
section name instead of computing SegmentName,SectionName.
The main difference from the previous patch is that it doesn't use
InMemoryStruct. It is extremely dangerous: if the endians match it returns
a pointer to the file buffer, if not, it returns a pointer to an internal buffer
that is overwritten in the next API call.
We should change all of this code to use
support::detail::packed_endian_specific_integral like ELF, but since these
functions only handle strings, they work with big and little endian machines
as is.
I have tested this by installing ubuntu 12.10 ppc on qemu, that is why it took
so long :-)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170838 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-21 03:47:03 +00:00
|
|
|
DataRefImpl DR = Section->getRawDataRefImpl();
|
2013-04-18 18:08:55 +00:00
|
|
|
StringRef SegmentName = MachO->getSectionFinalSegmentName(DR);
|
Add a function to get the segment name of a section.
On MachO, sections also have segment names. When a tool looking at a .o file
prints a segment name, this is what they mean. In reality, a .o has only one
anonymous, segment.
This patch adds a MachO only function to fetch that segment name. I named it
getSectionFinalSegmentName since the main use for the name seems to be inform
the linker with segment this section should go to.
The patch also changes MachOObjectFile::getSectionName to return just the
section name instead of computing SegmentName,SectionName.
The main difference from the previous patch is that it doesn't use
InMemoryStruct. It is extremely dangerous: if the endians match it returns
a pointer to the file buffer, if not, it returns a pointer to an internal buffer
that is overwritten in the next API call.
We should change all of this code to use
support::detail::packed_endian_specific_integral like ELF, but since these
functions only handle strings, they work with big and little endian machines
as is.
I have tested this by installing ubuntu 12.10 ppc on qemu, that is why it took
so long :-)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170838 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-21 03:47:03 +00:00
|
|
|
outs() << SegmentName << ",";
|
|
|
|
}
|
2011-10-18 19:32:17 +00:00
|
|
|
StringRef SectionName;
|
|
|
|
if (error(Section->getName(SectionName)))
|
|
|
|
SectionName = "";
|
|
|
|
outs() << SectionName;
|
|
|
|
}
|
|
|
|
outs() << '\t'
|
2012-03-10 02:04:38 +00:00
|
|
|
<< format("%08" PRIx64 " ", Size)
|
2011-10-18 19:32:17 +00:00
|
|
|
<< Name
|
|
|
|
<< '\n';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-12-05 20:12:35 +00:00
|
|
|
static void PrintUnwindInfo(const ObjectFile *o) {
|
|
|
|
outs() << "Unwind info:\n\n";
|
|
|
|
|
|
|
|
if (const COFFObjectFile *coff = dyn_cast<COFFObjectFile>(o)) {
|
|
|
|
printCOFFUnwindInfo(coff);
|
|
|
|
} else {
|
|
|
|
// TODO: Extract DWARF dump tool to objdump.
|
|
|
|
errs() << "This operation is only currently supported "
|
|
|
|
"for COFF object files.\n";
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-09-27 21:04:00 +00:00
|
|
|
static void printPrivateFileHeader(const ObjectFile *o) {
|
|
|
|
if (o->isELF()) {
|
|
|
|
printELFFileHeader(o);
|
|
|
|
} else if (o->isCOFF()) {
|
|
|
|
printCOFFFileHeader(o);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-10-08 00:18:30 +00:00
|
|
|
static void DumpObject(const ObjectFile *o) {
|
2011-10-17 17:13:22 +00:00
|
|
|
outs() << '\n';
|
|
|
|
outs() << o->getFileName()
|
|
|
|
<< ":\tfile format " << o->getFileFormatName() << "\n\n";
|
|
|
|
|
2011-10-08 00:18:30 +00:00
|
|
|
if (Disassemble)
|
2011-10-13 22:17:18 +00:00
|
|
|
DisassembleObject(o, Relocations);
|
|
|
|
if (Relocations && !Disassemble)
|
2011-10-08 00:18:30 +00:00
|
|
|
PrintRelocations(o);
|
2011-10-10 21:21:34 +00:00
|
|
|
if (SectionHeaders)
|
|
|
|
PrintSectionHeaders(o);
|
2011-10-17 17:13:22 +00:00
|
|
|
if (SectionContents)
|
|
|
|
PrintSectionContents(o);
|
2011-10-18 19:32:17 +00:00
|
|
|
if (SymbolTable)
|
|
|
|
PrintSymbolTable(o);
|
2012-12-05 20:12:35 +00:00
|
|
|
if (UnwindInfo)
|
|
|
|
PrintUnwindInfo(o);
|
2013-09-27 21:04:00 +00:00
|
|
|
if (PrivateHeaders)
|
|
|
|
printPrivateFileHeader(o);
|
2011-10-08 00:18:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// @brief Dump each object file in \a a;
|
|
|
|
static void DumpArchive(const Archive *a) {
|
2014-01-25 00:32:01 +00:00
|
|
|
for (Archive::child_iterator i = a->child_begin(), e = a->child_end(); i != e;
|
|
|
|
++i) {
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<Binary> child;
|
2014-01-25 00:32:01 +00:00
|
|
|
if (error_code EC = i->getAsBinary(child)) {
|
2011-11-16 01:24:41 +00:00
|
|
|
// Ignore non-object files.
|
2014-01-25 00:32:01 +00:00
|
|
|
if (EC != object_error::invalid_file_type)
|
|
|
|
errs() << ToolName << ": '" << a->getFileName() << "': " << EC.message()
|
2011-11-16 01:24:41 +00:00
|
|
|
<< ".\n";
|
2011-10-08 00:18:30 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (ObjectFile *o = dyn_cast<ObjectFile>(child.get()))
|
|
|
|
DumpObject(o);
|
|
|
|
else
|
|
|
|
errs() << ToolName << ": '" << a->getFileName() << "': "
|
|
|
|
<< "Unrecognized file type.\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// @brief Open file and figure out how to dump it.
|
|
|
|
static void DumpInput(StringRef file) {
|
|
|
|
// If file isn't stdin, check that it exists.
|
|
|
|
if (file != "-" && !sys::fs::exists(file)) {
|
|
|
|
errs() << ToolName << ": '" << file << "': " << "No such file\n";
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
Add a function to get the segment name of a section.
On MachO, sections also have segment names. When a tool looking at a .o file
prints a segment name, this is what they mean. In reality, a .o has only one
anonymous, segment.
This patch adds a MachO only function to fetch that segment name. I named it
getSectionFinalSegmentName since the main use for the name seems to be inform
the linker with segment this section should go to.
The patch also changes MachOObjectFile::getSectionName to return just the
section name instead of computing SegmentName,SectionName.
The main difference from the previous patch is that it doesn't use
InMemoryStruct. It is extremely dangerous: if the endians match it returns
a pointer to the file buffer, if not, it returns a pointer to an internal buffer
that is overwritten in the next API call.
We should change all of this code to use
support::detail::packed_endian_specific_integral like ELF, but since these
functions only handle strings, they work with big and little endian machines
as is.
I have tested this by installing ubuntu 12.10 ppc on qemu, that is why it took
so long :-)
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170838 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-21 03:47:03 +00:00
|
|
|
if (MachOOpt && Disassemble) {
|
2011-10-08 00:18:30 +00:00
|
|
|
DisassembleInputMachO(file);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Attempt to open the binary.
|
2014-01-15 19:37:43 +00:00
|
|
|
ErrorOr<Binary *> BinaryOrErr = createBinary(file);
|
|
|
|
if (error_code EC = BinaryOrErr.getError()) {
|
|
|
|
errs() << ToolName << ": '" << file << "': " << EC.message() << ".\n";
|
2011-10-08 00:18:30 +00:00
|
|
|
return;
|
|
|
|
}
|
2014-03-06 05:51:42 +00:00
|
|
|
std::unique_ptr<Binary> binary(BinaryOrErr.get());
|
2011-10-08 00:18:30 +00:00
|
|
|
|
2012-08-07 17:53:14 +00:00
|
|
|
if (Archive *a = dyn_cast<Archive>(binary.get()))
|
2011-10-08 00:18:30 +00:00
|
|
|
DumpArchive(a);
|
2012-08-07 17:53:14 +00:00
|
|
|
else if (ObjectFile *o = dyn_cast<ObjectFile>(binary.get()))
|
2011-10-08 00:18:30 +00:00
|
|
|
DumpObject(o);
|
2012-08-07 17:53:14 +00:00
|
|
|
else
|
2011-10-08 00:18:30 +00:00
|
|
|
errs() << ToolName << ": '" << file << "': " << "Unrecognized file type.\n";
|
|
|
|
}
|
|
|
|
|
2011-01-20 06:39:06 +00:00
|
|
|
int main(int argc, char **argv) {
|
|
|
|
// Print a stack trace if we signal out.
|
|
|
|
sys::PrintStackTraceOnErrorSignal();
|
|
|
|
PrettyStackTraceProgram X(argc, argv);
|
|
|
|
llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
|
|
|
|
|
|
|
|
// Initialize targets and assembly printers/parsers.
|
|
|
|
llvm::InitializeAllTargetInfos();
|
2011-07-22 21:58:54 +00:00
|
|
|
llvm::InitializeAllTargetMCs();
|
2011-01-20 06:39:06 +00:00
|
|
|
llvm::InitializeAllAsmParsers();
|
|
|
|
llvm::InitializeAllDisassemblers();
|
|
|
|
|
2012-05-03 23:20:10 +00:00
|
|
|
// Register the target printer for --version.
|
|
|
|
cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
|
|
|
|
|
2011-01-20 06:39:06 +00:00
|
|
|
cl::ParseCommandLineOptions(argc, argv, "llvm object file dumper\n");
|
|
|
|
TripleName = Triple::normalize(TripleName);
|
|
|
|
|
|
|
|
ToolName = argv[0];
|
|
|
|
|
|
|
|
// Defaults to a.out if no filenames specified.
|
|
|
|
if (InputFilenames.size() == 0)
|
|
|
|
InputFilenames.push_back("a.out");
|
|
|
|
|
2011-10-18 19:32:17 +00:00
|
|
|
if (!Disassemble
|
|
|
|
&& !Relocations
|
|
|
|
&& !SectionHeaders
|
|
|
|
&& !SectionContents
|
2012-12-05 20:12:35 +00:00
|
|
|
&& !SymbolTable
|
2013-01-06 03:56:49 +00:00
|
|
|
&& !UnwindInfo
|
|
|
|
&& !PrivateHeaders) {
|
2011-01-20 06:39:06 +00:00
|
|
|
cl::PrintHelpMessage();
|
|
|
|
return 2;
|
|
|
|
}
|
|
|
|
|
2011-10-08 00:18:30 +00:00
|
|
|
std::for_each(InputFilenames.begin(), InputFilenames.end(),
|
|
|
|
DumpInput);
|
2011-01-20 06:39:06 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|