From e793862979e0222682356c280e9b63c65784202b Mon Sep 17 00:00:00 2001 From: Kevin Enderby Date: Wed, 24 Sep 2014 23:08:22 +0000 Subject: [PATCH] =?UTF-8?q?Flush=20out=20enough=20of=20llvm-objdump?= =?UTF-8?q?=E2=80=99s=20SymbolizerSymbolLookUp()=20for=20Mach-O=20files=20?= =?UTF-8?q?to=20get=20the=20literal=20string=20=E2=80=9CHello=20world?= =?UTF-8?q?=E2=80=9D=20printed=20as=20a=20comment=20on=20the=20instruction?= =?UTF-8?q?=20that=20loads=20the=20pointer=20to=20it.=20For=20now=20this?= =?UTF-8?q?=20is=20just=20for=20x86=5F64.=20So=20for=20object=20files=20wi?= =?UTF-8?q?th=20relocation=20entries=20it=20produces=20things=20like:?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit leaq L_.str(%rip), %rax ## literal pool for: "Hello world\n" and similar for fully linked images like executables: leaq 0x4f(%rip), %rax ## literal pool for: "Hello world\n" Also to allow testing against darwin’s otool(1), I hooked up the existing -no-show-raw-insn option to the Mach-O parser code, added the new Mach-O only -full-leading-addr option to match otool(1)'s printing of addresses and also added the new -print-imm-hex option. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218423 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../X86/macho-symbolized-disassembly.test | 9 +- tools/llvm-objdump/MachODump.cpp | 333 ++++++++++++++++-- tools/llvm-objdump/llvm-objdump.cpp | 7 +- tools/llvm-objdump/llvm-objdump.h | 1 + 4 files changed, 312 insertions(+), 38 deletions(-) diff --git a/test/tools/llvm-objdump/X86/macho-symbolized-disassembly.test b/test/tools/llvm-objdump/X86/macho-symbolized-disassembly.test index 23b54563078..01105fbf3fd 100644 --- a/test/tools/llvm-objdump/X86/macho-symbolized-disassembly.test +++ b/test/tools/llvm-objdump/X86/macho-symbolized-disassembly.test @@ -1,4 +1,7 @@ -// RUN: llvm-objdump -d -m %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s +// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s -check-prefix=OBJ +// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.exe.macho-x86_64 | FileCheck %s -check-prefix=EXE -CHECK: leaq L_.str(%rip), %rax -CHECK: callq _printf +OBJ: 0000000000000008 leaq L_.str(%rip), %rax ## literal pool for: "Hello world\n" +OBJ: 0000000000000026 callq _printf + +EXE: 0000000100000f38 leaq 0x4f(%rip), %rax ## literal pool for: "Hello world\n" diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp index 378eacb256a..9415ead09b0 100644 --- a/tools/llvm-objdump/MachODump.cpp +++ b/tools/llvm-objdump/MachODump.cpp @@ -36,6 +36,7 @@ #include "llvm/Support/GraphWriter.h" #include "llvm/Support/MachO.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/raw_ostream.h" @@ -51,6 +52,14 @@ static cl::opt static cl::opt DSYMFile("dsym", cl::desc("Use .dSYM file for debug info")); +static cl::opt + FullLeadingAddr("full-leading-addr", + cl::desc("Print full leading address")); + 
+static cl::opt + PrintImmHex("print-imm-hex", + cl::desc("Use hex format for immediate values")); + static std::string ThumbTripleName; static const Target *GetTarget(const MachOObjectFile *MachOObj, @@ -225,11 +234,14 @@ void llvm::DisassembleInputMachO(StringRef Filename) { DisassembleInputMachO2(Filename, MachOOF.get()); } +typedef DenseMap SymbolAddressMap; + // The block of info used by the Symbolizer call backs. struct DisassembleInfo { bool verbose; MachOObjectFile *O; SectionRef S; + SymbolAddressMap *AddrMap; }; // SymbolizerGetOpInfo() is the operand information call back function. @@ -301,7 +313,6 @@ int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset, // is the offset from the external symbol. if (info->O->getAnyRelocationPCRel(RE)) op_info->Value -= Pc + Offset + Size; - // SymbolRef Symbol = (*info->Relocs)[Idx].second; StringRef SymName; Symbol.getName(SymName); const char *name = SymName.data(); @@ -343,8 +354,142 @@ int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset, } } +// GuessCstringPointer is passed the address of what might be a pointer to a +// literal string in a cstring section. If that address is in a cstring section +// it returns a pointer to that string. Else it returns nullptr. 
+const char *GuessCstringPointer(uint64_t ReferenceValue, + struct DisassembleInfo *info) { + uint32_t LoadCommandCount = info->O->getHeader().ncmds; + MachOObjectFile::LoadCommandInfo Load = info->O->getFirstLoadCommandInfo(); + for (unsigned I = 0;; ++I) { + if (Load.C.cmd == MachO::LC_SEGMENT_64) { + MachO::segment_command_64 Seg = info->O->getSegment64LoadCommand(Load); + for (unsigned J = 0; J < Seg.nsects; ++J) { + MachO::section_64 Sec = info->O->getSection64(Load, J); + uint32_t section_type = Sec.flags & MachO::SECTION_TYPE; + if (section_type == MachO::S_CSTRING_LITERALS && + ReferenceValue >= Sec.addr && + ReferenceValue < Sec.addr + Sec.size) { + uint64_t sect_offset = ReferenceValue - Sec.addr; + uint64_t object_offset = Sec.offset + sect_offset; + StringRef MachOContents = info->O->getData(); + uint64_t object_size = MachOContents.size(); + const char *object_addr = (const char *)MachOContents.data(); + if (object_offset < object_size) { + const char *name = object_addr + object_offset; + return name; + } else { + return nullptr; + } + } + } + } else if (Load.C.cmd == MachO::LC_SEGMENT) { + MachO::segment_command Seg = info->O->getSegmentLoadCommand(Load); + for (unsigned J = 0; J < Seg.nsects; ++J) { + MachO::section Sec = info->O->getSection(Load, J); + uint32_t section_type = Sec.flags & MachO::SECTION_TYPE; + if (section_type == MachO::S_CSTRING_LITERALS && + ReferenceValue >= Sec.addr && + ReferenceValue < Sec.addr + Sec.size) { + uint64_t sect_offset = ReferenceValue - Sec.addr; + uint64_t object_offset = Sec.offset + sect_offset; + StringRef MachOContents = info->O->getData(); + uint64_t object_size = MachOContents.size(); + const char *object_addr = (const char *)MachOContents.data(); + if (object_offset < object_size) { + const char *name = object_addr + object_offset; + return name; + } else { + return nullptr; + } + } + } + } + if (I == LoadCommandCount - 1) + break; + else + Load = info->O->getNextLoadCommandInfo(Load); + } + return 
nullptr; +} + +// GuessLiteralPointer returns a string which for the item in the Mach-O file +// for the address passed in as ReferenceValue for printing as a comment with +// the instruction and also returns the corresponding type of that item +// indirectly through ReferenceType. +// +// If ReferenceValue is an address of literal cstring then a pointer to the +// cstring is returned and ReferenceType is set to +// LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr . +// +// TODO: other literals such as Objective-C CFStrings refs, Selector refs, +// Message refs, Class refs and a Symbol address in a literal pool are yet +// to be done here. +const char *GuessLiteralPointer(uint64_t ReferenceValue, uint64_t ReferencePC, + uint64_t *ReferenceType, + struct DisassembleInfo *info) { + // TODO: This routine's code is only for an x86_64 Mach-O file for now. + unsigned int Arch = info->O->getArch(); + if (Arch != Triple::x86_64) + return nullptr; + + // First see if there is an external relocation entry at the ReferencePC. + uint64_t sect_addr; + info->S.getAddress(sect_addr); + uint64_t sect_offset = ReferencePC - sect_addr; + bool reloc_found = false; + DataRefImpl Rel; + MachO::any_relocation_info RE; + bool isExtern = false; + SymbolRef Symbol; + for (const RelocationRef &Reloc : info->S.relocations()) { + uint64_t RelocOffset; + Reloc.getOffset(RelocOffset); + if (RelocOffset == sect_offset) { + Rel = Reloc.getRawDataRefImpl(); + RE = info->O->getRelocation(Rel); + if (info->O->isRelocationScattered(RE)) + continue; + isExtern = info->O->getPlainRelocationExternal(RE); + if (isExtern) { + symbol_iterator RelocSym = Reloc.getSymbol(); + Symbol = *RelocSym; + } + reloc_found = true; + break; + } + } + // If there is an external relocation entry for a symbol in a section + // then use that symbol's value for the value of the reference. 
+ if (reloc_found && isExtern) { + if (info->O->getAnyRelocationPCRel(RE)) { + unsigned Type = info->O->getAnyRelocationType(RE); + if (Type == MachO::X86_64_RELOC_SIGNED) { + Symbol.getAddress(ReferenceValue); + } + } + } + + // TODO: the code to look for other literals such as Objective-C CFStrings + // refs, Selector refs, Message refs, Class refs will be added here. + + const char *name = GuessCstringPointer(ReferenceValue, info); + if (name) { + // TODO: note when the code is added above for Selector refs and Message + // refs we will need check for that here and set the ReferenceType + // accordingly. + *ReferenceType = LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr; + return name; + } + + // TODO: look for an indirect symbol with this ReferenceValue which is in + // a literal pool. + + return nullptr; +} + // SymbolizerSymbolLookUp is the symbol lookup function passed when creating -// the Symbolizer. It looks up the SymbolValue using the info passed via the +// the Symbolizer. It looks up the ReferenceValue using the info passed via the // pointer to the struct DisassembleInfo that was passed when MCSymbolizer // is created and returns the symbol name that matches the ReferenceValue or // nullptr if none. The ReferenceType is passed in for the IN type of @@ -364,7 +509,7 @@ int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset, // stub is returned indirectly through ReferenceName and then ReferenceType is // set to LLVMDisassembler_ReferenceType_Out_SymbolStub. // -// When this is called with an value loaded via a PC relative load then +// When this is called with an value loaded via a PC relative load then // ReferenceType will be LLVMDisassembler_ReferenceType_In_PCrel_Load then the // SymbolValue is checked to be an address of literal pointer, symbol pointer, // or an Objective-C meta data reference. 
If so the output ReferenceType is @@ -374,20 +519,89 @@ const char *SymbolizerSymbolLookUp(void *DisInfo, uint64_t ReferenceValue, uint64_t ReferencePC, const char **ReferenceName) { struct DisassembleInfo *info = (struct DisassembleInfo *)DisInfo; - *ReferenceName = nullptr; - *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; - unsigned int Arch = info->O->getArch(); - if (Arch == Triple::x86) { - return nullptr; - } else if (Arch == Triple::x86_64) { - return nullptr; - } else if (Arch == Triple::arm) { - return nullptr; - } else if (Arch == Triple::aarch64) { - return nullptr; - } else { + // If no verbose symbolic information is wanted then just return nullptr. + if (info->verbose == false) { + *ReferenceName = nullptr; + *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; return nullptr; } + + const char *SymbolName = nullptr; + StringRef name = info->AddrMap->lookup(ReferenceValue); + if (!name.empty()) + SymbolName = name.data(); + + if (*ReferenceType == LLVMDisassembler_ReferenceType_In_PCrel_Load) { + *ReferenceName = GuessLiteralPointer(ReferenceValue, ReferencePC, + ReferenceType, info); + if (*ReferenceName == nullptr) + *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; + // TODO: other types of references to be added. + } else { + *ReferenceName = nullptr; + *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; + } + + return SymbolName; +} + +// +// This is the memory object used by DisAsm->getInstruction() which has its +// BasePC. This then allows the 'address' parameter to getInstruction() to +// be the actual PC of the instruction. Then when a branch displacement is +// added to the PC of an instruction, the 'ReferenceValue' passed to the +// SymbolizerSymbolLookUp() routine is the correct target address. As in +// the case of a fully linked Mach-O file where a section being disassembled is +// generally not linked at address zero. 
+// +class DisasmMemoryObject : public MemoryObject { + uint8_t *Bytes; + uint64_t Size; + uint64_t BasePC; +public: + DisasmMemoryObject(uint8_t *bytes, uint64_t size, uint64_t basePC) : + Bytes(bytes), Size(size), BasePC(basePC) {} + + uint64_t getBase() const override { return BasePC; } + uint64_t getExtent() const override { return Size; } + + int readByte(uint64_t Addr, uint8_t *Byte) const override { + if (Addr - BasePC >= Size) + return -1; + *Byte = Bytes[Addr - BasePC]; + return 0; + } +}; + +/// \brief Emits the comments that are stored in the CommentStream. +/// Each comment in the CommentStream must end with a newline. +static void emitComments(raw_svector_ostream &CommentStream, + SmallString<128> &CommentsToEmit, + formatted_raw_ostream &FormattedOS, + const MCAsmInfo &MAI) { + // Flush the stream before taking its content. + CommentStream.flush(); + StringRef Comments = CommentsToEmit.str(); + // Get the default information for printing a comment. + const char *CommentBegin = MAI.getCommentString(); + unsigned CommentColumn = MAI.getCommentColumn(); + bool IsFirst = true; + while (!Comments.empty()) { + if (!IsFirst) + FormattedOS << '\n'; + // Emit a line of comments. + FormattedOS.PadToColumn(CommentColumn); + size_t Position = Comments.find('\n'); + FormattedOS << CommentBegin << ' ' << Comments.substr(0, Position); + // Move after the newline character. + Comments = Comments.substr(Position + 1); + IsFirst = false; + } + FormattedOS.flush(); + + // Tell the comment stream that the vector changed underneath it. + CommentsToEmit.clear(); + CommentStream.resync(); } static void DisassembleInputMachO2(StringRef Filename, @@ -445,6 +659,12 @@ static void DisassembleInputMachO2(StringRef Filename, int AsmPrinterVariant = AsmInfo->getAssemblerDialect(); std::unique_ptr IP(TheTarget->createMCInstPrinter( AsmPrinterVariant, *AsmInfo, *InstrInfo, *MRI, *STI)); + // Set the display preference for hex vs. decimal immediates. 
+ IP->setPrintImmHex(PrintImmHex); + // Comment stream and backing vector. + SmallString<128> CommentsToEmit; + raw_svector_ostream CommentStream(CommentsToEmit); + IP->setCommentStream(CommentStream); if (!InstrAnalysis || !AsmInfo || !STI || !DisAsm || !IP) { errs() << "error: couldn't initialize disassembler for target " @@ -467,11 +687,13 @@ static void DisassembleInputMachO2(StringRef Filename, ThumbTarget->createMCSubtargetInfo(ThumbTripleName, MCPU, FeaturesStr)); ThumbCtx.reset(new MCContext(ThumbAsmInfo.get(), ThumbMRI.get(), nullptr)); ThumbDisAsm.reset(ThumbTarget->createMCDisassembler(*ThumbSTI, *ThumbCtx)); -// TODO: add MCSymbolizer here for the ThumbTarget like above for TheTarget. + // TODO: add MCSymbolizer here for the ThumbTarget like above for TheTarget. int ThumbAsmPrinterVariant = ThumbAsmInfo->getAssemblerDialect(); ThumbIP.reset(ThumbTarget->createMCInstPrinter( ThumbAsmPrinterVariant, *ThumbAsmInfo, *ThumbInstrInfo, *ThumbMRI, *ThumbSTI)); + // Set the display preference for hex vs. decimal immediates. + ThumbIP->setPrintImmHex(PrintImmHex); } if (ThumbTarget && (!ThumbInstrAnalysis || !ThumbAsmInfo || !ThumbSTI || @@ -564,7 +786,10 @@ static void DisassembleInputMachO2(StringRef Filename, StringRef Bytes; Sections[SectIdx].getContents(Bytes); - StringRefMemoryObject memoryObject(Bytes); + uint64_t SectAddress = 0; + Sections[SectIdx].getAddress(SectAddress); + DisasmMemoryObject MemoryObject((uint8_t *)Bytes.data(), Bytes.size(), + SectAddress); bool symbolTableWorked = false; // Parse relocations. @@ -581,10 +806,26 @@ static void DisassembleInputMachO2(StringRef Filename, } array_pod_sort(Relocs.begin(), Relocs.end()); + // Create a map of symbol addresses to symbol names for use by + // the SymbolizerSymbolLookUp() routine. 
+ SymbolAddressMap AddrMap; + for (const SymbolRef &Symbol : MachOOF->symbols()) { + SymbolRef::Type ST; + Symbol.getType(ST); + if (ST == SymbolRef::ST_Function || ST == SymbolRef::ST_Data || + ST == SymbolRef::ST_Other) { + uint64_t Address; + Symbol.getAddress(Address); + StringRef SymName; + Symbol.getName(SymName); + AddrMap[Address] = SymName; + } + } // Set up the block of info used by the Symbolizer call backs. SymbolizerInfo.verbose = true; SymbolizerInfo.O = MachOOF; SymbolizerInfo.S = Sections[SectIdx]; + SymbolizerInfo.AddrMap = &AddrMap; // Disassemble symbol by symbol. for (unsigned SymIdx = 0; SymIdx != Symbols.size(); SymIdx++) { @@ -643,14 +884,22 @@ static void DisassembleInputMachO2(StringRef Filename, for (uint64_t Index = Start; Index < End; Index += Size) { MCInst Inst; - uint64_t SectAddress = 0; - Sections[SectIdx].getAddress(SectAddress); - outs() << format("%8" PRIx64 ":\t", SectAddress + Index); + uint64_t PC = SectAddress + Index; + if (FullLeadingAddr) { + if (MachOOF->is64Bit()) + outs() << format("%016" PRIx64, PC); + else + outs() << format("%08" PRIx64, PC); + } else { + outs() << format("%8" PRIx64 ":", PC); + } + if (!NoShowRawInsn) + outs() << "\t"; // Check the data in code table here to see if this is data not an // instruction to be disassembled. 
DiceTable Dice; - Dice.push_back(std::make_pair(SectAddress + Index, DiceRef())); + Dice.push_back(std::make_pair(PC, DiceRef())); dice_table_iterator DTI = std::search(Dices.begin(), Dices.end(), Dice.begin(), Dice.end(), compareDiceTableEntries); @@ -664,24 +913,33 @@ static void DisassembleInputMachO2(StringRef Filename, continue; } + SmallVector AnnotationsBytes; + raw_svector_ostream Annotations(AnnotationsBytes); + bool gotInst; if (isThumb) - gotInst = ThumbDisAsm->getInstruction(Inst, Size, memoryObject, Index, - DebugOut, nulls()); + gotInst = ThumbDisAsm->getInstruction(Inst, Size, MemoryObject, PC, + DebugOut, Annotations); else - gotInst = DisAsm->getInstruction(Inst, Size, memoryObject, Index, - DebugOut, nulls()); + gotInst = DisAsm->getInstruction(Inst, Size, MemoryObject, PC, + DebugOut, Annotations); if (gotInst) { - DumpBytes(StringRef(Bytes.data() + Index, Size)); + if (!NoShowRawInsn) { + DumpBytes(StringRef(Bytes.data() + Index, Size)); + } + formatted_raw_ostream FormattedOS(outs()); + Annotations.flush(); + StringRef AnnotationsStr = Annotations.str(); if (isThumb) - ThumbIP->printInst(&Inst, outs(), ""); + ThumbIP->printInst(&Inst, FormattedOS, AnnotationsStr); else - IP->printInst(&Inst, outs(), ""); + IP->printInst(&Inst, FormattedOS, AnnotationsStr); + emitComments(CommentStream, CommentsToEmit, FormattedOS, *AsmInfo); // Print debug info. if (diContext) { DILineInfo dli = - diContext->getLineInfoForAddress(SectAddress + Index); + diContext->getLineInfoForAddress(PC); // Print valid line info if it changed. 
if (dli != lastLine && dli.Line != 0) outs() << "\t## " << dli.FileName << ':' << dli.Line << ':' @@ -706,10 +964,21 @@ static void DisassembleInputMachO2(StringRef Filename, for (uint64_t Index = 0; Index < SectSize; Index += InstSize) { MCInst Inst; - if (DisAsm->getInstruction(Inst, InstSize, memoryObject, Index, + uint64_t PC = SectAddress + Index; + if (DisAsm->getInstruction(Inst, InstSize, MemoryObject, PC, DebugOut, nulls())) { - outs() << format("%8" PRIx64 ":\t", SectAddress + Index); - DumpBytes(StringRef(Bytes.data() + Index, InstSize)); + if (FullLeadingAddr) { + if (MachOOF->is64Bit()) + outs() << format("%016" PRIx64, PC); + else + outs() << format("%08" PRIx64, PC); + } else { + outs() << format("%8" PRIx64 ":", PC); + } + if (!NoShowRawInsn) { + outs() << "\t"; + DumpBytes(StringRef(Bytes.data() + Index, InstSize)); + } IP->printInst(&Inst, outs(), ""); outs() << "\n"; } else { diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp index 12cf1f72b09..4c753da7bf4 100644 --- a/tools/llvm-objdump/llvm-objdump.cpp +++ b/tools/llvm-objdump/llvm-objdump.cpp @@ -128,9 +128,10 @@ llvm::MAttrs("mattr", cl::desc("Target specific attributes"), cl::value_desc("a1,+a2,-a3,...")); -static cl::opt -NoShowRawInsn("no-show-raw-insn", cl::desc("When disassembling instructions, " - "do not print the instruction bytes.")); +cl::opt +llvm::NoShowRawInsn("no-show-raw-insn", cl::desc("When disassembling " + "instructions, do not print " + "the instruction bytes.")); static cl::opt UnwindInfo("unwind-info", cl::desc("Display unwind information")); diff --git a/tools/llvm-objdump/llvm-objdump.h b/tools/llvm-objdump/llvm-objdump.h index cae10354a91..5ecbb12c6b1 100644 --- a/tools/llvm-objdump/llvm-objdump.h +++ b/tools/llvm-objdump/llvm-objdump.h @@ -27,6 +27,7 @@ extern cl::opt TripleName; extern cl::opt ArchName; extern cl::opt MCPU; extern cl::list MAttrs; +extern cl::opt NoShowRawInsn; // Various helper functions. 
bool error(std::error_code ec);