From 4651bca31bdad27184fa0d36640bf5ef1d83cf5c Mon Sep 17 00:00:00 2001
From: Chris Lattner
Date: Sun, 21 Jun 2009 19:21:25 +0000
Subject: [PATCH] implement enough of a lexer to get through Olden/health/Output/health.llc.s without errors.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@73855 91177308-0d34-0410-b5e6-96231b3b80d8
---
Review notes (ignored by git-am; changes relative to the upstream commit,
all made in place so the hunk line counts are unchanged):
 - restored the <cerrno> header name on the added #include (LexDigit uses
   errno/EINVAL/ERANGE);
 - LexHash stops at '\r' as well as '\n' (the condition tested '\n' twice);
 - LexDigit no longer rejects "0b..." as a backward label before the binary
   branch is reached, and reports numeric errors at TokStart;
 - corrected the "Skip %" comments in LexIdentifier/LexPercent.
The "<>" literals on the removed llvm-mc.cpp lines are reproduced as found.

 tools/llvm-mc/AsmLexer.cpp | 162 +++++++++++++++++++++++++++++++++++--
 tools/llvm-mc/AsmLexer.h   |  16 +++-
 tools/llvm-mc/llvm-mc.cpp  |  16 +++-
 3 files changed, 184 insertions(+), 10 deletions(-)

diff --git a/tools/llvm-mc/AsmLexer.cpp b/tools/llvm-mc/AsmLexer.cpp
index da86465d7fe..578eec18526 100644
--- a/tools/llvm-mc/AsmLexer.cpp
+++ b/tools/llvm-mc/AsmLexer.cpp
@@ -14,6 +14,7 @@
 #include "AsmLexer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include <cerrno>
 using namespace llvm;
 
 AsmLexer::AsmLexer(SourceMgr &SM) : SrcMgr(SM) {
@@ -23,6 +24,10 @@ AsmLexer::AsmLexer(SourceMgr &SM) : SrcMgr(SM) {
   TokStart = 0;
 }
 
+SMLoc AsmLexer::getLoc() const {
+  return SMLoc::getFromPointer(TokStart);
+}
+
 void AsmLexer::PrintError(const char *Loc, const std::string &Msg) const {
   SrcMgr.PrintError(SMLoc::getFromPointer(Loc), Msg);
 }
@@ -31,6 +36,13 @@ void AsmLexer::PrintError(SMLoc Loc, const std::string &Msg) const {
   SrcMgr.PrintError(Loc, Msg);
 }
 
+/// ReturnError - Set the error to the specified string at the specified
+/// location.  This is defined to always return asmtok::Error.
+asmtok::TokKind AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
+  PrintError(Loc, Msg);
+  return asmtok::Error;
+}
+
 int AsmLexer::getNextChar() {
   char CurChar = *CurPtr++;
   switch (CurChar) {
@@ -59,6 +71,129 @@ int AsmLexer::getNextChar() {
   }
 }
 
+/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
+asmtok::TokKind AsmLexer::LexIdentifier() {
+  while (isalnum(*CurPtr) || *CurPtr == '_' || *CurPtr == '$' ||
+         *CurPtr == '.' || *CurPtr == '@')
+    ++CurPtr;
+  CurStrVal.assign(TokStart, CurPtr);
+  return asmtok::Identifier;
+}
+
+/// LexPercent: Register: %[a-zA-Z0-9]+
+asmtok::TokKind AsmLexer::LexPercent() {
+  if (!isalnum(*CurPtr))
+    return asmtok::Error;  // Must have at least one character.
+  while (isalnum(*CurPtr))
+    ++CurPtr;
+  CurStrVal.assign(TokStart, CurPtr);   // Include %
+  return asmtok::Register;
+}
+
+/// LexSlash: Slash: /
+///           C-Style Comment: /* ... */
+asmtok::TokKind AsmLexer::LexSlash() {
+  if (*CurPtr != '*')
+    return asmtok::Slash;
+
+  // C Style comment.
+  ++CurPtr;  // skip the star.
+  while (1) {
+    int CurChar = getNextChar();
+    switch (CurChar) {
+    case EOF:
+      PrintError(TokStart, "Unterminated comment!");
+      return asmtok::Error;
+    case '*':
+      // End of the comment?
+      if (CurPtr[0] != '/') break;
+
+      ++CurPtr;   // End the */.
+      return LexToken();
+    }
+  }
+}
+
+/// LexHash: Comment: #[^\n]*
+asmtok::TokKind AsmLexer::LexHash() {
+  int CurChar = getNextChar();
+  while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
+    CurChar = getNextChar();
+
+  if (CurChar == EOF)
+    return asmtok::Eof;
+  return asmtok::EndOfStatement;
+}
+
+
+/// LexDigit: First character is [0-9].
+///   Local Label: [0-9][:]
+///   Forward/Backward Label: [0-9][fb] (but "0b" starts a binary integer)
+///   Binary integer: 0b[01]+
+///   Octal integer: 0[0-7]+
+///   Hex integer: 0x[0-9a-fA-F]+
+///   Decimal integer: [1-9][0-9]*
+/// TODO: FP literal.
+asmtok::TokKind AsmLexer::LexDigit() {
+  if (*CurPtr == ':')
+    return asmtok::Error;  // FIXME LOCAL LABEL.
+  if (*CurPtr == 'f' || (*CurPtr == 'b' && CurPtr[-1] != '0'))
+    return asmtok::Error;  // FIXME FORWARD/BACKWARD LABEL.
+
+  // Decimal integer: [1-9][0-9]*
+  if (CurPtr[-1] != '0') {
+    while (isdigit(*CurPtr))
+      ++CurPtr;
+    CurIntVal = strtoll(TokStart, 0, 10);
+    return asmtok::IntVal;
+  }
+
+  if (*CurPtr == 'b') {
+    ++CurPtr;
+    const char *NumStart = CurPtr;
+    while (CurPtr[0] == '0' || CurPtr[0] == '1')
+      ++CurPtr;
+
+    // Requires at least one binary digit.
+    if (CurPtr == NumStart)
+      return ReturnError(TokStart, "Invalid binary number");
+    CurIntVal = strtoll(NumStart, 0, 2);
+    return asmtok::IntVal;
+  }
+
+  if (*CurPtr == 'x') {
+    ++CurPtr;
+    const char *NumStart = CurPtr;
+    while (isxdigit(CurPtr[0]))
+      ++CurPtr;
+
+    // Requires at least one hex digit.
+    if (CurPtr == NumStart)
+      return ReturnError(TokStart, "Invalid hexadecimal number");
+
+    errno = 0;
+    CurIntVal = strtoll(NumStart, 0, 16);
+    if (errno == EINVAL)
+      return ReturnError(TokStart, "Invalid hexadecimal number");
+    if (errno == ERANGE) {
+      errno = 0;
+      CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
+      if (errno == EINVAL)
+        return ReturnError(TokStart, "Invalid hexadecimal number");
+      if (errno == ERANGE)
+        return ReturnError(TokStart, "Hexadecimal number out of range");
+    }
+    return asmtok::IntVal;
+  }
+
+  // Must be an octal number, it starts with 0.
+  while (*CurPtr >= '0' && *CurPtr <= '7')
+    ++CurPtr;
+  CurIntVal = strtoll(TokStart, 0, 8);
+  return asmtok::IntVal;
+}
+
+
 asmtok::TokKind AsmLexer::LexToken() {
   TokStart = CurPtr;
   // This always consumes at least one character.
@@ -66,9 +201,9 @@ asmtok::TokKind AsmLexer::LexToken() {
 
   switch (CurChar) {
   default:
-    // Handle letters: [a-zA-Z_]
-//    if (isalpha(CurChar) || CurChar == '_' || CurChar == '#')
-//      return LexIdentifier();
+    // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
+    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
+      return LexIdentifier();
 
     // Unknown character, emit an error.
     return asmtok::Error;
@@ -76,12 +211,29 @@ asmtok::TokKind AsmLexer::LexToken() {
   case 0:
   case ' ':
   case '\t':
-  case '\n':
-  case '\r':
     // Ignore whitespace.
     return LexToken();
+  case '\n': // FALL THROUGH.
+  case '\r': // FALL THROUGH.
+  case ';': return asmtok::EndOfStatement;
   case ':': return asmtok::Colon;
   case '+': return asmtok::Plus;
   case '-': return asmtok::Minus;
+  case '(': return asmtok::LParen;
+  case ')': return asmtok::RParen;
+  case '*': return asmtok::Star;
+  case ',': return asmtok::Comma;
+  case '$': return asmtok::Dollar;
+  case '%': return LexPercent();
+  case '/': return LexSlash();
+  case '#': return LexHash();
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+    return LexDigit();
+
+  // TODO: Quoted identifiers (objc methods etc)
+  // local labels: [0-9][:]
+  // Forward/backward labels: [0-9][fb]
+  // Integers, fp constants, character constants.
   }
 }
\ No newline at end of file
diff --git a/tools/llvm-mc/AsmLexer.h b/tools/llvm-mc/AsmLexer.h
index 08e6f9c6eeb..9e694c7a301 100644
--- a/tools/llvm-mc/AsmLexer.h
+++ b/tools/llvm-mc/AsmLexer.h
@@ -29,12 +29,16 @@ namespace asmtok {
     Eof, Error,
 
     Identifier,
+    Register,
     IntVal,
 
-
+    EndOfStatement,
     Colon,
     Plus,
-    Minus
+    Minus,
+    Slash,    // '/'
+    LParen, RParen,
+    Star, Comma, Dollar
   };
 }
 
@@ -66,7 +70,7 @@ public:
   asmtok::TokKind getKind() const { return CurKind; }
 
   const std::string &getCurStrVal() const {
-    assert(CurKind == asmtok::Identifier &&
+    assert((CurKind == asmtok::Identifier || CurKind == asmtok::Register) &&
            "This token doesn't have a string value");
     return CurStrVal;
   }
@@ -82,9 +86,15 @@ public:
 
 private:
   int getNextChar();
+  asmtok::TokKind ReturnError(const char *Loc, const std::string &Msg);
 
   /// LexToken - Read the next token and return its code.
   asmtok::TokKind LexToken();
+  asmtok::TokKind LexIdentifier();
+  asmtok::TokKind LexPercent();
+  asmtok::TokKind LexSlash();
+  asmtok::TokKind LexHash();
+  asmtok::TokKind LexDigit();
 };
 
 } // end namespace llvm
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index 83642988e37..20f353ca670 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -72,17 +72,29 @@ static int AssembleInput(const char *ProgName) {
   asmtok::TokKind Tok = Lexer.Lex();
   while (Tok != asmtok::Eof) {
     switch (Tok) {
-    default: outs() << "<>\n"; break;
-    case asmtok::Error: outs() << "<>\n"; break;
+    default: Lexer.PrintError(Lexer.getLoc(), "driver: unknown token"); break;
+    case asmtok::Error:
+      Lexer.PrintError(Lexer.getLoc(), "error, bad token");
+      break;
     case asmtok::Identifier:
       outs() << "identifier: " << Lexer.getCurStrVal() << '\n';
       break;
+    case asmtok::Register:
+      outs() << "register: " << Lexer.getCurStrVal() << '\n';
+      break;
     case asmtok::IntVal:
       outs() << "int: " << Lexer.getCurIntVal() << '\n';
       break;
+    case asmtok::EndOfStatement: outs() << "EndOfStatement\n"; break;
     case asmtok::Colon: outs() << "Colon\n"; break;
     case asmtok::Plus: outs() << "Plus\n"; break;
     case asmtok::Minus: outs() << "Minus\n"; break;
+    case asmtok::Slash: outs() << "Slash\n"; break;
+    case asmtok::LParen: outs() << "LParen\n"; break;
+    case asmtok::RParen: outs() << "RParen\n"; break;
+    case asmtok::Star: outs() << "Star\n"; break;
+    case asmtok::Comma: outs() << "Comma\n"; break;
+    case asmtok::Dollar: outs() << "Dollar\n"; break;
     }
 
     Tok = Lexer.Lex();