//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // Implement the Lexer for TableGen. // //===----------------------------------------------------------------------===// #include "TGLexer.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/Streams.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Config/config.h" #include #include #include #include #include using namespace llvm; TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { CurBuffer = 0; CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); CurPtr = CurBuf->getBufferStart(); TokStart = 0; } SMLoc TGLexer::getLoc() const { return SMLoc::getFromPointer(TokStart); } /// ReturnError - Set the error to the specified string at the specified /// location. This is defined to always return tgtok::Error. tgtok::TokKind TGLexer::ReturnError(const char *Loc, const std::string &Msg) { PrintError(Loc, Msg); return tgtok::Error; } void TGLexer::PrintError(const char *Loc, const std::string &Msg) const { SrcMgr.PrintMessage(SMLoc::getFromPointer(Loc), Msg, "error"); } void TGLexer::PrintError(SMLoc Loc, const std::string &Msg) const { SrcMgr.PrintMessage(Loc, Msg, "error"); } int TGLexer::getNextChar() { char CurChar = *CurPtr++; switch (CurChar) { default: return (unsigned char)CurChar; case 0: { // A nul character in the stream is either the end of the current buffer or // a random nul in the file. Disambiguate that here. if (CurPtr-1 != CurBuf->getBufferEnd()) return 0; // Just whitespace. // If this is the end of an included file, pop the parent file off the // include stack. SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); if (ParentIncludeLoc != SMLoc()) { CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); CurPtr = ParentIncludeLoc.getPointer(); return getNextChar(); } // Otherwise, return end of file. --CurPtr; // Another call to lex will return EOF again. return EOF; } case '\n': case '\r': // Handle the newline character by ignoring it and incrementing the line // count. However, be careful about 'dos style' files with \n\r in them. // Only treat a \n\r or \r\n as a single line. if ((*CurPtr == '\n' || (*CurPtr == '\r')) && *CurPtr != CurChar) ++CurPtr; // Eat the two char newline sequence. return '\n'; } } tgtok::TokKind TGLexer::LexToken() { TokStart = CurPtr; // This always consumes at least one character. int CurChar = getNextChar(); switch (CurChar) { default: // Handle letters: [a-zA-Z_] if (isalpha(CurChar) || CurChar == '_' || CurChar == '#') return LexIdentifier(); // Unknown character, emit an error. return ReturnError(TokStart, "Unexpected character"); case EOF: return tgtok::Eof; case ':': return tgtok::colon; case ';': return tgtok::semi; case '.': return tgtok::period; case ',': return tgtok::comma; case '<': return tgtok::less; case '>': return tgtok::greater; case ']': return tgtok::r_square; case '{': return tgtok::l_brace; case '}': return tgtok::r_brace; case '(': return tgtok::l_paren; case ')': return tgtok::r_paren; case '=': return tgtok::equal; case '?': return tgtok::question; case 0: case ' ': case '\t': case '\n': case '\r': // Ignore whitespace. return LexToken(); case '/': // If this is the start of a // comment, skip until the end of the line or // the end of the buffer. if (*CurPtr == '/') SkipBCPLComment(); else if (*CurPtr == '*') { if (SkipCComment()) return tgtok::Error; } else // Otherwise, this is an error. return ReturnError(TokStart, "Unexpected character"); return LexToken(); case '-': case '+': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return LexNumber(); case '"': return LexString(); case '$': return LexVarName(); case '[': return LexBracket(); case '!': return LexExclaim(); } } /// LexString - Lex "[^"]*" tgtok::TokKind TGLexer::LexString() { const char *StrStart = CurPtr; CurStrVal = ""; while (*CurPtr != '"') { // If we hit the end of the buffer, report an error. if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd()) return ReturnError(StrStart, "End of file in string literal"); if (*CurPtr == '\n' || *CurPtr == '\r') return ReturnError(StrStart, "End of line in string literal"); if (*CurPtr != '\\') { CurStrVal += *CurPtr++; continue; } ++CurPtr; switch (*CurPtr) { case '\\': case '\'': case '"': // These turn into their literal character. CurStrVal += *CurPtr++; break; case 't': CurStrVal += '\t'; ++CurPtr; break; case 'n': CurStrVal += '\n'; ++CurPtr; break; case '\n': case '\r': return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); // If we hit the end of the buffer, report an error. case '\0': if (CurPtr == CurBuf->getBufferEnd()) return ReturnError(StrStart, "End of file in string literal"); // FALL THROUGH default: return ReturnError(CurPtr, "invalid escape in string literal"); } } ++CurPtr; return tgtok::StrVal; } tgtok::TokKind TGLexer::LexVarName() { if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') return ReturnError(TokStart, "Invalid variable name"); // Otherwise, we're ok, consume the rest of the characters. const char *VarNameStart = CurPtr++; while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') ++CurPtr; CurStrVal.assign(VarNameStart, CurPtr); return tgtok::VarName; } tgtok::TokKind TGLexer::LexIdentifier() { // The first letter is [a-zA-Z_]. const char *IdentStart = TokStart; // Match the rest of the identifier regex: [0-9a-zA-Z_]* while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_' || *CurPtr == '#') { // If this contains a '#', make sure it's value if (*CurPtr == '#') { if (strncmp(CurPtr, "#NAME#", 6) != 0) { return tgtok::Error; } CurPtr += 6; } else { ++CurPtr; } } // Check to see if this identifier is a keyword. unsigned Len = CurPtr-IdentStart; if (Len == 3 && !memcmp(IdentStart, "int", 3)) return tgtok::Int; if (Len == 3 && !memcmp(IdentStart, "bit", 3)) return tgtok::Bit; if (Len == 4 && !memcmp(IdentStart, "bits", 4)) return tgtok::Bits; if (Len == 6 && !memcmp(IdentStart, "string", 6)) return tgtok::String; if (Len == 4 && !memcmp(IdentStart, "list", 4)) return tgtok::List; if (Len == 4 && !memcmp(IdentStart, "code", 4)) return tgtok::Code; if (Len == 3 && !memcmp(IdentStart, "dag", 3)) return tgtok::Dag; if (Len == 5 && !memcmp(IdentStart, "class", 5)) return tgtok::Class; if (Len == 3 && !memcmp(IdentStart, "def", 3)) return tgtok::Def; if (Len == 4 && !memcmp(IdentStart, "defm", 4)) return tgtok::Defm; if (Len == 10 && !memcmp(IdentStart, "multiclass", 10)) return tgtok::MultiClass; if (Len == 5 && !memcmp(IdentStart, "field", 5)) return tgtok::Field; if (Len == 3 && !memcmp(IdentStart, "let", 3)) return tgtok::Let; if (Len == 2 && !memcmp(IdentStart, "in", 2)) return tgtok::In; if (Len == 7 && !memcmp(IdentStart, "include", 7)) { if (LexInclude()) return tgtok::Error; return Lex(); } CurStrVal.assign(IdentStart, CurPtr); return tgtok::Id; } /// LexInclude - We just read the "include" token. Get the string token that /// comes next and enter the include. bool TGLexer::LexInclude() { // The token after the include must be a string. tgtok::TokKind Tok = LexToken(); if (Tok == tgtok::Error) return true; if (Tok != tgtok::StrVal) { PrintError(getLoc(), "Expected filename after include"); return true; } // Get the string. std::string Filename = CurStrVal; CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr)); if (CurBuffer == -1) { PrintError(getLoc(), "Could not find include file '" + Filename + "'"); return true; } // Save the line number and lex buffer of the includer. CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); CurPtr = CurBuf->getBufferStart(); return false; } void TGLexer::SkipBCPLComment() { ++CurPtr; // skip the second slash. while (1) { switch (*CurPtr) { case '\n': case '\r': return; // Newline is end of comment. case 0: // If this is the end of the buffer, end the comment. if (CurPtr == CurBuf->getBufferEnd()) return; break; } // Otherwise, skip the character. ++CurPtr; } } /// SkipCComment - This skips C-style /**/ comments. The only difference from C /// is that we allow nesting. bool TGLexer::SkipCComment() { ++CurPtr; // skip the star. unsigned CommentDepth = 1; while (1) { int CurChar = getNextChar(); switch (CurChar) { case EOF: PrintError(TokStart, "Unterminated comment!"); return true; case '*': // End of the comment? if (CurPtr[0] != '/') break; ++CurPtr; // End the */. if (--CommentDepth == 0) return false; break; case '/': // Start of a nested comment? if (CurPtr[0] != '*') break; ++CurPtr; ++CommentDepth; break; } } } /// LexNumber - Lex: /// [-+]?[0-9]+ /// 0x[0-9a-fA-F]+ /// 0b[01]+ tgtok::TokKind TGLexer::LexNumber() { if (CurPtr[-1] == '0') { if (CurPtr[0] == 'x') { ++CurPtr; const char *NumStart = CurPtr; while (isxdigit(CurPtr[0])) ++CurPtr; // Requires at least one hex digit. if (CurPtr == NumStart) return ReturnError(TokStart, "Invalid hexadecimal number"); errno = 0; CurIntVal = strtoll(NumStart, 0, 16); if (errno == EINVAL) return ReturnError(TokStart, "Invalid hexadecimal number"); if (errno == ERANGE) { errno = 0; CurIntVal = (int64_t)strtoull(NumStart, 0, 16); if (errno == EINVAL) return ReturnError(TokStart, "Invalid hexadecimal number"); if (errno == ERANGE) return ReturnError(TokStart, "Hexadecimal number out of range"); } return tgtok::IntVal; } else if (CurPtr[0] == 'b') { ++CurPtr; const char *NumStart = CurPtr; while (CurPtr[0] == '0' || CurPtr[0] == '1') ++CurPtr; // Requires at least one binary digit. if (CurPtr == NumStart) return ReturnError(CurPtr-2, "Invalid binary number"); CurIntVal = strtoll(NumStart, 0, 2); return tgtok::IntVal; } } // Check for a sign without a digit. if (!isdigit(CurPtr[0])) { if (CurPtr[-1] == '-') return tgtok::minus; else if (CurPtr[-1] == '+') return tgtok::plus; } while (isdigit(CurPtr[0])) ++CurPtr; CurIntVal = strtoll(TokStart, 0, 10); return tgtok::IntVal; } /// LexBracket - We just read '['. If this is a code block, return it, /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' tgtok::TokKind TGLexer::LexBracket() { if (CurPtr[0] != '{') return tgtok::l_square; ++CurPtr; const char *CodeStart = CurPtr; while (1) { int Char = getNextChar(); if (Char == EOF) break; if (Char != '}') continue; Char = getNextChar(); if (Char == EOF) break; if (Char == ']') { CurStrVal.assign(CodeStart, CurPtr-2); return tgtok::CodeFragment; } } return ReturnError(CodeStart-2, "Unterminated Code Block"); } /// LexExclaim - Lex '!' and '![a-zA-Z]+'. tgtok::TokKind TGLexer::LexExclaim() { if (!isalpha(*CurPtr)) return ReturnError(CurPtr-1, "Invalid \"!operator\""); const char *Start = CurPtr++; while (isalpha(*CurPtr)) ++CurPtr; // Check to see which operator this is. unsigned Len = CurPtr-Start; if (Len == 3 && !memcmp(Start, "con", 3)) return tgtok::XConcat; if (Len == 3 && !memcmp(Start, "sra", 3)) return tgtok::XSRA; if (Len == 3 && !memcmp(Start, "srl", 3)) return tgtok::XSRL; if (Len == 3 && !memcmp(Start, "shl", 3)) return tgtok::XSHL; if (Len == 9 && !memcmp(Start, "strconcat", 9)) return tgtok::XStrConcat; if (Len == 10 && !memcmp(Start, "nameconcat", 10)) return tgtok::XNameConcat; if (Len == 5 && !memcmp(Start, "subst", 5)) return tgtok::XSubst; if (Len == 7 && !memcmp(Start, "foreach", 7)) return tgtok::XForEach; if (Len == 4 && !memcmp(Start, "cast", 4)) return tgtok::XCast; if (Len == 3 && !memcmp(Start, "car", 3)) return tgtok::XCar; if (Len == 3 && !memcmp(Start, "cdr", 3)) return tgtok::XCdr; if (Len == 4 && !memcmp(Start, "null", 4)) return tgtok::XNull; if (Len == 2 && !memcmp(Start, "if", 2)) return tgtok::XIf; return ReturnError(Start-1, "Unknown operator"); }