/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- * vim: set ts=8 sts=4 et sw=4 tw=99: * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #ifndef frontend_TokenStream_h #define frontend_TokenStream_h // JS lexical scanner interface. #include "mozilla/ArrayUtils.h" #include "mozilla/Assertions.h" #include "mozilla/Attributes.h" #include "mozilla/DebugOnly.h" #include "mozilla/PodOperations.h" #include "mozilla/UniquePtr.h" #include #include #include #include "jscntxt.h" #include "jspubtd.h" #include "frontend/TokenKind.h" #include "js/Vector.h" #include "vm/RegExpObject.h" struct KeywordInfo; namespace js { namespace frontend { class AutoAwaitIsKeyword; struct TokenPos { uint32_t begin; // Offset of the token's first char. uint32_t end; // Offset of 1 past the token's last char. TokenPos() {} TokenPos(uint32_t begin, uint32_t end) : begin(begin), end(end) {} // Return a TokenPos that covers left, right, and anything in between. 
static TokenPos box(const TokenPos& left, const TokenPos& right) { MOZ_ASSERT(left.begin <= left.end); MOZ_ASSERT(left.end <= right.begin); MOZ_ASSERT(right.begin <= right.end); return TokenPos(left.begin, right.end); } bool operator==(const TokenPos& bpos) const { return begin == bpos.begin && end == bpos.end; } bool operator!=(const TokenPos& bpos) const { return begin != bpos.begin || end != bpos.end; } bool operator <(const TokenPos& bpos) const { return begin < bpos.begin; } bool operator <=(const TokenPos& bpos) const { return begin <= bpos.begin; } bool operator >(const TokenPos& bpos) const { return !(*this <= bpos); } bool operator >=(const TokenPos& bpos) const { return !(*this < bpos); } bool encloses(const TokenPos& pos) const { return begin <= pos.begin && pos.end <= end; } }; enum DecimalPoint { NoDecimal = false, HasDecimal = true }; class TokenStream; struct Token { private: // Sometimes the parser needs to inform the tokenizer to interpret // subsequent text in a particular manner: for example, to tokenize a // keyword as an identifier, not as the actual keyword, on the right-hand // side of a dotted property access. Such information is communicated to // the tokenizer as a Modifier when getting the next token. // // Ideally this definition would reside in TokenStream as that's the real // user, but the debugging-use of it here causes a cyclic dependency (and // C++ provides no way to forward-declare an enum inside a class). So // define it here, then typedef it into TokenStream with static consts to // bring the initializers into scope. enum Modifier { // Normal operation. None, // Looking for an operand, not an operator. In practice, this means // that when '/' is seen, we look for a regexp instead of just returning // TOK_DIV. Operand, // Treat keywords as names by returning TOK_NAME. 
KeywordIsName, // Treat subsequent characters as the tail of a template literal, after // a template substitution, beginning with a "}", continuing with zero // or more template literal characters, and ending with either "${" or // the end of the template literal. For example: // // var entity = "world"; // var s = `Hello ${entity}!`; // ^ TemplateTail context TemplateTail, }; enum ModifierException { NoException, // Used in following 2 cases: // a) After |yield| we look for a token on the same line that starts an // expression (Operand): |yield |. If no token is found, the // |yield| stands alone, and the next token on a subsequent line must // be: a comma continuing a comma expression, a semicolon terminating // the statement that ended with |yield|, or the start of another // statement (possibly an expression statement). The comma/semicolon // cases are gotten as operators (None), contrasting with Operand // earlier. // b) After an arrow function with a block body in an expression // statement, the next token must be: a colon in a conditional // expression, a comma continuing a comma expression, a semicolon // terminating the statement, or the token on a subsequent line that is // the start of another statement (possibly an expression statement). // Colon is gotten as operator (None), and it should only be gotten in // conditional expression and missing it results in SyntaxError. // Comma/semicolon cases are also gotten as operators (None), and 4th // case is gotten after them. If no comma/semicolon found but EOL, // the next token should be gotten as operand in 4th case (especially if // '/' is the first character). So we should peek the token as // operand before try getting colon/comma/semicolon. // See also the comment in Parser::assignExpr(). NoneIsOperand, // If a semicolon is inserted automatically, the next token is already // gotten with None, but we expect Operand. 
OperandIsNone, // If name of method definition is `get` or `set`, the next token is // already gotten with KeywordIsName, but we expect None. NoneIsKeywordIsName, }; friend class TokenStream; public: TokenKind type; // char value or above enumerator TokenPos pos; // token position in file union { private: friend struct Token; PropertyName* name; // non-numeric atom JSAtom* atom; // potentially-numeric atom struct { double value; // floating point number DecimalPoint decimalPoint; // literal contains '.' } number; RegExpFlag reflags; // regexp flags; use tokenbuf to access // regexp chars } u; #ifdef DEBUG Modifier modifier; // Modifier used to get this token ModifierException modifierException; // Exception for this modifier #endif // This constructor is necessary only for MSVC 2013 and how it compiles the // initialization of TokenStream::tokens. That field is initialized as // tokens() in the constructor init-list. This *should* zero the entire // array, then (because Token has a non-trivial constructor, because // TokenPos has a user-provided constructor) call the implicit Token // constructor on each element, which would call the TokenPos constructor // for Token::pos and do nothing. (All of which is equivalent to just // zeroing TokenStream::tokens.) But MSVC 2013 (2010/2012 don't have this // bug) doesn't zero out each element, so we need this extra constructor to // make it do the right thing. (Token is used primarily by reference or // pointer, and it's only initialized a very few places, so having a // user-defined constructor won't hurt perf.) See also bug 920318. 
Token() : pos(0, 0) { MOZ_MAKE_MEM_UNDEFINED(&type, sizeof(type)); } // Mutators void setName(PropertyName* name) { MOZ_ASSERT(type == TOK_NAME); u.name = name; } void setAtom(JSAtom* atom) { MOZ_ASSERT(type == TOK_STRING || type == TOK_TEMPLATE_HEAD || type == TOK_NO_SUBS_TEMPLATE); u.atom = atom; } void setRegExpFlags(js::RegExpFlag flags) { MOZ_ASSERT(type == TOK_REGEXP); MOZ_ASSERT((flags & AllFlags) == flags); u.reflags = flags; } void setNumber(double n, DecimalPoint decimalPoint) { MOZ_ASSERT(type == TOK_NUMBER); u.number.value = n; u.number.decimalPoint = decimalPoint; } // Type-safe accessors PropertyName* name() const { MOZ_ASSERT(type == TOK_NAME); return u.name->JSAtom::asPropertyName(); // poor-man's type verification } bool nameContainsEscape() const { PropertyName* n = name(); return pos.begin + n->length() != pos.end; } JSAtom* atom() const { MOZ_ASSERT(type == TOK_STRING || type == TOK_TEMPLATE_HEAD || type == TOK_NO_SUBS_TEMPLATE); return u.atom; } js::RegExpFlag regExpFlags() const { MOZ_ASSERT(type == TOK_REGEXP); MOZ_ASSERT((u.reflags & AllFlags) == u.reflags); return u.reflags; } double number() const { MOZ_ASSERT(type == TOK_NUMBER); return u.number.value; } DecimalPoint decimalPoint() const { MOZ_ASSERT(type == TOK_NUMBER); return u.number.decimalPoint; } }; struct CompileError { JSErrorReport report; char* message; ErrorArgumentsType argumentsType; CompileError() : message(nullptr), argumentsType(ArgumentsAreUnicode) {} ~CompileError(); void throwError(JSContext* cx); private: // CompileError owns raw allocated memory, so disable assignment and copying // for safety. void operator=(const CompileError&) = delete; CompileError(const CompileError&) = delete; }; // Ideally, tokenizing would be entirely independent of context. But the // strict mode flag, which is in SharedContext, affects tokenizing, and // TokenStream needs to see it. 
//
// This class is a tiny back-channel from TokenStream to the strict mode flag
// that avoids exposing the rest of SharedContext to TokenStream.
//
class StrictModeGetter {
  public:
    // Returns whether strict mode is currently in effect.
    virtual bool strictMode() = 0;

  protected:
    // Non-public and non-virtual: an implementation cannot be destroyed
    // through a StrictModeGetter pointer, which would be undefined behavior
    // because this interface has no virtual destructor.
    ~StrictModeGetter() = default;
};

// TokenStream is the lexical scanner for Javascript source text.
//
// It takes a buffer of char16_t characters and linearly scans it into |Token|s.
// Internally the class uses a four element circular buffer |tokens| of
// |Token|s. As an index for |tokens|, the member |cursor| points to the
// current token.
// Calls to getToken() increase |cursor| by one and return the new current
// token. If a TokenStream was just created, the current token is initialized
// with random data (i.e. not initialized). It is therefore important that
// one of the first four member functions listed below is called first.
// The circular buffer lets us go back up to two tokens from the last
// scanned token. Internally, the relative number of backward steps that were
// taken (via ungetToken()) after the last token was scanned is stored in
// |lookahead|.
//
// The following table lists in which situations it is safe to call each listed
// function. No checks are made by the functions in non-debug builds.
//
// Function Name     | Precondition; changes to |lookahead|
// ------------------+---------------------------------------------------------
// getToken          | none; if |lookahead > 0| then |lookahead--|
// peekToken         | none; if |lookahead == 0| then |lookahead == 1|
// peekTokenSameLine | none; if |lookahead == 0| then |lookahead == 1|
// matchToken        | none; if |lookahead > 0| and the match succeeds then
//                   |       |lookahead--|
// consumeKnownToken | none; if |lookahead > 0| then |lookahead--|
// ungetToken        | 0 <= |lookahead| <= |maxLookahead - 1|; |lookahead++|
//
// The behavior of the token scanning process (see getTokenInternal()) can be
// modified by calling one of the first four above listed member functions with
// an optional argument of type Modifier.
However, the modifier will be // ignored unless |lookahead == 0| holds. Due to constraints of the grammar, // this turns out not to be a problem in practice. See the // mozilla.dev.tech.js-engine.internals thread entitled 'Bug in the scanner?' // for more details: // https://groups.google.com/forum/?fromgroups=#!topic/mozilla.dev.tech.js-engine.internals/2JLH5jRcr7E). // // The methods seek() and tell() allow to rescan from a previous visited // location of the buffer. // class MOZ_STACK_CLASS TokenStream { // Unicode separators that are treated as line terminators, in addition to \n, \r. enum { LINE_SEPARATOR = 0x2028, PARA_SEPARATOR = 0x2029 }; static const size_t ntokens = 4; // 1 current + 2 lookahead, rounded // to power of 2 to avoid divmod by 3 static const unsigned maxLookahead = 2; static const unsigned ntokensMask = ntokens - 1; public: typedef Vector CharBuffer; TokenStream(ExclusiveContext* cx, const ReadOnlyCompileOptions& options, const char16_t* base, size_t length, StrictModeGetter* smg); ~TokenStream(); bool checkOptions(); // Accessors. 
const Token& currentToken() const { return tokens[cursor]; } bool isCurrentTokenType(TokenKind type) const { return currentToken().type == type; } const CharBuffer& getTokenbuf() const { return tokenbuf; } const char* getFilename() const { return filename; } unsigned getLineno() const { return lineno; } unsigned getColumn() const { return userbuf.offset() - linebase - 1; } bool getMutedErrors() const { return mutedErrors; } JSVersion versionNumber() const { return VersionNumber(options().version); } JSVersion versionWithFlags() const { return options().version; } PropertyName* currentName() const { if (isCurrentTokenType(TOK_YIELD)) return cx->names().yield; MOZ_ASSERT(isCurrentTokenType(TOK_NAME)); return currentToken().name(); } PropertyName* nextName() const { if (nextToken().type == TOK_YIELD) return cx->names().yield; MOZ_ASSERT(nextToken().type == TOK_NAME); return nextToken().name(); } bool isCurrentTokenAssignment() const { return TokenKindIsAssignment(currentToken().type); } // Flag methods. bool isEOF() const { return flags.isEOF; } bool sawOctalEscape() const { return flags.sawOctalEscape; } bool hadError() const { return flags.hadError; } // TokenStream-specific error reporters. bool reportError(unsigned errorNumber, ...); bool reportErrorNoOffset(unsigned errorNumber, ...); bool reportWarning(unsigned errorNumber, ...); static const uint32_t NoOffset = UINT32_MAX; // General-purpose error reporters. You should avoid calling these // directly, and instead use the more succinct alternatives (e.g. // reportError()) in TokenStream, Parser, and BytecodeEmitter. 
bool reportCompileErrorNumberVA(uint32_t offset, unsigned flags, unsigned errorNumber, va_list args); bool reportStrictModeErrorNumberVA(uint32_t offset, bool strictMode, unsigned errorNumber, va_list args); bool reportStrictWarningErrorNumberVA(uint32_t offset, unsigned errorNumber, va_list args); // asm.js reporter void reportAsmJSError(uint32_t offset, unsigned errorNumber, ...); JSAtom* getRawTemplateStringAtom() { MOZ_ASSERT(currentToken().type == TOK_TEMPLATE_HEAD || currentToken().type == TOK_NO_SUBS_TEMPLATE); const char16_t* cur = userbuf.rawCharPtrAt(currentToken().pos.begin + 1); const char16_t* end; if (currentToken().type == TOK_TEMPLATE_HEAD) { // Of the form |`...${| or |}...${| end = userbuf.rawCharPtrAt(currentToken().pos.end - 2); } else { // NO_SUBS_TEMPLATE is of the form |`...`| or |}...`| end = userbuf.rawCharPtrAt(currentToken().pos.end - 1); } CharBuffer charbuf(cx); while (cur < end) { int32_t ch = *cur; if (ch == '\r') { ch = '\n'; if ((cur + 1 < end) && (*(cur + 1) == '\n')) cur++; } if (!charbuf.append(ch)) return nullptr; cur++; } return AtomizeChars(cx, charbuf.begin(), charbuf.length()); } private: // These are private because they should only be called by the tokenizer // while tokenizing not by, for example, BytecodeEmitter. bool reportStrictModeError(unsigned errorNumber, ...); bool strictMode() const { return strictModeGetter && strictModeGetter->strictMode(); } static JSAtom* atomize(ExclusiveContext* cx, CharBuffer& cb); bool putIdentInTokenbuf(const char16_t* identStart); struct Flags { bool isEOF:1; // Hit end of file. bool isDirtyLine:1; // Non-whitespace since start of line. bool sawOctalEscape:1; // Saw an octal character escape. bool hadError:1; // Hit a syntax error, at start or during a // token. bool hitOOM:1; // Hit OOM. 
Flags() : isEOF(), isDirtyLine(), sawOctalEscape(), hadError(), hitOOM() {} }; bool awaitIsKeyword = false; friend class AutoAwaitIsKeyword; public: typedef Token::Modifier Modifier; static MOZ_CONSTEXPR_VAR Modifier None = Token::None; static MOZ_CONSTEXPR_VAR Modifier Operand = Token::Operand; static MOZ_CONSTEXPR_VAR Modifier KeywordIsName = Token::KeywordIsName; static MOZ_CONSTEXPR_VAR Modifier TemplateTail = Token::TemplateTail; typedef Token::ModifierException ModifierException; static MOZ_CONSTEXPR_VAR ModifierException NoException = Token::NoException; static MOZ_CONSTEXPR_VAR ModifierException NoneIsOperand = Token::NoneIsOperand; static MOZ_CONSTEXPR_VAR ModifierException OperandIsNone = Token::OperandIsNone; static MOZ_CONSTEXPR_VAR ModifierException NoneIsKeywordIsName = Token::NoneIsKeywordIsName; void addModifierException(ModifierException modifierException) { #ifdef DEBUG const Token& next = nextToken(); if (next.modifierException == NoneIsOperand) { // Token after yield expression without operand already has // NoneIsOperand exception. MOZ_ASSERT(modifierException == OperandIsNone); MOZ_ASSERT(next.type != TOK_DIV, "next token requires contextual specifier to be parsed unambiguously"); // Do not update modifierException. 
return; } MOZ_ASSERT(next.modifierException == NoException); switch (modifierException) { case NoneIsOperand: MOZ_ASSERT(next.modifier == Operand); MOZ_ASSERT(next.type != TOK_DIV, "next token requires contextual specifier to be parsed unambiguously"); break; case OperandIsNone: MOZ_ASSERT(next.modifier == None); MOZ_ASSERT(next.type != TOK_DIV && next.type != TOK_REGEXP, "next token requires contextual specifier to be parsed unambiguously"); break; case NoneIsKeywordIsName: MOZ_ASSERT(next.modifier == KeywordIsName); MOZ_ASSERT(next.type != TOK_NAME); break; default: MOZ_CRASH("unexpected modifier exception"); } tokens[(cursor + 1) & ntokensMask].modifierException = modifierException; #endif } void verifyConsistentModifier(Modifier modifier, Token lookaheadToken) { #ifdef DEBUG // Easy case: modifiers match. if (modifier == lookaheadToken.modifier) return; if (lookaheadToken.modifierException == OperandIsNone) { // getToken(Operand) permissibly following getToken(). if (modifier == Operand && lookaheadToken.modifier == None) return; } if (lookaheadToken.modifierException == NoneIsOperand) { // getToken() permissibly following getToken(Operand). if (modifier == None && lookaheadToken.modifier == Operand) return; } if (lookaheadToken.modifierException == NoneIsKeywordIsName) { // getToken() permissibly following getToken(KeywordIsName). if (modifier == None && lookaheadToken.modifier == KeywordIsName) return; } MOZ_ASSERT_UNREACHABLE("this token was previously looked up with a " "different modifier, potentially making " "tokenization non-deterministic"); #endif } // Advance to the next token. If the token stream encountered an error, // return false. Otherwise return true and store the token kind in |*ttp|. bool getToken(TokenKind* ttp, Modifier modifier = None) { // Check for a pushed-back token resulting from mismatching lookahead. 
if (lookahead != 0) { MOZ_ASSERT(!flags.hadError); lookahead--; cursor = (cursor + 1) & ntokensMask; TokenKind tt = currentToken().type; MOZ_ASSERT(tt != TOK_EOL); verifyConsistentModifier(modifier, currentToken()); *ttp = tt; return true; } return getTokenInternal(ttp, modifier); } // Push the last scanned token back into the stream. void ungetToken() { MOZ_ASSERT(lookahead < maxLookahead); lookahead++; cursor = (cursor - 1) & ntokensMask; } bool peekToken(TokenKind* ttp, Modifier modifier = None) { if (lookahead > 0) { MOZ_ASSERT(!flags.hadError); verifyConsistentModifier(modifier, nextToken()); *ttp = nextToken().type; return true; } if (!getTokenInternal(ttp, modifier)) return false; ungetToken(); return true; } bool peekTokenPos(TokenPos* posp, Modifier modifier = None) { if (lookahead == 0) { TokenKind tt; if (!getTokenInternal(&tt, modifier)) return false; ungetToken(); MOZ_ASSERT(hasLookahead()); } else { MOZ_ASSERT(!flags.hadError); verifyConsistentModifier(modifier, nextToken()); } *posp = nextToken().pos; return true; } // This is like peekToken(), with one exception: if there is an EOL // between the end of the current token and the start of the next token, it // return true and store TOK_EOL in |*ttp|. In that case, no token with // TOK_EOL is actually created, just a TOK_EOL TokenKind is returned, and // currentToken() shouldn't be consulted. (This is the only place TOK_EOL // is produced.) MOZ_ALWAYS_INLINE bool peekTokenSameLine(TokenKind* ttp, Modifier modifier = None) { const Token& curr = currentToken(); // If lookahead != 0, we have scanned ahead at least one token, and // |lineno| is the line that the furthest-scanned token ends on. If // it's the same as the line that the current token ends on, that's a // stronger condition than what we are looking for, and we don't need // to return TOK_EOL. 
if (lookahead != 0) { bool onThisLine; if (!srcCoords.isOnThisLine(curr.pos.end, lineno, &onThisLine)) return reportError(JSMSG_OUT_OF_MEMORY); if (onThisLine) { MOZ_ASSERT(!flags.hadError); verifyConsistentModifier(modifier, nextToken()); *ttp = nextToken().type; return true; } } // The above check misses two cases where we don't have to return // TOK_EOL. // - The next token starts on the same line, but is a multi-line token. // - The next token starts on the same line, but lookahead==2 and there // is a newline between the next token and the one after that. // The following test is somewhat expensive but gets these cases (and // all others) right. TokenKind tmp; if (!getToken(&tmp, modifier)) return false; const Token& next = currentToken(); ungetToken(); *ttp = srcCoords.lineNum(curr.pos.end) == srcCoords.lineNum(next.pos.begin) ? next.type : TOK_EOL; return true; } // Get the next token from the stream if its kind is |tt|. bool matchToken(bool* matchedp, TokenKind tt, Modifier modifier = None) { TokenKind token; if (!getToken(&token, modifier)) return false; if (token == tt) { *matchedp = true; } else { ungetToken(); *matchedp = false; } return true; } void consumeKnownToken(TokenKind tt, Modifier modifier = None) { bool matched; MOZ_ASSERT(hasLookahead()); MOZ_ALWAYS_TRUE(matchToken(&matched, tt, modifier)); MOZ_ALWAYS_TRUE(matched); } // Like matchToken(..., TOK_NAME) but further matching the name token only // if it has the given characters, without containing escape sequences. // If the name token has the given characters yet *does* contain an escape, // a syntax error will be reported. // // This latter behavior makes this method unsuitable for use in any context // where ASI might occur. In such places, an escaped "contextual keyword" // on a new line is the start of an ExpressionStatement, not a continuation // of a StatementListItem (or ImportDeclaration or ExportDeclaration, in // modules). 
bool matchContextualKeyword(bool* matchedp, Handle keyword, Modifier modifier = None) { TokenKind token; if (!getToken(&token, modifier)) return false; if (token == TOK_NAME && currentToken().name() == keyword) { if (currentToken().nameContainsEscape()) { reportError(JSMSG_ESCAPED_KEYWORD); return false; } *matchedp = true; } else { *matchedp = false; ungetToken(); } return true; } bool nextTokenEndsExpr(bool* endsExpr) { TokenKind tt; if (!peekToken(&tt)) return false; *endsExpr = isExprEnding[tt]; return true; } class MOZ_STACK_CLASS Position { public: // The Token fields may contain pointers to atoms, so for correct // rooting we must ensure collection of atoms is disabled while objects // of this class are live. Do this by requiring a dummy AutoKeepAtoms // reference in the constructor. // // This class is explicity ignored by the analysis, so don't add any // more pointers to GC things here! explicit Position(AutoKeepAtoms&) { } private: Position(const Position&) = delete; friend class TokenStream; const char16_t* buf; Flags flags; unsigned lineno; size_t linebase; size_t prevLinebase; Token currentToken; unsigned lookahead; Token lookaheadTokens[maxLookahead]; }; bool advance(size_t position); void tell(Position*); void seek(const Position& pos); bool seek(const Position& pos, const TokenStream& other); #ifdef DEBUG inline bool debugHasNoLookahead() const { return lookahead == 0; } #endif const char16_t* rawCharPtrAt(size_t offset) const { return userbuf.rawCharPtrAt(offset); } const char16_t* rawLimit() const { return userbuf.limit(); } bool hasDisplayURL() const { return displayURL_ != nullptr; } char16_t* displayURL() { return displayURL_.get(); } bool hasSourceMapURL() const { return sourceMapURL_ != nullptr; } char16_t* sourceMapURL() { return sourceMapURL_.get(); } // If |atom| is not a keyword in this version, return true with *ttp // unchanged. 
// // If it is a reserved word in this version and strictness mode, and thus // can't be present in correct code, report a SyntaxError and return false. // // If it is a keyword, like "if", the behavior depends on ttp. If ttp is // null, report a SyntaxError ("if is a reserved identifier") and return // false. If ttp is non-null, return true with the keyword's TokenKind in // *ttp. bool checkForKeyword(JSAtom* atom, TokenKind* ttp); // Same semantics as above, but for the provided keyword. bool checkForKeyword(const KeywordInfo* kw, TokenKind* ttp); // This class maps a userbuf offset (which is 0-indexed) to a line number // (which is 1-indexed) and a column index (which is 0-indexed). class SourceCoords { // For a given buffer holding source code, |lineStartOffsets_| has one // element per line of source code, plus one sentinel element. Each // non-sentinel element holds the buffer offset for the start of the // corresponding line of source code. For this example script: // // 1 // xyz [line starts at offset 0] // 2 var x; [line starts at offset 7] // 3 [line starts at offset 14] // 4 var y; [line starts at offset 15] // // |lineStartOffsets_| is: // // [0, 7, 14, 15, MAX_PTR] // // To convert a "line number" to a "line index" (i.e. an index into // |lineStartOffsets_|), subtract |initialLineNum_|. E.g. line 3's // line index is (3 - initialLineNum_), which is 2. Therefore // lineStartOffsets_[2] holds the buffer offset for the start of line 3, // which is 14. (Note that |initialLineNum_| is often 1, but not // always.) // // The first element is always 0, and the last element is always the // MAX_PTR sentinel. // // offset-to-line/column lookups are O(log n) in the worst case (binary // search), but in practice they're heavily clustered and we do better // than that by using the previous lookup's result (lastLineIndex_) as // a starting point. // // Checking if an offset lies within a particular line number // (isOnThisLine()) is O(1). 
// Vector lineStartOffsets_; uint32_t initialLineNum_; // This is mutable because it's modified on every search, but that fact // isn't visible outside this class. mutable uint32_t lastLineIndex_; uint32_t lineIndexOf(uint32_t offset) const; static const uint32_t MAX_PTR = UINT32_MAX; uint32_t lineIndexToNum(uint32_t lineIndex) const { return lineIndex + initialLineNum_; } uint32_t lineNumToIndex(uint32_t lineNum) const { return lineNum - initialLineNum_; } public: SourceCoords(ExclusiveContext* cx, uint32_t ln); bool add(uint32_t lineNum, uint32_t lineStartOffset); bool fill(const SourceCoords& other); bool isOnThisLine(uint32_t offset, uint32_t lineNum, bool* onThisLine) const { uint32_t lineIndex = lineNumToIndex(lineNum); if (lineIndex + 1 >= lineStartOffsets_.length()) // +1 due to sentinel return false; *onThisLine = lineStartOffsets_[lineIndex] <= offset && offset < lineStartOffsets_[lineIndex + 1]; return true; } uint32_t lineNum(uint32_t offset) const; uint32_t columnIndex(uint32_t offset) const; void lineNumAndColumnIndex(uint32_t offset, uint32_t* lineNum, uint32_t* columnIndex) const; }; SourceCoords srcCoords; JSAtomState& names() const { return cx->names(); } ExclusiveContext* context() const { return cx; } const ReadOnlyCompileOptions& options() const { return options_; } private: // This is the low-level interface to the JS source code buffer. It just // gets raw chars, basically. TokenStreams functions are layered on top // and do some extra stuff like converting all EOL sequences to '\n', // tracking the line number, and setting |flags.isEOF|. (The "raw" in "raw // chars" refers to the lack of EOL sequence normalization.) // // buf[0..length-1] often represents a substring of some larger source, // where we have only the substring in memory. The |startOffset| argument // indicates the offset within this larger string at which our string // begins, the offset of |buf[0]|. 
class TokenBuf { public: TokenBuf(ExclusiveContext* cx, const char16_t* buf, size_t length, size_t startOffset) : base_(buf), startOffset_(startOffset), limit_(buf + length), ptr(buf) { } bool hasRawChars() const { return ptr < limit_; } bool atStart() const { return offset() == 0; } size_t startOffset() const { return startOffset_; } size_t offset() const { return startOffset_ + mozilla::PointerRangeSize(base_, ptr); } const char16_t* rawCharPtrAt(size_t offset) const { MOZ_ASSERT(startOffset_ <= offset); MOZ_ASSERT(offset - startOffset_ <= mozilla::PointerRangeSize(base_, limit_)); return base_ + (offset - startOffset_); } const char16_t* limit() const { return limit_; } char16_t getRawChar() { return *ptr++; // this will nullptr-crash if poisoned } char16_t peekRawChar() const { return *ptr; // this will nullptr-crash if poisoned } bool matchRawChar(char16_t c) { if (*ptr == c) { // this will nullptr-crash if poisoned ptr++; return true; } return false; } bool matchRawCharBackwards(char16_t c) { MOZ_ASSERT(ptr); // make sure it hasn't been poisoned if (*(ptr - 1) == c) { ptr--; return true; } return false; } void ungetRawChar() { MOZ_ASSERT(ptr); // make sure it hasn't been poisoned ptr--; } const char16_t* addressOfNextRawChar(bool allowPoisoned = false) const { MOZ_ASSERT_IF(!allowPoisoned, ptr); // make sure it hasn't been poisoned return ptr; } // Use this with caution! void setAddressOfNextRawChar(const char16_t* a, bool allowPoisoned = false) { MOZ_ASSERT_IF(!allowPoisoned, a); ptr = a; } #ifdef DEBUG // Poison the TokenBuf so it cannot be accessed again. void poison() { ptr = nullptr; } #endif static bool isRawEOLChar(int32_t c) { return c == '\n' || c == '\r' || c == LINE_SEPARATOR || c == PARA_SEPARATOR; } // Returns the offset of the next EOL, but stops once 'max' characters // have been scanned (*including* the char at startOffset_). 
size_t findEOLMax(size_t start, size_t max); private: const char16_t* base_; // base of buffer uint32_t startOffset_; // offset of base_[0] const char16_t* limit_; // limit for quick bounds check const char16_t* ptr; // next char to get }; bool getTokenInternal(TokenKind* ttp, Modifier modifier); bool getBracedUnicode(uint32_t* code); bool getStringOrTemplateToken(int untilChar, Token** tp); int32_t getChar(); int32_t getCharIgnoreEOL(); void ungetChar(int32_t c); void ungetCharIgnoreEOL(int32_t c); Token* newToken(ptrdiff_t adjust); bool peekUnicodeEscape(int32_t* c); bool matchUnicodeEscapeIdStart(int32_t* c); bool matchUnicodeEscapeIdent(int32_t* c); bool peekChars(int n, char16_t* cp); bool getDirectives(bool isMultiline, bool shouldWarnDeprecated); bool getDirective(bool isMultiline, bool shouldWarnDeprecated, const char* directive, int directiveLength, const char* errorMsgPragma, mozilla::UniquePtr* destination); bool getDisplayURL(bool isMultiline, bool shouldWarnDeprecated); bool getSourceMappingURL(bool isMultiline, bool shouldWarnDeprecated); // |expect| cannot be an EOL char. bool matchChar(int32_t expect) { MOZ_ASSERT(!TokenBuf::isRawEOLChar(expect)); return MOZ_LIKELY(userbuf.hasRawChars()) && userbuf.matchRawChar(expect); } void consumeKnownChar(int32_t expect) { mozilla::DebugOnly c = getChar(); MOZ_ASSERT(c == expect); } int32_t peekChar() { int32_t c = getChar(); ungetChar(c); return c; } void skipChars(int n) { while (--n >= 0) getChar(); } void updateLineInfoForEOL(); void updateFlagsForEOL(); const Token& nextToken() const { MOZ_ASSERT(hasLookahead()); return tokens[(cursor + 1) & ntokensMask]; } bool hasLookahead() const { return lookahead > 0; } // Options used for parsing/tokenizing. 
const ReadOnlyCompileOptions& options_; Token tokens[ntokens]; // circular token buffer unsigned cursor; // index of last parsed token unsigned lookahead; // count of lookahead tokens unsigned lineno; // current line number Flags flags; // flags -- see above size_t linebase; // start of current line size_t prevLinebase; // start of previous line; size_t(-1) if on the first line TokenBuf userbuf; // user input buffer const char* filename; // input filename or null mozilla::UniquePtr displayURL_; // the user's requested source URL or null mozilla::UniquePtr sourceMapURL_; // source map's filename or null CharBuffer tokenbuf; // current token string buffer uint8_t isExprEnding[TOK_LIMIT];// which tokens definitely terminate exprs? ExclusiveContext* const cx; bool mutedErrors; StrictModeGetter* strictModeGetter; // used to test for strict mode }; class MOZ_STACK_CLASS AutoAwaitIsKeyword { private: TokenStream* ts_; bool oldAwaitIsKeyword_; public: AutoAwaitIsKeyword(TokenStream* ts, bool awaitIsKeyword) { ts_ = ts; oldAwaitIsKeyword_ = ts_->awaitIsKeyword; ts_->awaitIsKeyword = awaitIsKeyword; } ~AutoAwaitIsKeyword() { ts_->awaitIsKeyword = oldAwaitIsKeyword_; ts_ = nullptr; } }; // Steal one JSREPORT_* bit (see jsapi.h) to tell that arguments to the error // message have const char16_t* type, not const char*. #define JSREPORT_UC 0x100 extern const char* TokenKindToDesc(TokenKind tt); } // namespace frontend } // namespace js extern JS_FRIEND_API(int) js_fgets(char* buf, int size, FILE* file); #ifdef DEBUG extern const char* TokenKindToString(js::frontend::TokenKind tt); #endif #endif /* frontend_TokenStream_h */