From c05db4075d27189cb653cea2553c48333d5a879e Mon Sep 17 00:00:00 2001 From: Tooru Fujisawa Date: Fri, 7 Aug 2015 08:12:51 +0900 Subject: [PATCH] #393, Bug 1135377 - Part 8: Disallow extended pattern in RegExp with unicode flag. r=till, f=anba --- js/src/irregexp/RegExpParser.cpp | 95 ++++++++++++-- js/src/js.msg | 4 + .../RegExp/unicode-disallow-extended.js | 117 ++++++++++++++++++ js/src/vm/Xdr.h | 4 +- 4 files changed, 207 insertions(+), 13 deletions(-) create mode 100644 js/src/tests/ecma_6/RegExp/unicode-disallow-extended.js diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp index 52ee9942c..f5ecda260 100644 --- a/js/src/irregexp/RegExpParser.cpp +++ b/js/src/irregexp/RegExpParser.cpp @@ -405,6 +405,31 @@ NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to) return alloc->newInfallible(RangeAtom(alloc, from, to), false, 0, 0); } +static bool +IsSyntaxCharacter(widechar c) +{ + switch (c) { + case '^': + case '$': + case '\\': + case '.': + case '*': + case '+': + case '?': + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '|': + case '/': + return true; + default: + return false; + } +} + #ifdef DEBUG // Currently only used in an assert.kASSERT. static bool @@ -459,16 +484,23 @@ RegExpParser::ParseClassCharacterEscape(widechar* code) widechar controlLetter = Next(); widechar letter = controlLetter & ~('A' ^ 'a'); // For compatibility with JSC, inside a character class - // we also accept digits and underscore as control characters. - if ((controlLetter >= '0' && controlLetter <= '9') || - controlLetter == '_' || - (letter >= 'A' && letter <= 'Z')) { + // we also accept digits and underscore as control characters, + // but only in non-unicode mode + if ((!unicode_ && + ((controlLetter >= '0' && controlLetter <= '9') || + controlLetter == '_')) || + (letter >= 'A' && letter <= 'Z')) + { Advance(2); // Control letters mapped to ASCII control characters in the range // 0x00-0x1f. 
*code = controlLetter & 0x1f; return true; } + if (unicode_) { + ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); + return false; + } // We match JSC in reading the backslash as a literal // character instead of as starting an escape. *code = '\\'; @@ -476,9 +508,26 @@ RegExpParser::ParseClassCharacterEscape(widechar* code) } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': - // For compatibility, we interpret a decimal escape that isn't - // a back reference (and therefore either \0 or not valid according - // to the specification) as a 1..3 digit octal character code. + if (unicode_) { + if (current() == '0') { + // Consume the '0' here, as every other escape branch consumes + // its own characters; otherwise it is re-read as a literal '0'. + Advance(); + // In unicode mode \0 must not be followed by a decimal digit. + if (IsDecimalDigit(current())) { + ReportError(JSMSG_INVALID_DECIMAL_ESCAPE); + return false; + } + *code = 0; + return true; + } + ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); + return false; + } + // For compatibility, outside of unicode mode, we interpret a decimal + // escape that isn't a back reference (and therefore either \0 or not + // valid according to the specification) as a 1..3 digit octal + // character code. *code = ParseOctalLiteral(); return true; case 'x': { @@ -488,8 +537,12 @@ RegExpParser::ParseClassCharacterEscape(widechar* code) *code = value; return true; } + if (unicode_) { + ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); + return false; + } // If \x is not followed by a two-digit hexadecimal, treat it - // as an identity escape. + // as an identity escape in non-unicode mode. *code = 'x'; return true; } @@ -527,10 +580,14 @@ RegExpParser::ParseClassCharacterEscape(widechar* code) return true; } default: { - // Extended identity escape. We accept any character that hasn't - // been matched by a more specific case, not just the subset required - // by the ECMAScript specification. + // Extended identity escape (non-unicode only). We accept any character + // that hasn't been matched by a more specific case, not just the subset + // required by the ECMAScript specification. 
widechar result = current(); + if (unicode_ && result != '-' && !IsSyntaxCharacter(result)) { + ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); + return false; + } Advance(); *code = result; return true; @@ -1388,6 +1437,8 @@ RegExpParser::ParseDisjunction() capture_index); } builder->AddAtom(body); + if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD)) + continue; // For compatability with JSC and ES3, we allow quantifiers after // lookaheads, and break in all cases. break; @@ -1527,6 +1578,8 @@ RegExpParser::ParseDisjunction() builder->AddAtom(atom); break; } + if (unicode_) + return ReportError(JSMSG_BACK_REF_OUT_OF_RANGE); widechar first_digit = Next(); if (first_digit == '8' || first_digit == '9') { // Treat as identity escape @@ -1537,6 +1590,14 @@ RegExpParser::ParseDisjunction() } // FALLTHROUGH case '0': { + if (unicode_) { + Advance(2); + if (IsDecimalDigit(current())) + return ReportError(JSMSG_INVALID_DECIMAL_ESCAPE); + builder->AddCharacter(0); + break; + } + Advance(); size_t octal = ParseOctalLiteral(); builder->AddCharacter(octal); @@ -1571,6 +1632,8 @@ RegExpParser::ParseDisjunction() // Convert lower case letters to uppercase. widechar letter = controlLetter & ~('a' ^ 'A'); if (letter < 'A' || 'Z' < letter) { + if (unicode_) + return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); // controlLetter is not in range 'A'-'Z' or 'a'-'z'. // This is outside the specification. We match JSC in // reading the backslash as a literal character instead @@ -1588,6 +1651,8 @@ RegExpParser::ParseDisjunction() if (ParseHexEscape(2, &value)) { builder->AddCharacter(value); } else { + if (unicode_) + return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); builder->AddCharacter('x'); } break; @@ -1639,12 +1704,16 @@ RegExpParser::ParseDisjunction() } default: // Identity escape. 
+ if (unicode_ && !IsSyntaxCharacter(Next())) + return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); builder->AddCharacter(Next()); Advance(2); break; } break; case '{': { + if (unicode_) + return ReportError(JSMSG_RAW_BRACE_IN_REGEP); int dummy; if (ParseIntervalQuantifier(&dummy, &dummy)) return ReportError(JSMSG_NOTHING_TO_REPEAT); @@ -1661,6 +1730,10 @@ RegExpParser::ParseDisjunction() builder->AddAtom(LeadSurrogateAtom(alloc, c)); else if (unicode::IsTrailSurrogate(c)) builder->AddAtom(TrailSurrogateAtom(alloc, c)); + else if (c == ']') + return ReportError(JSMSG_RAW_BRACKET_IN_REGEP); + else if (c == '}') + return ReportError(JSMSG_RAW_BRACE_IN_REGEP); else builder->AddCharacter(c); Advance(); diff --git a/js/src/js.msg b/js/src/js.msg index 5b163ef3a..a8e94a693 100644 --- a/js/src/js.msg +++ b/js/src/js.msg @@ -447,8 +447,10 @@ MSG_DEF(JSMSG_INVALID_TIME_ZONE, 1, JSEXN_RANGEERR, "invalid time zone in MSG_DEF(JSMSG_UNDEFINED_CURRENCY, 0, JSEXN_TYPEERR, "undefined currency in NumberFormat() with currency style") // RegExp +MSG_DEF(JSMSG_BACK_REF_OUT_OF_RANGE, 0, JSEXN_SYNTAXERR, "back reference out of range in regular expression") MSG_DEF(JSMSG_BAD_CLASS_RANGE, 0, JSEXN_SYNTAXERR, "invalid range in character class") MSG_DEF(JSMSG_ESCAPE_AT_END_OF_REGEXP, 0, JSEXN_SYNTAXERR, "\\ at end of pattern") +MSG_DEF(JSMSG_INVALID_DECIMAL_ESCAPE, 0, JSEXN_SYNTAXERR, "invalid decimal escape in regular expression") MSG_DEF(JSMSG_INVALID_GROUP, 0, JSEXN_SYNTAXERR, "invalid regexp group") MSG_DEF(JSMSG_INVALID_IDENTITY_ESCAPE, 0, JSEXN_SYNTAXERR, "invalid identity escape in regular expression") MSG_DEF(JSMSG_INVALID_UNICODE_ESCAPE, 0, JSEXN_SYNTAXERR, "invalid unicode escape in regular expression") @@ -457,6 +459,8 @@ MSG_DEF(JSMSG_NEWREGEXP_FLAGGED, 0, JSEXN_TYPEERR, "can't supply flags whe MSG_DEF(JSMSG_NOTHING_TO_REPEAT, 0, JSEXN_SYNTAXERR, "nothing to repeat") MSG_DEF(JSMSG_NUMBERS_OUT_OF_ORDER, 0, JSEXN_SYNTAXERR, "numbers out of order in {} quantifier.") 
MSG_DEF(JSMSG_RANGE_WITH_CLASS_ESCAPE, 0, JSEXN_SYNTAXERR, "character class escape cannot be used in class range in regular expression") +MSG_DEF(JSMSG_RAW_BRACE_IN_REGEP, 0, JSEXN_SYNTAXERR, "raw brace is not allowed in regular expression with unicode flag") +MSG_DEF(JSMSG_RAW_BRACKET_IN_REGEP, 0, JSEXN_SYNTAXERR, "raw bracket is not allowed in regular expression with unicode flag") MSG_DEF(JSMSG_TOO_MANY_PARENS, 0, JSEXN_INTERNALERR, "too many parentheses in regular expression") MSG_DEF(JSMSG_UNICODE_OVERFLOW, 0, JSEXN_SYNTAXERR, "unicode codepoint should not be greater than 0x10FFFF in regular expression") MSG_DEF(JSMSG_UNMATCHED_RIGHT_PAREN, 0, JSEXN_SYNTAXERR, "unmatched ) in regular expression") diff --git a/js/src/tests/ecma_6/RegExp/unicode-disallow-extended.js b/js/src/tests/ecma_6/RegExp/unicode-disallow-extended.js new file mode 100644 index 000000000..d1f775fac --- /dev/null +++ b/js/src/tests/ecma_6/RegExp/unicode-disallow-extended.js @@ -0,0 +1,117 @@ +var BUGNUMBER = 1135377; +var summary = "Implement RegExp unicode flag -- disallow extended patterns."; + +print(BUGNUMBER + ": " + summary); + +// IdentityEscape + +assertEqArray(/\^\$\\\.\*\+\?\(\)\[\]\{\}\|/u.exec("^$\\.*+?()[]{}|"), + ["^$\\.*+?()[]{}|"]); +assertThrowsInstanceOf(() => eval(`/\\A/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\-/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\U{10}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\U0000/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\uD83D\\U0000/u`), SyntaxError); + +assertEqArray(/[\^\$\\\.\*\+\?\(\)\[\]\{\}\|]+/u.exec("^$\\.*+?()[]{}|"), + ["^$\\.*+?()[]{}|"]); +assertThrowsInstanceOf(() => eval(`/[\\A]/u`), SyntaxError); +assertEqArray(/[A\-Z]+/u.exec("a-zABC"), + ["-"]); +assertThrowsInstanceOf(() => eval(`/[\\U{10}]/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/[\\U0000]/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/[\\uD83D\\U0000]/u`), SyntaxError); + +// 
PatternCharacter +assertThrowsInstanceOf(() => eval(`/{}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/{/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/]/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/{0}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/{1,}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/{1,2}/u`), SyntaxError); + +// QuantifiableAssertion +assertEqArray(/.B(?=A)/u.exec("cBaCBA"), + ["CB"]); +assertEqArray(/.B(?!A)/u.exec("CBAcBa"), + ["cB"]); +assertEqArray(/.B(?:A)/u.exec("cBaCBA"), + ["CBA"]); +assertEqArray(/.B(A)/u.exec("cBaCBA"), + ["CBA", "A"]); + +assertThrowsInstanceOf(() => eval(`/.B(?=A)+/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/.B(?!A)+/u`), SyntaxError); +assertEqArray(/.B(?:A)+/u.exec("cBaCBA"), + ["CBA"]); +assertEqArray(/.B(A)+/u.exec("cBaCBA"), + ["CBA", "A"]); + +// ControlLetter +assertEqArray(/\cA/u.exec("\u0001"), + ["\u0001"]); +assertEqArray(/\cZ/u.exec("\u001a"), + ["\u001a"]); +assertEqArray(/\ca/u.exec("\u0001"), + ["\u0001"]); +assertEqArray(/\cz/u.exec("\u001a"), + ["\u001a"]); + +assertEqArray(/[\cA]/u.exec("\u0001"), + ["\u0001"]); +assertEqArray(/[\cZ]/u.exec("\u001a"), + ["\u001a"]); +assertEqArray(/[\ca]/u.exec("\u0001"), + ["\u0001"]); +assertEqArray(/[\cz]/u.exec("\u001a"), + ["\u001a"]); + +assertThrowsInstanceOf(() => eval(`/\\c/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\c1/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\c_/u`), SyntaxError); + +assertThrowsInstanceOf(() => eval(`/[\\c]/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/[\\c1]/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/[\\c_]/u`), SyntaxError); + +// HexEscapeSequence +assertThrowsInstanceOf(() => eval(`/\\x/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\x0/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\x1/u`), SyntaxError); +assertThrowsInstanceOf(() => 
eval(`/\\x1G/u`), SyntaxError); + +// LegacyOctalEscapeSequence +assertThrowsInstanceOf(() => eval(`/\\52/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\052/u`), SyntaxError); + +// DecimalEscape +assertEqArray(/\0/u.exec("\0"), + ["\0"]); +assertEqArray(/[\0]/u.exec("\0"), + ["\0"]); +assertEqArray(/\0A/u.exec("\0A"), + ["\0A"]); +assertEqArray(/\0G/u.exec("\0G"), + ["\0G"]); +assertEqArray(/(A.)\1/u.exec("ABACABAB"), + ["ABAB", "AB"]); +assertEqArray(/(A.)(B.)(C.)(D.)(E.)(F.)(G.)(H.)(I.)(J.)(K.)\10/u.exec("A1B2C3D4E5F6G7H8I9JaKbJa"), + ["A1B2C3D4E5F6G7H8I9JaKbJa", "A1", "B2", "C3", "D4", "E5", "F6", "G7", "H8", "I9", "Ja", "Kb"]); + +assertThrowsInstanceOf(() => eval(`/\\00/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\01/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\09/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\1/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\2/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\3/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\4/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\5/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\6/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\7/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\8/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\9/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\10/u`), SyntaxError); + +if (typeof reportCompare === "function") + reportCompare(true, true); diff --git a/js/src/vm/Xdr.h b/js/src/vm/Xdr.h index b22324cdf..0b3262408 100644 --- a/js/src/vm/Xdr.h +++ b/js/src/vm/Xdr.h @@ -29,11 +29,11 @@ namespace js { * * https://developer.mozilla.org/en-US/docs/SpiderMonkey/Internals/Bytecode */ -static const uint32_t XDR_BYTECODE_VERSION_SUBTRAHEND = 334; +static const uint32_t XDR_BYTECODE_VERSION_SUBTRAHEND = 335; static const uint32_t XDR_BYTECODE_VERSION = uint32_t(0xb973c0de - XDR_BYTECODE_VERSION_SUBTRAHEND); 
-static_assert(JSErr_Limit == 429, +static_assert(JSErr_Limit == 433, "GREETINGS, POTENTIAL SUBTRAHEND INCREMENTER! If you added or " "removed MSG_DEFs from js.msg, you should increment " "XDR_BYTECODE_VERSION_SUBTRAHEND and update this assertion's "