From c85a176bd855bfafa58ba297a747a742c29ab446 Mon Sep 17 00:00:00 2001 From: Tooru Fujisawa Date: Fri, 7 Aug 2015 08:11:07 +0900 Subject: [PATCH] #393, Bug 1135377 - Part 2: Parse RegExp unicode character in non-CharacterClass. r=till, f=anba --- js/src/irregexp/RegExpAST.h | 3 +- js/src/irregexp/RegExpEngine.cpp | 30 +++ js/src/irregexp/RegExpEngine.h | 7 +- js/src/irregexp/RegExpParser.cpp | 179 ++++++++++++++ js/src/irregexp/RegExpParser.h | 4 + js/src/js.msg | 3 + js/src/tests/ecma_6/RegExp/unicode-braced.js | 166 +++++++++++++ .../tests/ecma_6/RegExp/unicode-lead-trail.js | 218 ++++++++++++++++++ js/src/tests/ecma_6/RegExp/unicode-raw.js | 139 +++++++++++ js/src/vm/Unicode.h | 38 +++ js/src/vm/Xdr.h | 4 +- 11 files changed, 787 insertions(+), 4 deletions(-) create mode 100644 js/src/tests/ecma_6/RegExp/unicode-braced.js create mode 100644 js/src/tests/ecma_6/RegExp/unicode-lead-trail.js create mode 100644 js/src/tests/ecma_6/RegExp/unicode-raw.js diff --git a/js/src/irregexp/RegExpAST.h b/js/src/irregexp/RegExpAST.h index b7e27a4f3..1bafe91d1 100644 --- a/js/src/irregexp/RegExpAST.h +++ b/js/src/irregexp/RegExpAST.h @@ -138,7 +138,8 @@ class RegExpAssertion : public RegExpTree { END_OF_LINE, END_OF_INPUT, BOUNDARY, - NON_BOUNDARY + NON_BOUNDARY, + NOT_AFTER_LEAD_SURROGATE }; explicit RegExpAssertion(AssertionType type) : assertion_type_(type) { } virtual void* Accept(RegExpVisitor* visitor, void* data); diff --git a/js/src/irregexp/RegExpEngine.cpp b/js/src/irregexp/RegExpEngine.cpp index 84dab0c8a..731483442 100644 --- a/js/src/irregexp/RegExpEngine.cpp +++ b/js/src/irregexp/RegExpEngine.cpp @@ -2061,6 +2061,8 @@ RegExpAssertion::ToNode(RegExpCompiler* compiler, result->AddAlternative(end_alternative); return result; } + case NOT_AFTER_LEAD_SURROGATE: + return AssertionNode::NotAfterLeadSurrogate(on_success); default: MOZ_CRASH("Bad assertion type"); } @@ -2848,6 +2850,31 @@ EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) on_success->Emit(compiler, &new_trace); } +// Assert that the next character cannot be a part of a surrogate pair. +static void +EmitNotAfterLeadSurrogate(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) +{ + RegExpMacroAssembler* assembler = compiler->macro_assembler(); + + // We will be loading the previous character into the current character + // register. + Trace new_trace(*trace); + new_trace.InvalidateCurrentCharacter(); + + jit::Label ok; + if (new_trace.cp_offset() == 0) + assembler->CheckAtStart(&ok); + + // We already checked that we are not at the start of input so it must be + // OK to load the previous character. + assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, new_trace.backtrack(), false); + assembler->CheckCharacterInRange(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax, + new_trace.backtrack()); + + assembler->Bind(&ok); + on_success->Emit(compiler, &new_trace); +} + // Check for [0-9A-Z_a-z]. static void EmitWordCheck(RegExpMacroAssembler* assembler, @@ -3001,6 +3028,9 @@ AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) EmitBoundaryCheck(compiler, trace); return; } + case NOT_AFTER_LEAD_SURROGATE: + EmitNotAfterLeadSurrogate(compiler, on_success(), trace); + return; } on_success()->Emit(compiler, trace); } diff --git a/js/src/irregexp/RegExpEngine.h b/js/src/irregexp/RegExpEngine.h index b687c6c52..bf5766086 100644 --- a/js/src/irregexp/RegExpEngine.h +++ b/js/src/irregexp/RegExpEngine.h @@ -788,7 +788,8 @@ class AssertionNode : public SeqRegExpNode AT_START, AT_BOUNDARY, AT_NON_BOUNDARY, - AFTER_NEWLINE + AFTER_NEWLINE, + NOT_AFTER_LEAD_SURROGATE }; AssertionNode(AssertionType t, RegExpNode* on_success) : SeqRegExpNode(on_success), assertion_type_(t) @@ -809,6 +810,10 @@ class AssertionNode : public SeqRegExpNode static AssertionNode* AfterNewline(RegExpNode* on_success) { return on_success->alloc()->newInfallible(AFTER_NEWLINE, on_success); } + static AssertionNode* NotAfterLeadSurrogate(RegExpNode* on_success) { + return on_success->alloc()->newInfallible(NOT_AFTER_LEAD_SURROGATE, + on_success); + } virtual void Accept(NodeVisitor* visitor); virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start); diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp index 6ab53c59f..f444e3cde 100644 --- a/js/src/irregexp/RegExpParser.cpp +++ b/js/src/irregexp/RegExpParser.cpp @@ -302,6 +302,108 @@ RegExpParser::ParseHexEscape(int length, size_t* value) return true; } +template +bool +RegExpParser::ParseBracedHexEscape(size_t* value) +{ + MOZ_ASSERT(current() == '{'); + Advance(); + + bool first = true; + uint32_t code = 0; + while (true) { + widechar c = current(); + if (c == kEndMarker) { + ReportError(JSMSG_INVALID_UNICODE_ESCAPE); + return false; + } + if (c == '}') { + if (first) { + ReportError(JSMSG_INVALID_UNICODE_ESCAPE); + return false; + } + Advance(); + break; + } + + int d = HexValue(c); + if (d < 0) { + ReportError(JSMSG_INVALID_UNICODE_ESCAPE); + return false; + } + code = (code << 4) | d; + if (code > unicode::NonBMPMax) { + ReportError(JSMSG_UNICODE_OVERFLOW); + return false; + } + Advance(); + first = false; + } + + *value = code; + return true; +} + +template +bool +RegExpParser::ParseTrailSurrogate(size_t* value) +{ + if (current() != '\\') + return false; + + const CharT* start = position(); + Advance(); + if (current() != 'u') { + Reset(start); + return false; + } + Advance(); + if (!ParseHexEscape(4, value)) { + Reset(start); + return false; + } + if (!unicode::IsTrailSurrogate(*value)) { + Reset(start); + return false; + } + return true; +} + +template +bool +RegExpParser::ParseRawSurrogatePair(char16_t* lead, char16_t* trail) +{ + widechar c1 = current(); + if (!unicode::IsLeadSurrogate(c1)) + return false; + + const CharT* start = position(); + Advance(); + widechar c2 = current(); + if (!unicode::IsTrailSurrogate(c2)) { + Reset(start); + return false; + } + Advance(); + *lead = c1; + *trail = c2; + return true; +} + +static inline RegExpTree* +RangeAtom(LifoAlloc* alloc, char16_t from, char16_t to) +{ + CharacterRangeVector* ranges = alloc->newInfallible(*alloc); + ranges->append(CharacterRange::Range(from, to)); + return alloc->newInfallible(ranges, false); +} + +static inline RegExpTree* +NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to) +{ + return alloc->newInfallible(RangeAtom(alloc, from, to), false, 0, 0); +} + #ifdef DEBUG // Currently only used in an assert.kASSERT. static bool @@ -675,6 +777,35 @@ RegExpParser::ParsePattern() return result; } +static inline RegExpTree* +SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail) +{ + RegExpBuilder* builder = alloc->newInfallible(alloc); + builder->AddCharacter(lead); + builder->AddCharacter(trail); + return builder->ToRegExp(); +} + +static inline RegExpTree* +LeadSurrogateAtom(LifoAlloc* alloc, char16_t value) +{ + RegExpBuilder* builder = alloc->newInfallible(alloc); + builder->AddCharacter(value); + builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin, + unicode::TrailSurrogateMax)); + return builder->ToRegExp(); +} + +static inline RegExpTree* +TrailSurrogateAtom(LifoAlloc* alloc, char16_t value) +{ + RegExpBuilder* builder = alloc->newInfallible(alloc); + builder->AddAssertion(alloc->newInfallible( + RegExpAssertion::NOT_AFTER_LEAD_SURROGATE)); + builder->AddCharacter(value); + return builder->ToRegExp(); +} + // Disjunction :: // Alternative // Alternative | Disjunction @@ -929,6 +1060,38 @@ RegExpParser::ParseDisjunction() case 'u': { Advance(2); size_t value; + if (unicode_) { + if (current() == '{') { + if (!ParseBracedHexEscape(&value)) + return nullptr; + if (unicode::IsLeadSurrogate(value)) { + builder->AddAtom(LeadSurrogateAtom(alloc, value)); + } else if (unicode::IsTrailSurrogate(value)) { + builder->AddAtom(TrailSurrogateAtom(alloc, value)); + } else if (value >= unicode::NonBMPMin) { + size_t lead, trail; + unicode::UTF16Encode(value, &lead, &trail); + builder->AddAtom(SurrogatePairAtom(alloc, lead, trail)); + } else { + builder->AddCharacter(value); + } + } else if (ParseHexEscape(4, &value)) { + if (unicode::IsLeadSurrogate(value)) { + size_t trail; + if (ParseTrailSurrogate(&trail)) + builder->AddAtom(SurrogatePairAtom(alloc, value, trail)); + else + builder->AddAtom(LeadSurrogateAtom(alloc, value)); + } else if (unicode::IsTrailSurrogate(value)) { + builder->AddAtom(TrailSurrogateAtom(alloc, value)); + } else { + builder->AddCharacter(value); + } + } else { + return ReportError(JSMSG_INVALID_UNICODE_ESCAPE); + } + break; + } if (ParseHexEscape(4, &value)) { builder->AddCharacter(value); } else { @@ -950,6 +1113,22 @@ RegExpParser::ParseDisjunction() // fallthrough } default: + if (unicode_) { + char16_t lead, trail; + if (ParseRawSurrogatePair(&lead, &trail)) { + builder->AddAtom(SurrogatePairAtom(alloc, lead, trail)); + } else { + widechar c = current(); + if (unicode::IsLeadSurrogate(c)) + builder->AddAtom(LeadSurrogateAtom(alloc, c)); + else if (unicode::IsTrailSurrogate(c)) + builder->AddAtom(TrailSurrogateAtom(alloc, c)); + else + builder->AddCharacter(c); + Advance(); + } + break; + } builder->AddCharacter(current()); Advance(); break; diff --git a/js/src/irregexp/RegExpParser.h b/js/src/irregexp/RegExpParser.h index 740bff927..352c1d5fa 100644 --- a/js/src/irregexp/RegExpParser.h +++ b/js/src/irregexp/RegExpParser.h @@ -193,6 +193,10 @@ class RegExpParser // and sets the value if it is. bool ParseHexEscape(int length, size_t* value); + bool ParseBracedHexEscape(size_t* value); + bool ParseTrailSurrogate(size_t* value); + bool ParseRawSurrogatePair(char16_t* lead, char16_t* trail); + size_t ParseOctalLiteral(); // Tries to parse the input as a back reference. If successful it diff --git a/js/src/js.msg b/js/src/js.msg index 0636c3639..b8ee6811c 100644 --- a/js/src/js.msg +++ b/js/src/js.msg @@ -450,11 +450,14 @@ MSG_DEF(JSMSG_UNDEFINED_CURRENCY, 0, JSEXN_TYPEERR, "undefined currency in MSG_DEF(JSMSG_BAD_CLASS_RANGE, 0, JSEXN_SYNTAXERR, "invalid range in character class") MSG_DEF(JSMSG_ESCAPE_AT_END_OF_REGEXP, 0, JSEXN_SYNTAXERR, "\\ at end of pattern") MSG_DEF(JSMSG_INVALID_GROUP, 0, JSEXN_SYNTAXERR, "invalid regexp group") +MSG_DEF(JSMSG_INVALID_IDENTITY_ESCAPE, 0, JSEXN_SYNTAXERR, "invalid identity escape in regular expression") +MSG_DEF(JSMSG_INVALID_UNICODE_ESCAPE, 0, JSEXN_SYNTAXERR, "invalid unicode escape in regular expression") MSG_DEF(JSMSG_MISSING_PAREN, 0, JSEXN_SYNTAXERR, "unterminated parenthetical") MSG_DEF(JSMSG_NEWREGEXP_FLAGGED, 0, JSEXN_TYPEERR, "can't supply flags when constructing one RegExp from another") MSG_DEF(JSMSG_NOTHING_TO_REPEAT, 0, JSEXN_SYNTAXERR, "nothing to repeat") MSG_DEF(JSMSG_NUMBERS_OUT_OF_ORDER, 0, JSEXN_SYNTAXERR, "numbers out of order in {} quantifier.") MSG_DEF(JSMSG_TOO_MANY_PARENS, 0, JSEXN_INTERNALERR, "too many parentheses in regular expression") +MSG_DEF(JSMSG_UNICODE_OVERFLOW, 0, JSEXN_SYNTAXERR, "unicode codepoint should not be greater than 0x10FFFF in regular expression") MSG_DEF(JSMSG_UNMATCHED_RIGHT_PAREN, 0, JSEXN_SYNTAXERR, "unmatched ) in regular expression") MSG_DEF(JSMSG_UNTERM_CLASS, 0, JSEXN_SYNTAXERR, "unterminated character class") diff --git a/js/src/tests/ecma_6/RegExp/unicode-braced.js b/js/src/tests/ecma_6/RegExp/unicode-braced.js new file mode 100644 index 000000000..97df7acab --- /dev/null +++ b/js/src/tests/ecma_6/RegExp/unicode-braced.js @@ -0,0 +1,166 @@ +var BUGNUMBER = 1135377; +var summary = "Implement RegExp unicode flag -- braced pattern in RegExpUnicodeEscapeSequence."; + +print(BUGNUMBER + ": " + summary); + +// ==== standalone ==== + +assertEqArray(/\u{41}/u.exec("ABC"), + ["A"]); +assertEqArray(/\u{41}/.exec("ABC" + "u".repeat(41)), + ["u".repeat(41)]); + +assertEqArray(/\u{4A}/u.exec("JKL"), + ["J"]); +assertEqArray(/\u{4A}/.exec("JKLu{4A}"), + ["u{4A}"]); + +assertEqArray(/\u{1F438}/u.exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(/\u{1F438}/.exec("u{1F438}"), + ["u{1F438}"]); + +assertEqArray(/\u{0}/u.exec("\u{0}"), + ["\u{0}"]); +assertEqArray(/\u{10FFFF}/u.exec("\u{10FFFF}"), + ["\u{10FFFF}"]); +assertEqArray(/\u{10ffff}/u.exec("\u{10FFFF}"), + ["\u{10FFFF}"]); + +// leading 0 +assertEqArray(/\u{0000000000000000000000}/u.exec("\u{0}"), + ["\u{0}"]); +assertEqArray(/\u{000000000000000010FFFF}/u.exec("\u{10FFFF}"), + ["\u{10FFFF}"]); + +// RegExp constructor +assertEqArray(new RegExp("\\u{0}", "u").exec("\u{0}"), + ["\u{0}"]); +assertEqArray(new RegExp("\\u{41}", "u").exec("ABC"), + ["A"]); +assertEqArray(new RegExp("\\u{1F438}", "u").exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(new RegExp("\\u{10FFFF}", "u").exec("\u{10FFFF}"), + ["\u{10FFFF}"]); + +assertEqArray(new RegExp("\\u{0000000000000000}", "u").exec("\u{0}"), + ["\u{0}"]); + +assertEqArray(eval(`/\\u{${"0".repeat(Math.pow(2, 24)) + "1234"}}/u`).exec("\u{1234}"), + ["\u{1234}"]); +assertEqArray(new RegExp(`\\u{${"0".repeat(Math.pow(2, 24)) + "1234"}}`, "u").exec("\u{1234}"), + ["\u{1234}"]); + +// ==== ? ==== + +assertEqArray(/\u{1F438}?/u.exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(/\u{1F438}?/u.exec(""), + [""]); + +// lead-only target +assertEqArray(/\u{1F438}?/u.exec("\uD83D"), + [""]); + +// RegExp constructor +assertEqArray(new RegExp("\\u{1F438}?", "u").exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(new RegExp("\\u{1F438}?", "u").exec(""), + [""]); +assertEqArray(new RegExp("\\u{1F438}?", "u").exec("\uD83D"), + [""]); + +// ==== + ==== + +assertEqArray(/\u{1F438}+/u.exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(/\u{1F438}+/u.exec("\u{1F438}\u{1F438}"), + ["\u{1F438}\u{1F438}"]); +assertEq(/\u{1F438}+/u.exec(""), + null); + +// lead-only target +assertEq(/\u{1F438}+/u.exec("\uD83D"), + null); +assertEqArray(/\u{1F438}+/u.exec("\uD83D\uDC38\uDC38"), + ["\uD83D\uDC38"]); + +// ==== * ==== + +assertEqArray(/\u{1F438}*/u.exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(/\u{1F438}*/u.exec("\u{1F438}\u{1F438}"), + ["\u{1F438}\u{1F438}"]); +assertEqArray(/\u{1F438}*/u.exec(""), + [""]); + +// lead-only target +assertEqArray(/\u{1F438}*/u.exec("\uD83D"), + [""]); +assertEqArray(/\u{1F438}*/u.exec("\uD83D\uDC38\uDC38"), + ["\uD83D\uDC38"]); + +// ==== lead-only ==== + +// match only non-surrogate pair +assertEqArray(/\u{D83D}/u.exec("\uD83D\uDBFF"), + ["\uD83D"]); +assertEq(/\u{D83D}/u.exec("\uD83D\uDC00"), + null); +assertEq(/\u{D83D}/u.exec("\uD83D\uDFFF"), + null); +assertEqArray(/\u{D83D}/u.exec("\uD83D\uE000"), + ["\uD83D"]); + +// match before non-tail char +assertEqArray(/\u{D83D}/u.exec("\uD83D"), + ["\uD83D"]); +assertEqArray(/\u{D83D}/u.exec("\uD83DA"), + ["\uD83D"]); + +// ==== trail-only ==== + +// match only non-surrogate pair +assertEqArray(/\u{DC38}/u.exec("\uD7FF\uDC38"), + ["\uDC38"]); +assertEq(/\u{DC38}/u.exec("\uD800\uDC38"), + null); +assertEq(/\u{DC38}/u.exec("\uDBFF\uDC38"), + null); +assertEqArray(/\u{DC38}/u.exec("\uDC00\uDC38"), + ["\uDC38"]); + +// match after non-lead char +assertEqArray(/\u{DC38}/u.exec("\uDC38"), + ["\uDC38"]); +assertEqArray(/\u{DC38}/u.exec("A\uDC38"), + ["\uDC38"]); + +// ==== wrong patterns ==== + +assertThrowsInstanceOf(() => eval(`/\\u{-1}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{0.0}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{G}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{{/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{110000}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{00110000}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{100000000000000000000000000000}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{ FFFF}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{FFFF }/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{FF FF}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{F F F F}/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u{100000001}/u`), SyntaxError); + +// surrogate pair with braced +assertEq(/\u{D83D}\u{DC38}+/u.exec("\uD83D\uDC38\uDC38"), + null); +assertEq(/\uD83D\u{DC38}+/u.exec("\uD83D\uDC38\uDC38"), + null); +assertEq(/\u{D83D}\uDC38+/u.exec("\uD83D\uDC38\uDC38"), + null); + +if (typeof reportCompare === "function") + reportCompare(true, true); diff --git a/js/src/tests/ecma_6/RegExp/unicode-lead-trail.js b/js/src/tests/ecma_6/RegExp/unicode-lead-trail.js new file mode 100644 index 000000000..7ecdb9ace --- /dev/null +++ b/js/src/tests/ecma_6/RegExp/unicode-lead-trail.js @@ -0,0 +1,218 @@ +var BUGNUMBER = 1135377; +var summary = "Implement RegExp unicode flag -- lead and trail patterns in RegExpUnicodeEscapeSequence."; + +print(BUGNUMBER + ": " + summary); + +// ==== standalone ==== + +assertEqArray(/\uD83D\uDC38/u.exec("\u{1F438}"), + ["\u{1F438}"]); + +// no unicode flag +assertEqArray(/\uD83D\uDC38/.exec("\u{1F438}"), + ["\u{1F438}"]); + +// RegExp constructor +assertEqArray(new RegExp("\\uD83D\\uDC38", "u").exec("\u{1F438}"), + ["\u{1F438}"]); + +// RegExp constructor, no unicode flag +assertEqArray(new RegExp("\\uD83D\\uDC38", "").exec("\u{1F438}"), + ["\u{1F438}"]); + +// ==== ? ==== + +assertEqArray(/\uD83D\uDC38?/u.exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(/\uD83D\uDC38?/u.exec(""), + [""]); + +// lead-only target +assertEqArray(/\uD83D\uDC38?/u.exec("\uD83D"), + [""]); + +// no unicode flag +assertEqArray(/\uD83D\uDC38?/.exec("\u{1F438}"), + ["\u{1F438}"]); +assertEq(/\uD83D\uDC38?/.exec(""), + null); + +assertEqArray(/\uD83D\uDC38?/.exec("\uD83D"), + ["\uD83D"]); + +// RegExp constructor +assertEqArray(new RegExp("\\uD83D\\uDC38?", "u").exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(new RegExp("\\uD83D\\uDC38?", "u").exec(""), + [""]); + +assertEqArray(new RegExp("\\uD83D\\uDC38?", "u").exec("\uD83D"), + [""]); + +// RegExp constructor, no unicode flag +assertEqArray(new RegExp("\\uD83D\\uDC38?", "").exec("\u{1F438}"), + ["\u{1F438}"]); +assertEq(new RegExp("\\uD83D\\uDC38?", "").exec(""), + null); + +assertEqArray(new RegExp("\\uD83D\\uDC38?", "").exec("\uD83D"), + ["\uD83D"]); + +// ==== + ==== + +assertEqArray(/\uD83D\uDC38+/u.exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(/\uD83D\uDC38+/u.exec("\u{1F438}\u{1F438}"), + ["\u{1F438}\u{1F438}"]); +assertEq(/\uD83D\uDC38+/u.exec(""), + null); + +// lead-only target +assertEq(/\uD83D\uDC38+/u.exec("\uD83D"), + null); +assertEqArray(/\uD83D\uDC38+/u.exec("\uD83D\uDC38\uDC38"), + ["\uD83D\uDC38"]); + +// no unicode flag +assertEqArray(/\uD83D\uDC38+/.exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(/\uD83D\uDC38+/.exec("\u{1F438}\u{1F438}"), + ["\u{1F438}"]); +assertEq(/\uD83D\uDC38+/.exec("\uD83D"), + null); +assertEqArray(/\uD83D\uDC38+/.exec("\uD83D\uDC38\uDC38"), + ["\uD83D\uDC38\uDC38"]); +assertEq(/\uD83D\uDC38+/.exec(""), + null); + +// ==== * ==== + +assertEqArray(/\uD83D\uDC38*/u.exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(/\uD83D\uDC38*/u.exec("\u{1F438}\u{1F438}"), + ["\u{1F438}\u{1F438}"]); +assertEqArray(/\uD83D\uDC38*/u.exec(""), + [""]); + +// lead-only target +assertEqArray(/\uD83D\uDC38*/u.exec("\uD83D"), + [""]); +assertEqArray(/\uD83D\uDC38*/u.exec("\uD83D\uDC38\uDC38"), + ["\uD83D\uDC38"]); + +// no unicode flag +assertEqArray(/\uD83D\uDC38*/.exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(/\uD83D\uDC38*/.exec("\u{1F438}\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(/\uD83D\uDC38*/.exec("\uD83D"), + ["\uD83D"]); +assertEqArray(/\uD83D\uDC38*/.exec("\uD83D\uDC38\uDC38"), + ["\uD83D\uDC38\uDC38"]); +assertEq(/\uD83D\uDC38*/.exec(""), + null); + +// ==== lead-only ==== + +// match only non-surrogate pair +assertEqArray(/\uD83D/u.exec("\uD83D\uDBFF"), + ["\uD83D"]); +assertEq(/\uD83D/u.exec("\uD83D\uDC00"), + null); +assertEq(/\uD83D/u.exec("\uD83D\uDFFF"), + null); +assertEqArray(/\uD83D/u.exec("\uD83D\uE000"), + ["\uD83D"]); + +// match before non-tail char +assertEqArray(/\uD83D/u.exec("\uD83D"), + ["\uD83D"]); +assertEqArray(/\uD83D/u.exec("\uD83DA"), + ["\uD83D"]); + +// no unicode flag +assertEqArray(/\uD83D/.exec("\uD83D\uDBFF"), + ["\uD83D"]); +assertEqArray(/\uD83D/.exec("\uD83D\uDC00"), + ["\uD83D"]); +assertEqArray(/\uD83D/.exec("\uD83D\uDFFF"), + ["\uD83D"]); +assertEqArray(/\uD83D/.exec("\uD83D\uE000"), + ["\uD83D"]); +assertEqArray(/\uD83D/.exec("\uD83D"), + ["\uD83D"]); +assertEqArray(/\uD83D/.exec("\uD83DA"), + ["\uD83D"]); + +// ==== trail-only ==== + +// match only non-surrogate pair +assertEqArray(/\uDC38/u.exec("\uD7FF\uDC38"), + ["\uDC38"]); +assertEq(/\uDC38/u.exec("\uD800\uDC38"), + null); +assertEq(/\uDC38/u.exec("\uDBFF\uDC38"), + null); +assertEqArray(/\uDC38/u.exec("\uDC00\uDC38"), + ["\uDC38"]); + +// match after non-lead char +assertEqArray(/\uDC38/u.exec("\uDC38"), + ["\uDC38"]); +assertEqArray(/\uDC38/u.exec("A\uDC38"), + ["\uDC38"]); + +// no unicode flag +assertEqArray(/\uDC38/.exec("\uD7FF\uDC38"), + ["\uDC38"]); +assertEqArray(/\uDC38/.exec("\uD800\uDC38"), + ["\uDC38"]); +assertEqArray(/\uDC38/.exec("\uDBFF\uDC38"), + ["\uDC38"]); +assertEqArray(/\uDC38/.exec("\uDC00\uDC38"), + ["\uDC38"]); +assertEqArray(/\uDC38/.exec("\uDC38"), + ["\uDC38"]); +assertEqArray(/\uDC38/.exec("A\uDC38"), + ["\uDC38"]); + +// ==== invalid trail ==== + +assertEqArray(/\uD83D\u3042*/u.exec("\uD83D"), + ["\uD83D"]); +assertEqArray(/\uD83D\u3042*/u.exec("\uD83D\u3042"), + ["\uD83D\u3042"]); +assertEqArray(/\uD83D\u3042*/u.exec("\uD83D\u3042\u3042"), + ["\uD83D\u3042\u3042"]); + +assertEqArray(/\uD83D\u{3042}*/u.exec("\uD83D"), + ["\uD83D"]); +assertEqArray(/\uD83D\u{3042}*/u.exec("\uD83D\u3042"), + ["\uD83D\u3042"]); +assertEqArray(/\uD83D\u{3042}*/u.exec("\uD83D\u3042\u3042"), + ["\uD83D\u3042\u3042"]); + +assertEqArray(/\uD83DA*/u.exec("\uD83D"), + ["\uD83D"]); +assertEqArray(/\uD83DA*/u.exec("\uD83DA"), + ["\uD83DA"]); +assertEqArray(/\uD83DA*/u.exec("\uD83DAA"), + ["\uD83DAA"]); + +// ==== wrong patterns ==== + +assertThrowsInstanceOf(() => eval(`/\\u/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u0/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u00/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u000/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u000G/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\u0.00/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\uD83D\\u/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\uD83D\\u0/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\uD83D\\u00/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\uD83D\\u000/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\uD83D\\u000G/u`), SyntaxError); +assertThrowsInstanceOf(() => eval(`/\\uD83D\\u0.00/u`), SyntaxError); + +if (typeof reportCompare === "function") + reportCompare(true, true); diff --git a/js/src/tests/ecma_6/RegExp/unicode-raw.js b/js/src/tests/ecma_6/RegExp/unicode-raw.js new file mode 100644 index 000000000..37b572cd8 --- /dev/null +++ b/js/src/tests/ecma_6/RegExp/unicode-raw.js @@ -0,0 +1,139 @@ +var BUGNUMBER = 1135377; +var summary = "Implement RegExp unicode flag -- raw unicode."; + +print(BUGNUMBER + ": " + summary); + +// ==== standalone ==== + +assertEqArray(eval(`/\uD83D\uDC38/u`).exec("\u{1F438}"), + ["\u{1F438}"]); + +// no unicode flag +assertEqArray(eval(`/\uD83D\uDC38/`).exec("\u{1F438}"), + ["\u{1F438}"]); + +// escaped (lead) +assertEq(eval(`/\\uD83D\uDC38/u`).exec("\u{1F438}"), + null); +assertEq(eval(`/\\u{D83D}\uDC38/u`).exec("\u{1F438}"), + null); + +// escaped (trail) +assertEq(eval(`/\uD83D\\uDC38/u`).exec("\u{1F438}"), + null); +assertEq(eval(`/\uD83D\\u{DC38}/u`).exec("\u{1F438}"), + null); + +// escaped (lead), no unicode flag +assertEqArray(eval(`/\\uD83D\uDC38/`).exec("\u{1F438}"), + ["\u{1F438}"]); + +// escaped (trail), no unicode flag +assertEqArray(eval(`/\uD83D\\uDC38/`).exec("\u{1F438}"), + ["\u{1F438}"]); + +// ==== RegExp constructor ==== + +assertEqArray(new RegExp("\uD83D\uDC38", "u").exec("\u{1F438}"), + ["\u{1F438}"]); + +// no unicode flag +assertEqArray(new RegExp("\uD83D\uDC38", "").exec("\u{1F438}"), + ["\u{1F438}"]); + +// escaped(lead) +assertEq(new RegExp("\\uD83D\uDC38", "u").exec("\u{1F438}"), + null); +assertEq(new RegExp("\\u{D83D}\uDC38", "u").exec("\u{1F438}"), + null); + +// escaped(trail) +assertEq(new RegExp("\uD83D\\uDC38", "u").exec("\u{1F438}"), + null); +assertEq(new RegExp("\uD83D\\u{DC38}", "u").exec("\u{1F438}"), + null); + +// escaped(lead), no unicode flag +assertEqArray(new RegExp("\\uD83D\uDC38", "").exec("\u{1F438}"), + ["\u{1F438}"]); + +// escaped(trail), no unicode flag +assertEqArray(new RegExp("\uD83D\\uDC38", "").exec("\u{1F438}"), + ["\u{1F438}"]); + +// ==== ? ==== + +assertEqArray(eval(`/\uD83D\uDC38?/u`).exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(eval(`/\uD83D\uDC38?/u`).exec(""), + [""]); + +assertEqArray(eval(`/\uD83D\uDC38?/u`).exec("\uD83D"), + [""]); + +// no unicode flag +assertEqArray(eval(`/\uD83D\uDC38?/`).exec("\u{1F438}"), + ["\u{1F438}"]); +assertEq(eval(`/\uD83D\uDC38?/`).exec(""), + null); + +assertEqArray(eval(`/\uD83D\uDC38?/`).exec("\uD83D"), + ["\uD83D"]); + +// escaped (lead) +assertEq(eval(`/\\uD83D\uDC38?/u`).exec("\u{1F438}"), + null); +assertEq(eval(`/\\uD83D\uDC38?/u`).exec(""), + null); + +assertEqArray(eval(`/\\uD83D\uDC38?/u`).exec("\uD83D"), + ["\uD83D"]); + +// escaped (trail) +assertEq(eval(`/\uD83D\\uDC38?/u`).exec("\u{1F438}"), + null); +assertEq(eval(`/\uD83D\\uDC38?/u`).exec(""), + null); + +assertEqArray(eval(`/\uD83D\\uDC38?/u`).exec("\uD83D"), + ["\uD83D"]); + +// escaped (lead), no unicode flag +assertEqArray(eval(`/\\uD83D\uDC38?/`).exec("\u{1F438}"), + ["\u{1F438}"]); +assertEq(eval(`/\\uD83D\uDC38?/`).exec(""), + null); + +assertEqArray(eval(`/\\uD83D\uDC38?/`).exec("\uD83D"), + ["\uD83D"]); + +// escaped (trail), no unicode flag +assertEqArray(eval(`/\uD83D\\uDC38?/`).exec("\u{1F438}"), + ["\u{1F438}"]); +assertEq(eval(`/\uD83D\\uDC38?/`).exec(""), + null); + +assertEqArray(eval(`/\uD83D\\uDC38?/`).exec("\uD83D"), + ["\uD83D"]); + +// ==== RegExp constructor, ? ==== + +assertEqArray(new RegExp("\uD83D\uDC38?", "u").exec("\u{1F438}"), + ["\u{1F438}"]); +assertEqArray(new RegExp("\uD83D\uDC38?", "u").exec(""), + [""]); + +assertEqArray(new RegExp("\uD83D\uDC38?", "u").exec("\uD83D"), + [""]); + +// no unicode flag +assertEqArray(new RegExp("\uD83D\uDC38?", "").exec("\u{1F438}"), + ["\u{1F438}"]); +assertEq(new RegExp("\uD83D\uDC38?", "").exec(""), + null); + +assertEqArray(new RegExp("\uD83D\uDC38?", "").exec("\uD83D"), + ["\uD83D"]); + +if (typeof reportCompare === "function") + reportCompare(true, true); diff --git a/js/src/vm/Unicode.h b/js/src/vm/Unicode.h index ea853442c..1276e915a 100644 --- a/js/src/vm/Unicode.h +++ b/js/src/vm/Unicode.h @@ -234,6 +234,44 @@ CanLowerCase(char16_t ch) return CharInfo(ch).lowerCase != 0; } +const size_t LeadSurrogateMin = 0xD800; +const size_t LeadSurrogateMax = 0xDBFF; +const size_t TrailSurrogateMin = 0xDC00; +const size_t TrailSurrogateMax = 0xDFFF; +const size_t UTF16Max = 0xFFFF; +const size_t NonBMPMin = 0x10000; +const size_t NonBMPMax = 0x10FFFF; + +inline bool +IsLeadSurrogate(size_t value) +{ + return value >= LeadSurrogateMin && value <= LeadSurrogateMax; +} + +inline bool +IsTrailSurrogate(size_t value) +{ + return value >= TrailSurrogateMin && value <= TrailSurrogateMax; +} + +inline void +UTF16Encode(size_t cp, size_t* lead, size_t* trail) +{ + MOZ_ASSERT(cp >= NonBMPMin && cp <= NonBMPMax); + + *lead = (cp - NonBMPMin) / 1024 + LeadSurrogateMin; + *trail = ((cp - NonBMPMin) % 1024) + TrailSurrogateMin; +} + +inline size_t +UTF16Decode(size_t lead, size_t trail) +{ + MOZ_ASSERT(IsLeadSurrogate(lead)); + MOZ_ASSERT(IsTrailSurrogate(trail)); + + return (lead - LeadSurrogateMin) * 1024 + (trail - TrailSurrogateMin) + NonBMPMin; +} + } /* namespace unicode */ } /* namespace js */ diff --git a/js/src/vm/Xdr.h b/js/src/vm/Xdr.h index f15d5affc..b88c78e7a 100644 --- a/js/src/vm/Xdr.h +++ b/js/src/vm/Xdr.h @@ -29,11 +29,11 @@ namespace js { * * https://developer.mozilla.org/en-US/docs/SpiderMonkey/Internals/Bytecode */ -static const uint32_t XDR_BYTECODE_VERSION_SUBTRAHEND = 332; +static const uint32_t XDR_BYTECODE_VERSION_SUBTRAHEND = 333; static const uint32_t XDR_BYTECODE_VERSION = uint32_t(0xb973c0de - XDR_BYTECODE_VERSION_SUBTRAHEND); -static_assert(JSErr_Limit == 425, +static_assert(JSErr_Limit == 428, "GREETINGS, POTENTIAL SUBTRAHEND INCREMENTER! If you added or " "removed MSG_DEFs from js.msg, you should increment " "XDR_BYTECODE_VERSION_SUBTRAHEND and update this assertion's "