From 4e924a688ea178c5846bfb5071c1c77f614d88af Mon Sep 17 00:00:00 2001
From: Tooru Fujisawa
Date: Fri, 7 Aug 2015 08:11:52 +0900
Subject: [PATCH] #393, Bug 1135377 - Part 4: Support everything Atom in RegExp with unicode flag. r=till, f=anba

---
 js/src/irregexp/RegExpParser.cpp              | 46 ++++++++++++++
 .../tests/ecma_6/RegExp/unicode-everything.js | 59 +++++++++++++++++++
 2 files changed, 105 insertions(+)
 create mode 100644 js/src/tests/ecma_6/RegExp/unicode-everything.js

diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp
index bbb07f1c3..ad09996e4 100644
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -1178,6 +1178,48 @@ TrailSurrogateAtom(LifoAlloc* alloc, char16_t value)
     return builder->ToRegExp();
 }
 
+// Build the atom for '.' in a RegExp with the unicode flag.
+//
+// The result is a disjunction of four alternatives:
+//   1. a non-surrogate character other than \x0a, \x0d, \u2028 and \u2029,
+//   2. a lead surrogate that is not followed by a trail surrogate,
+//   3. a trail surrogate that is not preceded by a lead surrogate,
+//   4. a lead surrogate followed by a trail surrogate (a surrogate pair).
+static inline RegExpTree*
+UnicodeEverythingAtom(LifoAlloc* alloc)
+{
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+
+    // everything except \x0a, \x0d, \u2028 and \u2029
+
+    CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
+    ranges->append(CharacterRange::Range(0x0, 0x09));
+    ranges->append(CharacterRange::Range(0x0b, 0x0c));
+    ranges->append(CharacterRange::Range(0x0e, 0x2027));
+    ranges->append(CharacterRange::Range(0x202A, unicode::LeadSurrogateMin - 1));
+    ranges->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1, unicode::UTF16Max));
+    builder->AddAtom(alloc->newInfallible<RegExpCharacterClass>(ranges, false));
+
+    builder->NewAlternative();
+
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(NegativeLookahead(alloc, unicode::TrailSurrogateMin,
+                                       unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+
+    builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+        RegExpAssertion::NOT_AFTER_LEAD_SURROGATE));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    builder->NewAlternative();
+
+    builder->AddAtom(RangeAtom(alloc, unicode::LeadSurrogateMin, unicode::LeadSurrogateMax));
+    builder->AddAtom(RangeAtom(alloc, unicode::TrailSurrogateMin, unicode::TrailSurrogateMax));
+
+    return builder->ToRegExp();
+}
+
 // Disjunction ::
 //   Alternative
 //   Alternative | Disjunction
@@ -1275,6 +1317,10 @@ RegExpParser::ParseDisjunction()
           case '.': {
             Advance();
             // everything except \x0a, \x0d, \u2028 and \u2029
+            if (unicode_) {
+                builder->AddAtom(UnicodeEverythingAtom(alloc));
+                break;
+            }
             CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
             CharacterRange::AddClassEscape(alloc, '.', ranges);
             RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
diff --git a/js/src/tests/ecma_6/RegExp/unicode-everything.js b/js/src/tests/ecma_6/RegExp/unicode-everything.js
new file mode 100644
index 000000000..a18ac2867
--- /dev/null
+++ b/js/src/tests/ecma_6/RegExp/unicode-everything.js
@@ -0,0 +1,59 @@
+var BUGNUMBER = 1135377;
+var summary = "Implement RegExp unicode flag -- everything Atom.";
+
+print(BUGNUMBER + ": " + summary);
+
+// ==== standalone ====
+
+assertEqArray(/./u.exec("ABC"),
+              ["A"]);
+assertEqArray(/./u.exec("\u{1F438}BC"),
+              ["\u{1F438}"]);
+
+assertEqArray(/./u.exec("\uD83D\uDBFF"),
+              ["\uD83D"]);
+assertEqArray(/./u.exec("\uD83D\uDC00"),
+              ["\uD83D\uDC00"]);
+assertEqArray(/./u.exec("\uD83D\uDFFF"),
+              ["\uD83D\uDFFF"]);
+assertEqArray(/./u.exec("\uD83D\uE000"),
+              ["\uD83D"]);
+assertEqArray(/./u.exec("\uD83D"),
+              ["\uD83D"]);
+assertEqArray(/./u.exec("\uD83DA"),
+              ["\uD83D"]);
+
+assertEqArray(/./u.exec("\uD7FF\uDC38"),
+              ["\uD7FF"]);
+assertEqArray(/./u.exec("\uD800\uDC38"),
+              ["\uD800\uDC38"]);
+assertEqArray(/./u.exec("\uDBFF\uDC38"),
+              ["\uDBFF\uDC38"]);
+assertEqArray(/./u.exec("\uDC00\uDC38"),
+              ["\uDC00"]);
+assertEqArray(/./u.exec("\uDC38"),
+              ["\uDC38"]);
+assertEqArray(/./u.exec("A\uDC38"),
+              ["A"]);
+
+assertEqArray(/.A/u.exec("\uD7FF\uDC38A"),
+              ["\uDC38A"]);
+assertEqArray(/.A/u.exec("\uD800\uDC38A"),
+              ["\uD800\uDC38A"]);
+assertEqArray(/.A/u.exec("\uDBFF\uDC38A"),
+              ["\uDBFF\uDC38A"]);
+assertEqArray(/.A/u.exec("\uDC00\uDC38A"),
+              ["\uDC38A"]);
+
+// ==== leading multiple ====
+
+assertEqArray(/.*A/u.exec("\u{1F438}\u{1F438}\u{1F438}A"),
+              ["\u{1F438}\u{1F438}\u{1F438}A"]);
+
+// ==== trailing multiple ====
+
+assertEqArray(/A.*/u.exec("A\u{1F438}\u{1F438}\u{1F438}"),
+              ["A\u{1F438}\u{1F438}\u{1F438}"]);
+
+if (typeof reportCompare === "function")
+    reportCompare(true, true);