diff --git a/js/src/irregexp/RegExpEngine.cpp b/js/src/irregexp/RegExpEngine.cpp index 731483442..53496792f 100644 --- a/js/src/irregexp/RegExpEngine.cpp +++ b/js/src/irregexp/RegExpEngine.cpp @@ -73,12 +73,30 @@ static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0xFEFF, 0xFF00, 0x10000 }; static const int kSpaceRangeCount = ArrayLength(kSpaceRanges); +static const int kSpaceAndSurrogateRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, + 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, + 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, + unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1, + 0xFEFF, 0xFF00, 0x10000 }; +static const int kSpaceAndSurrogateRangeCount = ArrayLength(kSpaceAndSurrogateRanges); static const int kWordRanges[] = { '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 }; static const int kWordRangeCount = ArrayLength(kWordRanges); +static const int kWordAndSurrogateRanges[] = { + '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, + unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1, + 0x10000 }; +static const int kWordAndSurrogateRangeCount = ArrayLength(kWordAndSurrogateRanges); static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 }; static const int kDigitRangeCount = ArrayLength(kDigitRanges); -static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 }; +static const int kDigitAndSurrogateRanges[] = { + '0', '9' + 1, + unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1, + 0x10000 }; +static const int kDigitAndSurrogateRangeCount = ArrayLength(kDigitAndSurrogateRanges); +static const int kSurrogateRanges[] = { + unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1, + 0x10000 }; static const int kSurrogateRangeCount = ArrayLength(kSurrogateRanges); static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, 0x10000 }; @@ -165,6 +183,34 @@ CharacterRange::AddClassEscape(LifoAlloc* alloc, char16_t type, } } +// Add class 
escape, excluding surrogate pair range. +// \s, \w and \d are forwarded to AddClassEscape unchanged; \S, \W and \D negate +// the k*AndSurrogateRanges tables, which also list the surrogate code units. +void +CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type, + CharacterRangeVector* ranges) +{ + switch (type) { + case 's': + case 'w': + case 'd': + // AddCharOrEscapeUnicode passes every class escape here (e.g. /[\d]/u), not only the negated ones. + AddClassEscape(alloc, type, ranges); + break; + case 'S': + AddClassNegated(kSpaceAndSurrogateRanges, kSpaceAndSurrogateRangeCount, ranges); + break; + case 'W': + AddClassNegated(kWordAndSurrogateRanges, kWordAndSurrogateRangeCount, ranges); + break; + case 'D': + AddClassNegated(kDigitAndSurrogateRanges, kDigitAndSurrogateRangeCount, ranges); + break; + default: + MOZ_CRASH("Bad type!"); + } +} + // We need to check for the following characters: 0x39c 0x3bc 0x178. static inline bool RangeContainsLatin1Equivalents(CharacterRange range) diff --git a/js/src/irregexp/RegExpEngine.h b/js/src/irregexp/RegExpEngine.h index bf5766086..8b8821eaa 100644 --- a/js/src/irregexp/RegExpEngine.h +++ b/js/src/irregexp/RegExpEngine.h @@ -144,6 +144,8 @@ class CharacterRange {} static void AddClassEscape(LifoAlloc* alloc, char16_t type, CharacterRangeVector* ranges); + static void AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type, + CharacterRangeVector* ranges); static inline CharacterRange Singleton(char16_t value) { return CharacterRange(value, value); diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp index ad09996e4..976a87d8d 100644 --- a/js/src/irregexp/RegExpParser.cpp +++ b/js/src/irregexp/RegExpParser.cpp @@ -611,9 +611,23 @@ AddCharOrEscapeUnicode(LifoAlloc* alloc, char16_t char_class, widechar c) { - if (char_class != kNoCharClass) - CharacterRange::AddClassEscape(alloc, char_class, ranges); - else if (unicode::IsLeadSurrogate(c)) + if (char_class != kNoCharClass) { + CharacterRange::AddClassEscapeUnicode(alloc, char_class, ranges); + switch (char_class) { + case 'S': + case 'W': + case 'D': + lead_ranges->append(LeadSurrogateRange()); + trail_ranges->append(TrailSurrogateRange()); + wide_ranges->append(NonBMPRange()); + break; + case '.': + MOZ_CRASH("Bad char_class!"); + } + return; + } + + if 
(unicode::IsLeadSurrogate(c)) lead_ranges->append(CharacterRange::Singleton(c)); else if (unicode::IsTrailSurrogate(c)) trail_ranges->append(CharacterRange::Singleton(c)); @@ -1213,6 +1227,19 @@ UnicodeEverythingAtom(LifoAlloc* alloc) return builder->ToRegExp(); } +// Build the atom ParseDisjunction uses for \D, \S and \W when unicode_ is set. +RegExpTree* +UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class) +{ + CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + CharacterRangeVector* lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + CharacterRangeVector* trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc); + WideCharRangeVector* wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc); + AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, 0); + + return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false); +} + // Disjunction :: // Alternative // Alternative | Disjunction @@ -1377,7 +1404,15 @@ RegExpParser::ParseDisjunction() // // CharacterClassEscape :: one of // d D s S w W - case 'd': case 'D': case 's': case 'S': case 'w': case 'W': { + case 'D': case 'S': case 'W': + if (unicode_) { + Advance(); + builder->AddAtom(UnicodeCharacterClassEscapeAtom(alloc, current())); + Advance(); + break; + } + // Fall through + case 'd': case 's': case 'w': { widechar c = Next(); Advance(2); CharacterRangeVector* ranges = diff --git a/js/src/tests/ecma_6/RegExp/unicode-character-class-escape.js b/js/src/tests/ecma_6/RegExp/unicode-character-class-escape.js new file mode 100644 index 000000000..175207d5a --- /dev/null +++ b/js/src/tests/ecma_6/RegExp/unicode-character-class-escape.js @@ -0,0 +1,75 @@ +var BUGNUMBER = 1135377; +var summary = "Implement RegExp unicode flag -- CharacterClassEscape."; + +print(BUGNUMBER + ": " + summary); + +// BMP + +assertEqArray(/\d+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["0123456789"]); +assertEqArray(/\D+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["abcxyzABCXYZ"]); + 
+assertEqArray(/\s+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["\t\r\n\v\x0c\xa0\uFEFF"]); +assertEqArray(/\S+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["abcxyzABCXYZ0123456789_"]); + +assertEqArray(/\w+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["abcxyzABCXYZ0123456789_"]); +assertEqArray(/\W+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["\t\r\n\v\x0c\xa0\uFEFF*"]); + +assertEqArray(/\n+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["\n"]); + +assertEqArray(/[\d]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["0123456789"]); +assertEqArray(/[\D]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["abcxyzABCXYZ"]); + +assertEqArray(/[\s]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["\t\r\n\v\x0c\xa0\uFEFF"]); +assertEqArray(/[\S]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["abcxyzABCXYZ0123456789_"]); + +assertEqArray(/[\w]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["abcxyzABCXYZ0123456789_"]); +assertEqArray(/[\W]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["\t\r\n\v\x0c\xa0\uFEFF*"]); + +assertEqArray(/[\n]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"), + ["\n"]); + +// non-BMP + +function testNonBMP(re) { + assertEqArray(re.exec("\uD83D\uDBFF"), + ["\uD83D"]); + assertEqArray(re.exec("\uD83D\uDC00"), + ["\uD83D\uDC00"]); + assertEqArray(re.exec("\uD83D\uDFFF"), + ["\uD83D\uDFFF"]); + assertEqArray(re.exec("\uD83D\uE000"), + ["\uD83D"]); + + assertEqArray(re.exec("\uD7FF\uDC38"), + ["\uD7FF"]); + assertEqArray(re.exec("\uD800\uDC38"), + ["\uD800\uDC38"]); + assertEqArray(re.exec("\uDBFF\uDC38"), + ["\uDBFF\uDC38"]); + assertEqArray(re.exec("\uDC00\uDC38"), + ["\uDC00"]); +} + +testNonBMP(/\D/u); +testNonBMP(/\S/u); +testNonBMP(/\W/u); + +testNonBMP(/[\D]/u); +testNonBMP(/[\S]/u); +testNonBMP(/[\W]/u); + +if (typeof reportCompare === "function") + reportCompare(true, true);