From f31a9f9e84f905220366aeb6780b0bae0a72b435 Mon Sep 17 00:00:00 2001 From: Tooru Fujisawa Date: Fri, 7 Aug 2015 08:12:05 +0900 Subject: [PATCH] #393, Bug 1135377 - Part 7: Support ignoreCase for non-BMP in RegExp with unicode flag. r=till, f=anba --- js/src/irregexp/RegExpParser.cpp | 104 +++++++- .../ecma_6/RegExp/unicode-class-ignoreCase.js | 28 ++ .../tests/ecma_6/RegExp/unicode-ignoreCase.js | 246 ++++++++++++++++++ js/src/vm/Unicode.h | 8 + js/src/vm/make_unicode.py | 6 +- 5 files changed, 380 insertions(+), 12 deletions(-) create mode 100644 js/src/tests/ecma_6/RegExp/unicode-class-ignoreCase.js diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp index 1d582052a..52ee9942c 100644 --- a/js/src/irregexp/RegExpParser.cpp +++ b/js/src/irregexp/RegExpParser.cpp @@ -558,6 +558,7 @@ class WideCharRange return WideCharRange(from, to); } + bool Contains(widechar i) const { return from_ <= i && i <= to_; } widechar from() const { return from_; } widechar to() const { return to_; } @@ -747,14 +748,73 @@ NegateUnicodeRanges(LifoAlloc* alloc, Vectorlength(); i++) { + const WideCharRange& range = (*wide_ranges)[i]; + if (range.Contains(c)) + return true; + } + return false; +} + +static void +CalculateCaseInsensitiveRanges(LifoAlloc* alloc, widechar from, widechar to, int32_t diff, + WideCharRangeVector* wide_ranges, + WideCharRangeVector** tmp_wide_ranges) +{ + widechar contains_from = 0; + widechar contains_to = 0; + for (widechar c = from; c <= to; c++) { + if (WideCharRangesContain(wide_ranges, c) && + !WideCharRangesContain(wide_ranges, c + diff)) + { + if (contains_from == 0) + contains_from = c; + contains_to = c; + } else if (contains_from != 0) { + if (!*tmp_wide_ranges) + *tmp_wide_ranges = alloc->newInfallible(*alloc); + + (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff, + contains_to + diff)); + contains_from = 0; + } + } + + if (contains_from != 0) { + if (!*tmp_wide_ranges) + *tmp_wide_ranges = alloc->newInfallible(*alloc); + + (*tmp_wide_ranges)->append(WideCharRange::Range(contains_from + diff, + contains_to + diff)); + } +} + static RegExpTree* UnicodeRangesAtom(LifoAlloc* alloc, CharacterRangeVector* ranges, CharacterRangeVector* lead_ranges, CharacterRangeVector* trail_ranges, WideCharRangeVector* wide_ranges, - bool is_negated) + bool is_negated, + bool ignore_case) { + // Calculate case folding for non-BMP first and negate the range if needed. + if (ignore_case) { + WideCharRangeVector* tmp_wide_ranges = nullptr; +#define CALL_CALC(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ + CalculateCaseInsensitiveRanges(alloc, FROM, TO, DIFF, wide_ranges, &tmp_wide_ranges); + FOR_EACH_NON_BMP_CASE_FOLDING(CALL_CALC) +#undef CALL_CALC + + if (tmp_wide_ranges) { + for (size_t i = 0; i < tmp_wide_ranges->length(); i++) + wide_ranges->append((*tmp_wide_ranges)[i]); + } + } + if (is_negated) { NegateUnicodeRanges(alloc, &lead_ranges, LeadSurrogateRange()); NegateUnicodeRanges(alloc, &trail_ranges, TrailSurrogateRange()); @@ -952,7 +1012,8 @@ RegExpParser::ParseCharacterClass() return alloc->newInfallible(ranges, true); } - return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, is_negated); + return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, is_negated, + ignore_case_); } template @@ -1166,8 +1227,30 @@ RegExpParser::ParsePattern() } static inline RegExpTree* -SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail) +CaseFoldingSurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, int32_t diff) { + RegExpBuilder* builder = alloc->newInfallible(alloc); + + builder->AddCharacter(lead); + CharacterRangeVector* ranges = alloc->newInfallible(*alloc); + ranges->append(CharacterRange::Range(trail, trail)); + ranges->append(CharacterRange::Range(trail + diff, trail + diff)); + builder->AddAtom(alloc->newInfallible(ranges, false)); + + return builder->ToRegExp(); +} + +static inline RegExpTree* +SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, bool ignore_case) +{ + if (ignore_case) { +#define CALL_ATOM(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ + if (lead == LEAD &&trail >= TRAIL_FROM && trail <= TRAIL_TO) \ + return CaseFoldingSurrogatePairAtom(alloc, lead, trail, DIFF); + FOR_EACH_NON_BMP_CASE_FOLDING(CALL_ATOM) +#undef CALL_ATOM + } + RegExpBuilder* builder = alloc->newInfallible(alloc); builder->AddCharacter(lead); builder->AddCharacter(trail); @@ -1239,7 +1322,7 @@ UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool igno AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, 0, ignore_case); - return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false); + return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false, false); } // Disjunction :: @@ -1523,17 +1606,20 @@ RegExpParser::ParseDisjunction() } else if (value >= unicode::NonBMPMin) { size_t lead, trail; unicode::UTF16Encode(value, &lead, &trail); - builder->AddAtom(SurrogatePairAtom(alloc, lead, trail)); + builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, + ignore_case_)); } else { builder->AddCharacter(value); } } else if (ParseHexEscape(4, &value)) { if (unicode::IsLeadSurrogate(value)) { size_t trail; - if (ParseTrailSurrogate(&trail)) - builder->AddAtom(SurrogatePairAtom(alloc, value, trail)); - else + if (ParseTrailSurrogate(&trail)) { + builder->AddAtom(SurrogatePairAtom(alloc, value, trail, + ignore_case_)); + } else { builder->AddAtom(LeadSurrogateAtom(alloc, value)); + } } else if (unicode::IsTrailSurrogate(value)) { builder->AddAtom(TrailSurrogateAtom(alloc, value)); } else { @@ -1568,7 +1654,7 @@ RegExpParser::ParseDisjunction() if (unicode_) { char16_t lead, trail; if (ParseRawSurrogatePair(&lead, &trail)) { - builder->AddAtom(SurrogatePairAtom(alloc, lead, trail)); + builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, ignore_case_)); } else { widechar c = current(); if (unicode::IsLeadSurrogate(c)) diff --git a/js/src/tests/ecma_6/RegExp/unicode-class-ignoreCase.js b/js/src/tests/ecma_6/RegExp/unicode-class-ignoreCase.js new file mode 100644 index 000000000..afa7705c7 --- /dev/null +++ b/js/src/tests/ecma_6/RegExp/unicode-class-ignoreCase.js @@ -0,0 +1,28 @@ +var BUGNUMBER = 1135377; +var summary = "Implement RegExp unicode flag -- ignoreCase flag for CharacterClass."; + +print(BUGNUMBER + ": " + summary); + +assertEqArray(/[ABC]+/iu.exec("DCBAabcd"), + ["CBAabc"]); + +assertEqArray(/[A\u{10401}]+/iu.exec("A\u{10401}a\u{10429}"), + ["A\u{10401}a\u{10429}"]); + +assertEqArray(/[\u{10401}-\u{10404}\u{10408}-\u{1040B}]+/iu.exec("\u{10400}\u{10401}\u{10402}\u{10403}\u{10404}\u{10408}\u{10409}\u{1040A}\u{1040B}\u{1040C}"), + ["\u{10401}\u{10402}\u{10403}\u{10404}\u{10408}\u{10409}\u{1040A}\u{1040B}"]); +assertEqArray(/[\u{10401}-\u{10404}\u{10408}-\u{1040B}]+/iu.exec("\u{10428}\u{10429}\u{1042A}\u{1042B}\u{1042C}\u{10430}\u{10431}\u{10432}\u{10433}\u{10434}"), + ["\u{10429}\u{1042A}\u{1042B}\u{1042C}\u{10430}\u{10431}\u{10432}\u{10433}"]); + +assertEqArray(/[\u{10429}-\u{1042C}\u{10430}-\u{10433}]+/iu.exec("\u{10400}\u{10401}\u{10402}\u{10403}\u{10404}\u{10408}\u{10409}\u{1040A}\u{1040B}\u{1040C}"), + ["\u{10401}\u{10402}\u{10403}\u{10404}\u{10408}\u{10409}\u{1040A}\u{1040B}"]); +assertEqArray(/[\u{10429}-\u{1042C}\u{10430}-\u{10433}]+/iu.exec("\u{10428}\u{10429}\u{1042A}\u{1042B}\u{1042C}\u{10430}\u{10431}\u{10432}\u{10433}\u{10434}"), + ["\u{10429}\u{1042A}\u{1042B}\u{1042C}\u{10430}\u{10431}\u{10432}\u{10433}"]); + +assertEqArray(/[\u{10401}-\u{10404}\u{10430}-\u{10433}]+/iu.exec("\u{10400}\u{10401}\u{10402}\u{10403}\u{10404}\u{10408}\u{10409}\u{1040A}\u{1040B}\u{1040C}"), + ["\u{10401}\u{10402}\u{10403}\u{10404}\u{10408}\u{10409}\u{1040A}\u{1040B}"]); +assertEqArray(/[\u{10401}-\u{10404}\u{10430}-\u{10433}]+/iu.exec("\u{10428}\u{10429}\u{1042A}\u{1042B}\u{1042C}\u{10430}\u{10431}\u{10432}\u{10433}\u{10434}"), + ["\u{10429}\u{1042A}\u{1042B}\u{1042C}\u{10430}\u{10431}\u{10432}\u{10433}"]); + +if (typeof reportCompare === "function") + reportCompare(true, true); diff --git a/js/src/tests/ecma_6/RegExp/unicode-ignoreCase.js b/js/src/tests/ecma_6/RegExp/unicode-ignoreCase.js index 1eb8a0889..dac67abba 100644 --- a/js/src/tests/ecma_6/RegExp/unicode-ignoreCase.js +++ b/js/src/tests/ecma_6/RegExp/unicode-ignoreCase.js @@ -2240,6 +2240,252 @@ test(0xff57,0xff37); test(0xff58,0xff38); test(0xff59,0xff39); test(0xff5a,0xff3a); +test(0x10400,0x10428); +test(0x10401,0x10429); +test(0x10402,0x1042a); +test(0x10403,0x1042b); +test(0x10404,0x1042c); +test(0x10405,0x1042d); +test(0x10406,0x1042e); +test(0x10407,0x1042f); +test(0x10408,0x10430); +test(0x10409,0x10431); +test(0x1040a,0x10432); +test(0x1040b,0x10433); +test(0x1040c,0x10434); +test(0x1040d,0x10435); +test(0x1040e,0x10436); +test(0x1040f,0x10437); +test(0x10410,0x10438); +test(0x10411,0x10439); +test(0x10412,0x1043a); +test(0x10413,0x1043b); +test(0x10414,0x1043c); +test(0x10415,0x1043d); +test(0x10416,0x1043e); +test(0x10417,0x1043f); +test(0x10418,0x10440); +test(0x10419,0x10441); +test(0x1041a,0x10442); +test(0x1041b,0x10443); +test(0x1041c,0x10444); +test(0x1041d,0x10445); +test(0x1041e,0x10446); +test(0x1041f,0x10447); +test(0x10420,0x10448); +test(0x10421,0x10449); +test(0x10422,0x1044a); +test(0x10423,0x1044b); +test(0x10424,0x1044c); +test(0x10425,0x1044d); +test(0x10426,0x1044e); +test(0x10427,0x1044f); +test(0x10428,0x10400); +test(0x10429,0x10401); +test(0x1042a,0x10402); +test(0x1042b,0x10403); +test(0x1042c,0x10404); +test(0x1042d,0x10405); +test(0x1042e,0x10406); +test(0x1042f,0x10407); +test(0x10430,0x10408); +test(0x10431,0x10409); +test(0x10432,0x1040a); +test(0x10433,0x1040b); +test(0x10434,0x1040c); +test(0x10435,0x1040d); +test(0x10436,0x1040e); +test(0x10437,0x1040f); +test(0x10438,0x10410); +test(0x10439,0x10411); +test(0x1043a,0x10412); +test(0x1043b,0x10413); +test(0x1043c,0x10414); +test(0x1043d,0x10415); +test(0x1043e,0x10416); +test(0x1043f,0x10417); +test(0x10440,0x10418); +test(0x10441,0x10419); +test(0x10442,0x1041a); +test(0x10443,0x1041b); +test(0x10444,0x1041c); +test(0x10445,0x1041d); +test(0x10446,0x1041e); +test(0x10447,0x1041f); +test(0x10448,0x10420); +test(0x10449,0x10421); +test(0x1044a,0x10422); +test(0x1044b,0x10423); +test(0x1044c,0x10424); +test(0x1044d,0x10425); +test(0x1044e,0x10426); +test(0x1044f,0x10427); +test(0x10c80,0x10cc0); +test(0x10c81,0x10cc1); +test(0x10c82,0x10cc2); +test(0x10c83,0x10cc3); +test(0x10c84,0x10cc4); +test(0x10c85,0x10cc5); +test(0x10c86,0x10cc6); +test(0x10c87,0x10cc7); +test(0x10c88,0x10cc8); +test(0x10c89,0x10cc9); +test(0x10c8a,0x10cca); +test(0x10c8b,0x10ccb); +test(0x10c8c,0x10ccc); +test(0x10c8d,0x10ccd); +test(0x10c8e,0x10cce); +test(0x10c8f,0x10ccf); +test(0x10c90,0x10cd0); +test(0x10c91,0x10cd1); +test(0x10c92,0x10cd2); +test(0x10c93,0x10cd3); +test(0x10c94,0x10cd4); +test(0x10c95,0x10cd5); +test(0x10c96,0x10cd6); +test(0x10c97,0x10cd7); +test(0x10c98,0x10cd8); +test(0x10c99,0x10cd9); +test(0x10c9a,0x10cda); +test(0x10c9b,0x10cdb); +test(0x10c9c,0x10cdc); +test(0x10c9d,0x10cdd); +test(0x10c9e,0x10cde); +test(0x10c9f,0x10cdf); +test(0x10ca0,0x10ce0); +test(0x10ca1,0x10ce1); +test(0x10ca2,0x10ce2); +test(0x10ca3,0x10ce3); +test(0x10ca4,0x10ce4); +test(0x10ca5,0x10ce5); +test(0x10ca6,0x10ce6); +test(0x10ca7,0x10ce7); +test(0x10ca8,0x10ce8); +test(0x10ca9,0x10ce9); +test(0x10caa,0x10cea); +test(0x10cab,0x10ceb); +test(0x10cac,0x10cec); +test(0x10cad,0x10ced); +test(0x10cae,0x10cee); +test(0x10caf,0x10cef); +test(0x10cb0,0x10cf0); +test(0x10cb1,0x10cf1); +test(0x10cb2,0x10cf2); +test(0x10cc0,0x10c80); +test(0x10cc1,0x10c81); +test(0x10cc2,0x10c82); +test(0x10cc3,0x10c83); +test(0x10cc4,0x10c84); +test(0x10cc5,0x10c85); +test(0x10cc6,0x10c86); +test(0x10cc7,0x10c87); +test(0x10cc8,0x10c88); +test(0x10cc9,0x10c89); +test(0x10cca,0x10c8a); +test(0x10ccb,0x10c8b); +test(0x10ccc,0x10c8c); +test(0x10ccd,0x10c8d); +test(0x10cce,0x10c8e); +test(0x10ccf,0x10c8f); +test(0x10cd0,0x10c90); +test(0x10cd1,0x10c91); +test(0x10cd2,0x10c92); +test(0x10cd3,0x10c93); +test(0x10cd4,0x10c94); +test(0x10cd5,0x10c95); +test(0x10cd6,0x10c96); +test(0x10cd7,0x10c97); +test(0x10cd8,0x10c98); +test(0x10cd9,0x10c99); +test(0x10cda,0x10c9a); +test(0x10cdb,0x10c9b); +test(0x10cdc,0x10c9c); +test(0x10cdd,0x10c9d); +test(0x10cde,0x10c9e); +test(0x10cdf,0x10c9f); +test(0x10ce0,0x10ca0); +test(0x10ce1,0x10ca1); +test(0x10ce2,0x10ca2); +test(0x10ce3,0x10ca3); +test(0x10ce4,0x10ca4); +test(0x10ce5,0x10ca5); +test(0x10ce6,0x10ca6); +test(0x10ce7,0x10ca7); +test(0x10ce8,0x10ca8); +test(0x10ce9,0x10ca9); +test(0x10cea,0x10caa); +test(0x10ceb,0x10cab); +test(0x10cec,0x10cac); +test(0x10ced,0x10cad); +test(0x10cee,0x10cae); +test(0x10cef,0x10caf); +test(0x10cf0,0x10cb0); +test(0x10cf1,0x10cb1); +test(0x10cf2,0x10cb2); +test(0x118a0,0x118c0); +test(0x118a1,0x118c1); +test(0x118a2,0x118c2); +test(0x118a3,0x118c3); +test(0x118a4,0x118c4); +test(0x118a5,0x118c5); +test(0x118a6,0x118c6); +test(0x118a7,0x118c7); +test(0x118a8,0x118c8); +test(0x118a9,0x118c9); +test(0x118aa,0x118ca); +test(0x118ab,0x118cb); +test(0x118ac,0x118cc); +test(0x118ad,0x118cd); +test(0x118ae,0x118ce); +test(0x118af,0x118cf); +test(0x118b0,0x118d0); +test(0x118b1,0x118d1); +test(0x118b2,0x118d2); +test(0x118b3,0x118d3); +test(0x118b4,0x118d4); +test(0x118b5,0x118d5); +test(0x118b6,0x118d6); +test(0x118b7,0x118d7); +test(0x118b8,0x118d8); +test(0x118b9,0x118d9); +test(0x118ba,0x118da); +test(0x118bb,0x118db); +test(0x118bc,0x118dc); +test(0x118bd,0x118dd); +test(0x118be,0x118de); +test(0x118bf,0x118df); +test(0x118c0,0x118a0); +test(0x118c1,0x118a1); +test(0x118c2,0x118a2); +test(0x118c3,0x118a3); +test(0x118c4,0x118a4); +test(0x118c5,0x118a5); +test(0x118c6,0x118a6); +test(0x118c7,0x118a7); +test(0x118c8,0x118a8); +test(0x118c9,0x118a9); +test(0x118ca,0x118aa); +test(0x118cb,0x118ab); +test(0x118cc,0x118ac); +test(0x118cd,0x118ad); +test(0x118ce,0x118ae); +test(0x118cf,0x118af); +test(0x118d0,0x118b0); +test(0x118d1,0x118b1); +test(0x118d2,0x118b2); +test(0x118d3,0x118b3); +test(0x118d4,0x118b4); +test(0x118d5,0x118b5); +test(0x118d6,0x118b6); +test(0x118d7,0x118b7); +test(0x118d8,0x118b8); +test(0x118d9,0x118b9); +test(0x118da,0x118ba); +test(0x118db,0x118bb); +test(0x118dc,0x118bc); +test(0x118dd,0x118bd); +test(0x118de,0x118be); +test(0x118df,0x118bf); if (typeof reportCompare === "function") reportCompare(true, true); diff --git a/js/src/vm/Unicode.h b/js/src/vm/Unicode.h index 3ab452c2a..16ae39891 100644 --- a/js/src/vm/Unicode.h +++ b/js/src/vm/Unicode.h @@ -324,4 +324,12 @@ UTF16Decode(size_t lead, size_t trail) } /* namespace unicode */ } /* namespace js */ +#define FOR_EACH_NON_BMP_CASE_FOLDING(macro) \ + macro(0x10400, 0x10427, 0xD801, 0xDC00, 0xDC27, 0x28) \ + macro(0x10428, 0x1044F, 0xD801, 0xDC28, 0xDC4F, -0x28) \ + macro(0x10C80, 0x10CB2, 0xD803, 0xDC80, 0xDCB2, 0x40) \ + macro(0x10CC0, 0x10CF2, 0xD803, 0xDCC0, 0xDCF2, -0x40) \ + macro(0x118A0, 0x118bf, 0xD806, 0xDCA0, 0xDCBF, 0x20) \ + macro(0x118C0, 0x118df, 0xD806, 0xDCC0, 0xDCDF, -0x20) + #endif /* vm_Unicode_h */ diff --git a/js/src/vm/make_unicode.py b/js/src/vm/make_unicode.py index 19c0aab58..63d9f0654 100644 --- a/js/src/vm/make_unicode.py +++ b/js/src/vm/make_unicode.py @@ -177,9 +177,6 @@ def generate_unicode_stuff(unicode_data, case_folding, folding_codes.add(mapping) for code in sorted(folding_codes): - if code > MAX: - continue - if code in folding_map: folding = folding_map[code] else: @@ -200,6 +197,9 @@ def generate_unicode_stuff(unicode_data, case_folding, item.append(folding) folding_tests.append(item + rev_folding) + if code > MAX: + continue + folding_d = folding - code rev_folding_ds = [v - code for v in rev_folding]