#393, Bug 1135377 - Part 7: Support ignoreCase for non-BMP in RegExp with unicode flag. r=till, f=anba

This commit is contained in:
Tooru Fujisawa 2015-08-07 08:12:05 +09:00 committed by Cameron Kaiser
parent cf744e9d35
commit f31a9f9e84
5 changed files with 380 additions and 12 deletions

View File

@ -558,6 +558,7 @@ class WideCharRange
return WideCharRange(from, to);
}
bool Contains(widechar i) const { return from_ <= i && i <= to_; }
widechar from() const { return from_; }
widechar to() const { return to_; }
@ -747,14 +748,73 @@ NegateUnicodeRanges(LifoAlloc* alloc, Vector<RangeType, 1, LifoAllocPolicy<Infal
*ranges = tmp_ranges;
}
// Returns true when at least one range stored in |wide_ranges| covers the
// code point |c|; returns false otherwise (including for an empty vector).
static bool
WideCharRangesContain(WideCharRangeVector* wide_ranges, widechar c)
{
    // Plain linear scan; stop at the first range that covers |c|.
    for (size_t idx = 0, count = wide_ranges->length(); idx < count; ++idx) {
        if ((*wide_ranges)[idx].Contains(c))
            return true;
    }
    return false;
}
// Walks every code point in [from, to]. A code point that |wide_ranges|
// contains, while its counterpart at offset |diff| is not contained, needs its
// counterpart added. Consecutive such code points are coalesced into one run,
// and each finished run is appended (shifted by |diff|) to |*tmp_wide_ranges|.
//
// |*tmp_wide_ranges| is created on first use, so it is left untouched when
// nothing has to be added. |wide_ranges| itself is only read here.
static void
CalculateCaseInsensitiveRanges(LifoAlloc* alloc, widechar from, widechar to, int32_t diff,
                               WideCharRangeVector* wide_ranges,
                               WideCharRangeVector** tmp_wide_ranges)
{
    // A start of 0 doubles as the "no run open" marker.
    // NOTE(review): this relies on |from| never being 0; the folding table
    // visible in this change only passes non-BMP values.
    widechar run_start = 0;
    widechar run_end = 0;

    for (widechar cur = from; cur <= to; cur++) {
        bool needs_counterpart = WideCharRangesContain(wide_ranges, cur) &&
                                 !WideCharRangesContain(wide_ranges, cur + diff);
        if (needs_counterpart) {
            // Open a new run or extend the current one.
            if (run_start == 0)
                run_start = cur;
            run_end = cur;
            continue;
        }

        // Nothing pending: keep scanning.
        if (run_start == 0)
            continue;

        // The run just ended; emit its shifted image and reset.
        if (!*tmp_wide_ranges)
            *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
        (*tmp_wide_ranges)->append(WideCharRange::Range(run_start + diff,
                                                        run_end + diff));
        run_start = 0;
    }

    // No run reached the end of the scanned interval.
    if (run_start == 0)
        return;

    // Flush the run that was still open when the loop finished.
    if (!*tmp_wide_ranges)
        *tmp_wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
    (*tmp_wide_ranges)->append(WideCharRange::Range(run_start + diff,
                                                    run_end + diff));
}
static RegExpTree*
UnicodeRangesAtom(LifoAlloc* alloc,
CharacterRangeVector* ranges,
CharacterRangeVector* lead_ranges,
CharacterRangeVector* trail_ranges,
WideCharRangeVector* wide_ranges,
bool is_negated)
bool is_negated,
bool ignore_case)
{
// Calculate case folding for non-BMP first and negate the range if needed.
if (ignore_case) {
WideCharRangeVector* tmp_wide_ranges = nullptr;
#define CALL_CALC(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
CalculateCaseInsensitiveRanges(alloc, FROM, TO, DIFF, wide_ranges, &tmp_wide_ranges);
FOR_EACH_NON_BMP_CASE_FOLDING(CALL_CALC)
#undef CALL_CALC
if (tmp_wide_ranges) {
for (size_t i = 0; i < tmp_wide_ranges->length(); i++)
wide_ranges->append((*tmp_wide_ranges)[i]);
}
}
if (is_negated) {
NegateUnicodeRanges(alloc, &lead_ranges, LeadSurrogateRange());
NegateUnicodeRanges(alloc, &trail_ranges, TrailSurrogateRange());
@ -952,7 +1012,8 @@ RegExpParser<CharT>::ParseCharacterClass()
return alloc->newInfallible<RegExpCharacterClass>(ranges, true);
}
return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, is_negated);
return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, is_negated,
ignore_case_);
}
template <typename CharT>
@ -1166,8 +1227,30 @@ RegExpParser<CharT>::ParsePattern()
}
static inline RegExpTree*
SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail)
CaseFoldingSurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, int32_t diff)
{
    // Builds a two-part sequence: the literal |lead| unit, followed by a
    // non-negated character class that accepts either |trail| or
    // |trail + diff|.
    RegExpBuilder* pair_builder = alloc->newInfallible<RegExpBuilder>(alloc);
    pair_builder->AddCharacter(lead);

    // Two single-element ranges: the trail unit as written and its
    // counterpart at offset |diff|.
    CharacterRangeVector* trail_choices = alloc->newInfallible<CharacterRangeVector>(*alloc);
    trail_choices->append(CharacterRange::Range(trail, trail));
    trail_choices->append(CharacterRange::Range(trail + diff, trail + diff));

    RegExpTree* trail_class = alloc->newInfallible<RegExpCharacterClass>(trail_choices, false);
    pair_builder->AddAtom(trail_class);

    return pair_builder->ToRegExp();
}
static inline RegExpTree*
SurrogatePairAtom(LifoAlloc* alloc, char16_t lead, char16_t trail, bool ignore_case)
{
if (ignore_case) {
#define CALL_ATOM(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
if (lead == LEAD &&trail >= TRAIL_FROM && trail <= TRAIL_TO) \
return CaseFoldingSurrogatePairAtom(alloc, lead, trail, DIFF);
FOR_EACH_NON_BMP_CASE_FOLDING(CALL_ATOM)
#undef CALL_ATOM
}
RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
builder->AddCharacter(lead);
builder->AddCharacter(trail);
@ -1239,7 +1322,7 @@ UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool igno
AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, 0,
ignore_case);
return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false);
return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false, false);
}
// Disjunction ::
@ -1523,17 +1606,20 @@ RegExpParser<CharT>::ParseDisjunction()
} else if (value >= unicode::NonBMPMin) {
size_t lead, trail;
unicode::UTF16Encode(value, &lead, &trail);
builder->AddAtom(SurrogatePairAtom(alloc, lead, trail));
builder->AddAtom(SurrogatePairAtom(alloc, lead, trail,
ignore_case_));
} else {
builder->AddCharacter(value);
}
} else if (ParseHexEscape(4, &value)) {
if (unicode::IsLeadSurrogate(value)) {
size_t trail;
if (ParseTrailSurrogate(&trail))
builder->AddAtom(SurrogatePairAtom(alloc, value, trail));
else
if (ParseTrailSurrogate(&trail)) {
builder->AddAtom(SurrogatePairAtom(alloc, value, trail,
ignore_case_));
} else {
builder->AddAtom(LeadSurrogateAtom(alloc, value));
}
} else if (unicode::IsTrailSurrogate(value)) {
builder->AddAtom(TrailSurrogateAtom(alloc, value));
} else {
@ -1568,7 +1654,7 @@ RegExpParser<CharT>::ParseDisjunction()
if (unicode_) {
char16_t lead, trail;
if (ParseRawSurrogatePair(&lead, &trail)) {
builder->AddAtom(SurrogatePairAtom(alloc, lead, trail));
builder->AddAtom(SurrogatePairAtom(alloc, lead, trail, ignore_case_));
} else {
widechar c = current();
if (unicode::IsLeadSurrogate(c))

View File

@ -0,0 +1,28 @@
// Regression test: character classes in regular expressions that carry both
// the ignoreCase (i) and unicode (u) flags, including non-BMP members.
var BUGNUMBER = 1135377;
var summary = "Implement RegExp unicode flag -- ignoreCase flag for CharacterClass.";
print(BUGNUMBER + ": " + summary);
// BMP-only class: both cases of A, B and C match; "D" and "d" at the edges do not.
assertEqArray(/[ABC]+/iu.exec("DCBAabcd"),
["CBAabc"]);
// Class mixing a BMP member with a non-BMP one; U+10429 (U+10401 + 0x28) is
// expected to match as the counterpart of U+10401.
assertEqArray(/[A\u{10401}]+/iu.exec("A\u{10401}a\u{10429}"),
["A\u{10401}a\u{10429}"]);
// Class written as U+10401-U+10404 and U+10408-U+1040B, matched against the
// same block; U+10400 and U+1040C at the edges of the subject stay outside.
assertEqArray(/[\u{10401}-\u{10404}\u{10408}-\u{1040B}]+/iu.exec("\u{10400}\u{10401}\u{10402}\u{10403}\u{10404}\u{10408}\u{10409}\u{1040A}\u{1040B}\u{1040C}"),
["\u{10401}\u{10402}\u{10403}\u{10404}\u{10408}\u{10409}\u{1040A}\u{1040B}"]);
// Same class, matched against the counterpart block (each code point + 0x28);
// U+10428 and U+10434 at the edges stay outside.
assertEqArray(/[\u{10401}-\u{10404}\u{10408}-\u{1040B}]+/iu.exec("\u{10428}\u{10429}\u{1042A}\u{1042B}\u{1042C}\u{10430}\u{10431}\u{10432}\u{10433}\u{10434}"),
["\u{10429}\u{1042A}\u{1042B}\u{1042C}\u{10430}\u{10431}\u{10432}\u{10433}"]);
// The two checks above repeated with the class written using the counterpart
// code points (U+10429-U+1042C and U+10430-U+10433); expected matches are the same.
assertEqArray(/[\u{10429}-\u{1042C}\u{10430}-\u{10433}]+/iu.exec("\u{10400}\u{10401}\u{10402}\u{10403}\u{10404}\u{10408}\u{10409}\u{1040A}\u{1040B}\u{1040C}"),
["\u{10401}\u{10402}\u{10403}\u{10404}\u{10408}\u{10409}\u{1040A}\u{1040B}"]);
assertEqArray(/[\u{10429}-\u{1042C}\u{10430}-\u{10433}]+/iu.exec("\u{10428}\u{10429}\u{1042A}\u{1042B}\u{1042C}\u{10430}\u{10431}\u{10432}\u{10433}\u{10434}"),
["\u{10429}\u{1042A}\u{1042B}\u{1042C}\u{10430}\u{10431}\u{10432}\u{10433}"]);
// Class that takes one range from each block (U+10401-U+10404 and
// U+10430-U+10433); expected matches are again the same as above.
assertEqArray(/[\u{10401}-\u{10404}\u{10430}-\u{10433}]+/iu.exec("\u{10400}\u{10401}\u{10402}\u{10403}\u{10404}\u{10408}\u{10409}\u{1040A}\u{1040B}\u{1040C}"),
["\u{10401}\u{10402}\u{10403}\u{10404}\u{10408}\u{10409}\u{1040A}\u{1040B}"]);
assertEqArray(/[\u{10401}-\u{10404}\u{10430}-\u{10433}]+/iu.exec("\u{10428}\u{10429}\u{1042A}\u{1042B}\u{1042C}\u{10430}\u{10431}\u{10432}\u{10433}\u{10434}"),
["\u{10429}\u{1042A}\u{1042B}\u{1042C}\u{10430}\u{10431}\u{10432}\u{10433}"]);
// reportCompare is only called when it exists; presumably the test harness
// supplies it -- confirm against the harness in use.
if (typeof reportCompare === "function")
reportCompare(true, true);

View File

@ -2240,6 +2240,252 @@ test(0xff57,0xff37);
test(0xff58,0xff38);
test(0xff59,0xff39);
test(0xff5a,0xff3a);
test(0x10400,0x10428);
test(0x10401,0x10429);
test(0x10402,0x1042a);
test(0x10403,0x1042b);
test(0x10404,0x1042c);
test(0x10405,0x1042d);
test(0x10406,0x1042e);
test(0x10407,0x1042f);
test(0x10408,0x10430);
test(0x10409,0x10431);
test(0x1040a,0x10432);
test(0x1040b,0x10433);
test(0x1040c,0x10434);
test(0x1040d,0x10435);
test(0x1040e,0x10436);
test(0x1040f,0x10437);
test(0x10410,0x10438);
test(0x10411,0x10439);
test(0x10412,0x1043a);
test(0x10413,0x1043b);
test(0x10414,0x1043c);
test(0x10415,0x1043d);
test(0x10416,0x1043e);
test(0x10417,0x1043f);
test(0x10418,0x10440);
test(0x10419,0x10441);
test(0x1041a,0x10442);
test(0x1041b,0x10443);
test(0x1041c,0x10444);
test(0x1041d,0x10445);
test(0x1041e,0x10446);
test(0x1041f,0x10447);
test(0x10420,0x10448);
test(0x10421,0x10449);
test(0x10422,0x1044a);
test(0x10423,0x1044b);
test(0x10424,0x1044c);
test(0x10425,0x1044d);
test(0x10426,0x1044e);
test(0x10427,0x1044f);
test(0x10428,0x10400);
test(0x10429,0x10401);
test(0x1042a,0x10402);
test(0x1042b,0x10403);
test(0x1042c,0x10404);
test(0x1042d,0x10405);
test(0x1042e,0x10406);
test(0x1042f,0x10407);
test(0x10430,0x10408);
test(0x10431,0x10409);
test(0x10432,0x1040a);
test(0x10433,0x1040b);
test(0x10434,0x1040c);
test(0x10435,0x1040d);
test(0x10436,0x1040e);
test(0x10437,0x1040f);
test(0x10438,0x10410);
test(0x10439,0x10411);
test(0x1043a,0x10412);
test(0x1043b,0x10413);
test(0x1043c,0x10414);
test(0x1043d,0x10415);
test(0x1043e,0x10416);
test(0x1043f,0x10417);
test(0x10440,0x10418);
test(0x10441,0x10419);
test(0x10442,0x1041a);
test(0x10443,0x1041b);
test(0x10444,0x1041c);
test(0x10445,0x1041d);
test(0x10446,0x1041e);
test(0x10447,0x1041f);
test(0x10448,0x10420);
test(0x10449,0x10421);
test(0x1044a,0x10422);
test(0x1044b,0x10423);
test(0x1044c,0x10424);
test(0x1044d,0x10425);
test(0x1044e,0x10426);
test(0x1044f,0x10427);
test(0x10c80,0x10cc0);
test(0x10c81,0x10cc1);
test(0x10c82,0x10cc2);
test(0x10c83,0x10cc3);
test(0x10c84,0x10cc4);
test(0x10c85,0x10cc5);
test(0x10c86,0x10cc6);
test(0x10c87,0x10cc7);
test(0x10c88,0x10cc8);
test(0x10c89,0x10cc9);
test(0x10c8a,0x10cca);
test(0x10c8b,0x10ccb);
test(0x10c8c,0x10ccc);
test(0x10c8d,0x10ccd);
test(0x10c8e,0x10cce);
test(0x10c8f,0x10ccf);
test(0x10c90,0x10cd0);
test(0x10c91,0x10cd1);
test(0x10c92,0x10cd2);
test(0x10c93,0x10cd3);
test(0x10c94,0x10cd4);
test(0x10c95,0x10cd5);
test(0x10c96,0x10cd6);
test(0x10c97,0x10cd7);
test(0x10c98,0x10cd8);
test(0x10c99,0x10cd9);
test(0x10c9a,0x10cda);
test(0x10c9b,0x10cdb);
test(0x10c9c,0x10cdc);
test(0x10c9d,0x10cdd);
test(0x10c9e,0x10cde);
test(0x10c9f,0x10cdf);
test(0x10ca0,0x10ce0);
test(0x10ca1,0x10ce1);
test(0x10ca2,0x10ce2);
test(0x10ca3,0x10ce3);
test(0x10ca4,0x10ce4);
test(0x10ca5,0x10ce5);
test(0x10ca6,0x10ce6);
test(0x10ca7,0x10ce7);
test(0x10ca8,0x10ce8);
test(0x10ca9,0x10ce9);
test(0x10caa,0x10cea);
test(0x10cab,0x10ceb);
test(0x10cac,0x10cec);
test(0x10cad,0x10ced);
test(0x10cae,0x10cee);
test(0x10caf,0x10cef);
test(0x10cb0,0x10cf0);
test(0x10cb1,0x10cf1);
test(0x10cb2,0x10cf2);
test(0x10cc0,0x10c80);
test(0x10cc1,0x10c81);
test(0x10cc2,0x10c82);
test(0x10cc3,0x10c83);
test(0x10cc4,0x10c84);
test(0x10cc5,0x10c85);
test(0x10cc6,0x10c86);
test(0x10cc7,0x10c87);
test(0x10cc8,0x10c88);
test(0x10cc9,0x10c89);
test(0x10cca,0x10c8a);
test(0x10ccb,0x10c8b);
test(0x10ccc,0x10c8c);
test(0x10ccd,0x10c8d);
test(0x10cce,0x10c8e);
test(0x10ccf,0x10c8f);
test(0x10cd0,0x10c90);
test(0x10cd1,0x10c91);
test(0x10cd2,0x10c92);
test(0x10cd3,0x10c93);
test(0x10cd4,0x10c94);
test(0x10cd5,0x10c95);
test(0x10cd6,0x10c96);
test(0x10cd7,0x10c97);
test(0x10cd8,0x10c98);
test(0x10cd9,0x10c99);
test(0x10cda,0x10c9a);
test(0x10cdb,0x10c9b);
test(0x10cdc,0x10c9c);
test(0x10cdd,0x10c9d);
test(0x10cde,0x10c9e);
test(0x10cdf,0x10c9f);
test(0x10ce0,0x10ca0);
test(0x10ce1,0x10ca1);
test(0x10ce2,0x10ca2);
test(0x10ce3,0x10ca3);
test(0x10ce4,0x10ca4);
test(0x10ce5,0x10ca5);
test(0x10ce6,0x10ca6);
test(0x10ce7,0x10ca7);
test(0x10ce8,0x10ca8);
test(0x10ce9,0x10ca9);
test(0x10cea,0x10caa);
test(0x10ceb,0x10cab);
test(0x10cec,0x10cac);
test(0x10ced,0x10cad);
test(0x10cee,0x10cae);
test(0x10cef,0x10caf);
test(0x10cf0,0x10cb0);
test(0x10cf1,0x10cb1);
test(0x10cf2,0x10cb2);
test(0x118a0,0x118c0);
test(0x118a1,0x118c1);
test(0x118a2,0x118c2);
test(0x118a3,0x118c3);
test(0x118a4,0x118c4);
test(0x118a5,0x118c5);
test(0x118a6,0x118c6);
test(0x118a7,0x118c7);
test(0x118a8,0x118c8);
test(0x118a9,0x118c9);
test(0x118aa,0x118ca);
test(0x118ab,0x118cb);
test(0x118ac,0x118cc);
test(0x118ad,0x118cd);
test(0x118ae,0x118ce);
test(0x118af,0x118cf);
test(0x118b0,0x118d0);
test(0x118b1,0x118d1);
test(0x118b2,0x118d2);
test(0x118b3,0x118d3);
test(0x118b4,0x118d4);
test(0x118b5,0x118d5);
test(0x118b6,0x118d6);
test(0x118b7,0x118d7);
test(0x118b8,0x118d8);
test(0x118b9,0x118d9);
test(0x118ba,0x118da);
test(0x118bb,0x118db);
test(0x118bc,0x118dc);
test(0x118bd,0x118dd);
test(0x118be,0x118de);
test(0x118bf,0x118df);
test(0x118c0,0x118a0);
test(0x118c1,0x118a1);
test(0x118c2,0x118a2);
test(0x118c3,0x118a3);
test(0x118c4,0x118a4);
test(0x118c5,0x118a5);
test(0x118c6,0x118a6);
test(0x118c7,0x118a7);
test(0x118c8,0x118a8);
test(0x118c9,0x118a9);
test(0x118ca,0x118aa);
test(0x118cb,0x118ab);
test(0x118cc,0x118ac);
test(0x118cd,0x118ad);
test(0x118ce,0x118ae);
test(0x118cf,0x118af);
test(0x118d0,0x118b0);
test(0x118d1,0x118b1);
test(0x118d2,0x118b2);
test(0x118d3,0x118b3);
test(0x118d4,0x118b4);
test(0x118d5,0x118b5);
test(0x118d6,0x118b6);
test(0x118d7,0x118b7);
test(0x118d8,0x118b8);
test(0x118d9,0x118b9);
test(0x118da,0x118ba);
test(0x118db,0x118bb);
test(0x118dc,0x118bc);
test(0x118dd,0x118bd);
test(0x118de,0x118be);
test(0x118df,0x118bf);
if (typeof reportCompare === "function")
reportCompare(true, true);

View File

@ -324,4 +324,12 @@ UTF16Decode(size_t lead, size_t trail)
} /* namespace unicode */
} /* namespace js */
/*
 * Table of non-BMP code point ranges that have a case counterpart.
 *
 * |macro| is invoked once per row with six arguments:
 *   FROM, TO              first and last code point of the range
 *   LEAD                  UTF-16 lead surrogate shared by the whole range
 *   TRAIL_FROM, TRAIL_TO  UTF-16 trail surrogates of FROM and TO
 *   DIFF                  signed offset that, added to a code point of the
 *                         range (or to its trail surrogate), gives the
 *                         counterpart
 *
 * Rows come in pairs with opposite DIFF, one for each direction.
 */
#define FOR_EACH_NON_BMP_CASE_FOLDING(macro)                    \
    macro(0x10400, 0x10427, 0xD801, 0xDC00, 0xDC27,  0x28)      \
    macro(0x10428, 0x1044F, 0xD801, 0xDC28, 0xDC4F, -0x28)      \
    macro(0x10C80, 0x10CB2, 0xD803, 0xDC80, 0xDCB2,  0x40)      \
    macro(0x10CC0, 0x10CF2, 0xD803, 0xDCC0, 0xDCF2, -0x40)      \
    macro(0x118A0, 0x118BF, 0xD806, 0xDCA0, 0xDCBF,  0x20)      \
    macro(0x118C0, 0x118DF, 0xD806, 0xDCC0, 0xDCDF, -0x20)
#endif /* vm_Unicode_h */

View File

@ -177,9 +177,6 @@ def generate_unicode_stuff(unicode_data, case_folding,
folding_codes.add(mapping)
for code in sorted(folding_codes):
if code > MAX:
continue
if code in folding_map:
folding = folding_map[code]
else:
@ -200,6 +197,9 @@ def generate_unicode_stuff(unicode_data, case_folding,
item.append(folding)
folding_tests.append(item + rev_folding)
if code > MAX:
continue
folding_d = folding - code
rev_folding_ds = [v - code for v in rev_folding]