#393, Bug 1135377 - Part 4: Support everything Atom in RegExp with unicode flag. r=till, f=anba

This commit is contained in:
Tooru Fujisawa 2015-08-07 08:11:52 +09:00 committed by Cameron Kaiser
parent 68f44ec410
commit 4e924a688e
2 changed files with 98 additions and 0 deletions

View File

@ -1178,6 +1178,41 @@ TrailSurrogateAtom(LifoAlloc* alloc, char16_t value)
return builder->ToRegExp();
}
// Build the tree used for '.' when the unicode flag is set: a disjunction of
// four alternatives that together match one code point, excluding the line
// terminators \x0a, \x0d, \u2028 and \u2029.
static inline RegExpTree*
UnicodeEverythingAtom(LifoAlloc* alloc)
{
    RegExpBuilder* alternatives = alloc->newInfallible<RegExpBuilder>(alloc);

    // Alternative 1: a single non-surrogate code unit that is not one of
    // \x0a, \x0d, \u2028 or \u2029. The surrogate block itself is left out
    // here and handled by the alternatives below.
    CharacterRangeVector* nonSurrogates = alloc->newInfallible<CharacterRangeVector>(*alloc);
    nonSurrogates->append(CharacterRange::Range(0x0, 0x09));
    nonSurrogates->append(CharacterRange::Range(0x0b, 0x0c));
    nonSurrogates->append(CharacterRange::Range(0x0e, 0x2027));
    nonSurrogates->append(CharacterRange::Range(0x202A, unicode::LeadSurrogateMin - 1));
    nonSurrogates->append(CharacterRange::Range(unicode::TrailSurrogateMax + 1,
                                                unicode::UTF16Max));
    RegExpCharacterClass* nonSurrogateClass =
        alloc->newInfallible<RegExpCharacterClass>(nonSurrogates, false);
    alternatives->AddAtom(nonSurrogateClass);

    // Alternative 2: a lead surrogate with no trail surrogate right after it.
    alternatives->NewAlternative();
    alternatives->AddAtom(RangeAtom(alloc,
                                    unicode::LeadSurrogateMin,
                                    unicode::LeadSurrogateMax));
    alternatives->AddAtom(NegativeLookahead(alloc,
                                            unicode::TrailSurrogateMin,
                                            unicode::TrailSurrogateMax));

    // Alternative 3: a trail surrogate that does not come right after a lead
    // surrogate.
    alternatives->NewAlternative();
    RegExpAssertion* notAfterLead =
        alloc->newInfallible<RegExpAssertion>(RegExpAssertion::NOT_AFTER_LEAD_SURROGATE);
    alternatives->AddAssertion(notAfterLead);
    alternatives->AddAtom(RangeAtom(alloc,
                                    unicode::TrailSurrogateMin,
                                    unicode::TrailSurrogateMax));

    // Alternative 4: a lead surrogate immediately followed by a trail
    // surrogate, i.e. a full surrogate pair.
    alternatives->NewAlternative();
    alternatives->AddAtom(RangeAtom(alloc,
                                    unicode::LeadSurrogateMin,
                                    unicode::LeadSurrogateMax));
    alternatives->AddAtom(RangeAtom(alloc,
                                    unicode::TrailSurrogateMin,
                                    unicode::TrailSurrogateMax));

    return alternatives->ToRegExp();
}
// Disjunction ::
// Alternative
// Alternative | Disjunction
@ -1275,6 +1310,10 @@ RegExpParser<CharT>::ParseDisjunction()
case '.': {
Advance();
// everything except \x0a, \x0d, \u2028 and \u2029
if (unicode_) {
builder->AddAtom(UnicodeEverythingAtom(alloc));
break;
}
CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
CharacterRange::AddClassEscape(alloc, '.', ranges);
RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);

View File

@ -0,0 +1,59 @@
var BUGNUMBER = 1135377;
var summary = "Implement RegExp unicode flag -- everything Atom.";

print(BUGNUMBER + ": " + summary);

// Each entry is [pattern, input, expected exec() result]. The entries are
// checked in order with assertEqArray.
var testCases = [
  // ==== standalone ====

  // Plain BMP character and a full astral code point.
  [/./u, "ABC", ["A"]],
  [/./u, "\u{1F438}BC", ["\u{1F438}"]],

  // Lead surrogate \uD83D followed by various code units: only a following
  // trail surrogate (\uDC00-\uDFFF) is consumed together with it.
  [/./u, "\uD83D\uDBFF", ["\uD83D"]],
  [/./u, "\uD83D\uDC00", ["\uD83D\uDC00"]],
  [/./u, "\uD83D\uDFFF", ["\uD83D\uDFFF"]],
  [/./u, "\uD83D\uE000", ["\uD83D"]],
  [/./u, "\uD83D", ["\uD83D"]],
  [/./u, "\uD83DA", ["\uD83D"]],

  // Trail surrogate \uDC38 preceded by various code units: only a preceding
  // lead surrogate (\uD800-\uDBFF) forms a pair with it.
  [/./u, "\uD7FF\uDC38", ["\uD7FF"]],
  [/./u, "\uD800\uDC38", ["\uD800\uDC38"]],
  [/./u, "\uDBFF\uDC38", ["\uDBFF\uDC38"]],
  [/./u, "\uDC00\uDC38", ["\uDC00"]],
  [/./u, "\uDC38", ["\uDC38"]],
  [/./u, "A\uDC38", ["A"]],

  // Same preceding code units, with a literal "A" required after the atom.
  [/.A/u, "\uD7FF\uDC38A", ["\uDC38A"]],
  [/.A/u, "\uD800\uDC38A", ["\uD800\uDC38A"]],
  [/.A/u, "\uDBFF\uDC38A", ["\uDBFF\uDC38A"]],
  [/.A/u, "\uDC00\uDC38A", ["\uDC38A"]],

  // ==== leading multiple ====
  [/.*A/u, "\u{1F438}\u{1F438}\u{1F438}A", ["\u{1F438}\u{1F438}\u{1F438}A"]],

  // ==== trailing multiple ====
  [/A.*/u, "A\u{1F438}\u{1F438}\u{1F438}", ["A\u{1F438}\u{1F438}\u{1F438}"]],
];

testCases.forEach(function (testCase) {
  var pattern = testCase[0];
  var input = testCase[1];
  var expected = testCase[2];
  assertEqArray(pattern.exec(input), expected);
});

if (typeof reportCompare === "function")
  reportCompare(true, true);