#393, Bug 1135377 - Part 11: Support back reference with unicode flag. r=till, f=anba

This commit is contained in:
Tooru Fujisawa 2015-12-19 04:51:21 +09:00 committed by Cameron Kaiser
parent 522d06ab34
commit 122e41a28c
5 changed files with 105 additions and 3 deletions

View File

@ -139,7 +139,8 @@ class RegExpAssertion : public RegExpTree {
END_OF_INPUT,
BOUNDARY,
NON_BOUNDARY,
NOT_AFTER_LEAD_SURROGATE
NOT_AFTER_LEAD_SURROGATE,
NOT_IN_SURROGATE_PAIR
};
explicit RegExpAssertion(AssertionType type) : assertion_type_(type) { }
virtual void* Accept(RegExpVisitor* visitor, void* data);

View File

@ -2187,6 +2187,8 @@ RegExpAssertion::ToNode(RegExpCompiler* compiler,
}
case NOT_AFTER_LEAD_SURROGATE:
return AssertionNode::NotAfterLeadSurrogate(on_success);
case NOT_IN_SURROGATE_PAIR:
return AssertionNode::NotInSurrogatePair(on_success);
default:
MOZ_CRASH("Bad assertion type");
}
@ -2999,6 +3001,40 @@ EmitNotAfterLeadSurrogate(RegExpCompiler* compiler, RegExpNode* on_success, Trac
on_success->Emit(compiler, &new_trace);
}
// Emit code asserting that the current position does not split a surrogate
// pair, i.e. that the next character is not a trail surrogate preceded by
// its lead surrogate.
static void
EmitNotInSurrogatePair(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace)
{
    RegExpMacroAssembler* masm = compiler->macro_assembler();

    // Every check that lets the assertion pass early targets this label; it
    // is bound right before the successor node is emitted.
    jit::Label holds;
    masm->CheckPosition(trace->cp_offset(), &holds);

    // The current character register is about to be overwritten with the
    // next and then the previous character, so work on a copy of the trace
    // whose cached current character has been invalidated.
    Trace inner(*trace);
    inner.InvalidateCurrentCharacter();

    if (inner.cp_offset() == 0) {
        masm->CheckAtStart(&holds);
    }

    // Step 1: look at the next character. Anything outside the trail
    // surrogate range passes the assertion.
    masm->LoadCurrentCharacter(inner.cp_offset(), inner.backtrack(), false);
    masm->CheckCharacterNotInRange(unicode::TrailSurrogateMin, unicode::TrailSurrogateMax,
                                   &holds);

    // Step 2: look at the previous character. The start-of-input case was
    // handled above, so loading one position back must be OK. A lead
    // surrogate here sends control to the backtrack label.
    masm->LoadCurrentCharacter(inner.cp_offset() - 1, inner.backtrack(), false);
    masm->CheckCharacterInRange(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax,
                                inner.backtrack());

    masm->Bind(&holds);
    on_success->Emit(compiler, &inner);
}
// Check for [0-9A-Z_a-z].
static void
EmitWordCheck(RegExpMacroAssembler* assembler,
@ -3155,6 +3191,9 @@ AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace)
case NOT_AFTER_LEAD_SURROGATE:
EmitNotAfterLeadSurrogate(compiler, on_success(), trace);
return;
case NOT_IN_SURROGATE_PAIR:
EmitNotInSurrogatePair(compiler, on_success(), trace);
return;
}
on_success()->Emit(compiler, trace);
}

View File

@ -792,7 +792,8 @@ class AssertionNode : public SeqRegExpNode
AT_BOUNDARY,
AT_NON_BOUNDARY,
AFTER_NEWLINE,
NOT_AFTER_LEAD_SURROGATE
NOT_AFTER_LEAD_SURROGATE,
NOT_IN_SURROGATE_PAIR
};
AssertionNode(AssertionType t, RegExpNode* on_success)
: SeqRegExpNode(on_success), assertion_type_(t)
@ -817,6 +818,10 @@ class AssertionNode : public SeqRegExpNode
return on_success->alloc()->newInfallible<AssertionNode>(NOT_AFTER_LEAD_SURROGATE,
on_success);
}
// Factory for an assertion node of type NOT_IN_SURROGATE_PAIR. The node is
// allocated from on_success's allocator and is constructed with on_success
// as its successor.
static AssertionNode* NotInSurrogatePair(RegExpNode* on_success) {
return on_success->alloc()->newInfallible<AssertionNode>(NOT_IN_SURROGATE_PAIR,
on_success);
}
virtual void Accept(NodeVisitor* visitor);
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);

View File

@ -1374,6 +1374,21 @@ UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool igno
return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false, false);
}
// Wrap a back reference atom for use in a unicode-flag pattern.
//
// When the text matched by the back reference ends in a standalone lead
// surrogate, that lead surrogate must not match a lead surrogate that is
// paired with a following trail surrogate. This is enforced by sequencing
// the atom with a NOT_IN_SURROGATE_PAIR assertion.
static inline RegExpTree*
UnicodeBackReferenceAtom(LifoAlloc* alloc, RegExpTree* atom)
{
    RegExpBuilder* sequence = alloc->newInfallible<RegExpBuilder>(alloc);
    sequence->AddAtom(atom);

    RegExpAssertion* notInPair =
        alloc->newInfallible<RegExpAssertion>(RegExpAssertion::NOT_IN_SURROGATE_PAIR);
    sequence->AddAssertion(notInPair);

    return sequence->ToRegExp();
}
// Disjunction ::
// Alternative
// Alternative | Disjunction
@ -1575,7 +1590,10 @@ RegExpParser<CharT>::ParseDisjunction()
break;
}
RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture);
builder->AddAtom(atom);
if (unicode_)
builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom));
else
builder->AddAtom(atom);
break;
}
if (unicode_)

View File

@ -0,0 +1,39 @@
var BUGNUMBER = 1135377;
var summary = "Implement RegExp unicode flag -- back reference should not match lead surrogate that has corresponding trail surrogate.";

print(BUGNUMBER + ": " + summary);

// Runs /foo(.+)bar\1/u against |input|. A null |expected| means the pattern
// must not match; otherwise |expected| is the array exec() has to return.
// The regexp literal is evaluated on every call, as in a hand-written
// sequence of assertions.
function checkBackReference(input, expected) {
  var result = /foo(.+)bar\1/u.exec(input);
  if (expected === null)
    assertEq(result, null);
  else
    assertEqArray(result, expected);
}

// Each entry is [input, expected], checked in order.
var backReferenceCases = [
  // The captured text does not end with a surrogate.
  ["fooAbarA\uDC00", ["fooAbarA", "A"]],
  ["fooAbarA\uD834", ["fooAbarA", "A"]],
  ["fooAbarAA", ["fooAbarA", "A"]],
  ["fooAbarA", ["fooAbarA", "A"]],

  // The captured text ends with a lead surrogate.
  ["foo\uD834bar\uD834\uDC00", null],
  ["foo\uD834bar\uD834\uD834", ["foo\uD834bar\uD834", "\uD834"]],
  ["foo\uD834bar\uD834A", ["foo\uD834bar\uD834", "\uD834"]],
  ["foo\uD834bar\uD834", ["foo\uD834bar\uD834", "\uD834"]],

  // The captured text ends with a trail surrogate.
  ["foo\uDC00bar\uDC00\uDC00", ["foo\uDC00bar\uDC00", "\uDC00"]],
  ["foo\uDC00bar\uDC00\uD834", ["foo\uDC00bar\uDC00", "\uDC00"]],
  ["foo\uDC00bar\uDC00A", ["foo\uDC00bar\uDC00", "\uDC00"]],
  ["foo\uDC00bar\uDC00", ["foo\uDC00bar\uDC00", "\uDC00"]]
];

for (var i = 0; i < backReferenceCases.length; i++)
  checkBackReference(backReferenceCases[i][0], backReferenceCases[i][1]);

// The pattern must not match only part of a surrogate pair.
assertEq(/^(.+)\1$/u.exec("\uDC00foobar\uD834\uDC00foobar\uD834"), null);

if (typeof reportCompare === "function")
  reportCompare(true, true);