mirror of
https://github.com/classilla/tenfourfox.git
synced 2024-10-08 09:58:07 +00:00
#393, Bug 1135377 - Part 11: Support back reference with unicode flag. r=till, f=anba
This commit is contained in:
parent
522d06ab34
commit
122e41a28c
@ -139,7 +139,8 @@ class RegExpAssertion : public RegExpTree {
|
||||
END_OF_INPUT,
|
||||
BOUNDARY,
|
||||
NON_BOUNDARY,
|
||||
NOT_AFTER_LEAD_SURROGATE
|
||||
NOT_AFTER_LEAD_SURROGATE,
|
||||
NOT_IN_SURROGATE_PAIR
|
||||
};
|
||||
explicit RegExpAssertion(AssertionType type) : assertion_type_(type) { }
|
||||
virtual void* Accept(RegExpVisitor* visitor, void* data);
|
||||
|
@ -2187,6 +2187,8 @@ RegExpAssertion::ToNode(RegExpCompiler* compiler,
|
||||
}
|
||||
case NOT_AFTER_LEAD_SURROGATE:
|
||||
return AssertionNode::NotAfterLeadSurrogate(on_success);
|
||||
case NOT_IN_SURROGATE_PAIR:
|
||||
return AssertionNode::NotInSurrogatePair(on_success);
|
||||
default:
|
||||
MOZ_CRASH("Bad assertion type");
|
||||
}
|
||||
@ -2999,6 +3001,40 @@ EmitNotAfterLeadSurrogate(RegExpCompiler* compiler, RegExpNode* on_success, Trac
|
||||
on_success->Emit(compiler, &new_trace);
|
||||
}
|
||||
|
||||
// Assert that the next character is not a trail surrogate that has a
|
||||
// corresponding lead surrogate.
|
||||
static void
|
||||
EmitNotInSurrogatePair(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace)
|
||||
{
|
||||
RegExpMacroAssembler* assembler = compiler->macro_assembler();
|
||||
|
||||
jit::Label ok;
|
||||
assembler->CheckPosition(trace->cp_offset(), &ok);
|
||||
|
||||
// We will be loading the next and previous characters into the current
|
||||
// character register.
|
||||
Trace new_trace(*trace);
|
||||
new_trace.InvalidateCurrentCharacter();
|
||||
|
||||
if (new_trace.cp_offset() == 0)
|
||||
assembler->CheckAtStart(&ok);
|
||||
|
||||
// First check if next character is a trail surrogate.
|
||||
assembler->LoadCurrentCharacter(new_trace.cp_offset(), new_trace.backtrack(), false);
|
||||
assembler->CheckCharacterNotInRange(unicode::TrailSurrogateMin, unicode::TrailSurrogateMax,
|
||||
&ok);
|
||||
|
||||
// Next check if previous character is a lead surrogate.
|
||||
// We already checked that we are not at the start of input so it must be
|
||||
// OK to load the previous character.
|
||||
assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, new_trace.backtrack(), false);
|
||||
assembler->CheckCharacterInRange(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax,
|
||||
new_trace.backtrack());
|
||||
|
||||
assembler->Bind(&ok);
|
||||
on_success->Emit(compiler, &new_trace);
|
||||
}
|
||||
|
||||
// Check for [0-9A-Z_a-z].
|
||||
static void
|
||||
EmitWordCheck(RegExpMacroAssembler* assembler,
|
||||
@ -3155,6 +3191,9 @@ AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace)
|
||||
case NOT_AFTER_LEAD_SURROGATE:
|
||||
EmitNotAfterLeadSurrogate(compiler, on_success(), trace);
|
||||
return;
|
||||
case NOT_IN_SURROGATE_PAIR:
|
||||
EmitNotInSurrogatePair(compiler, on_success(), trace);
|
||||
return;
|
||||
}
|
||||
on_success()->Emit(compiler, trace);
|
||||
}
|
||||
|
@ -792,7 +792,8 @@ class AssertionNode : public SeqRegExpNode
|
||||
AT_BOUNDARY,
|
||||
AT_NON_BOUNDARY,
|
||||
AFTER_NEWLINE,
|
||||
NOT_AFTER_LEAD_SURROGATE
|
||||
NOT_AFTER_LEAD_SURROGATE,
|
||||
NOT_IN_SURROGATE_PAIR
|
||||
};
|
||||
AssertionNode(AssertionType t, RegExpNode* on_success)
|
||||
: SeqRegExpNode(on_success), assertion_type_(t)
|
||||
@ -817,6 +818,10 @@ class AssertionNode : public SeqRegExpNode
|
||||
return on_success->alloc()->newInfallible<AssertionNode>(NOT_AFTER_LEAD_SURROGATE,
|
||||
on_success);
|
||||
}
|
||||
// Create a NOT_IN_SURROGATE_PAIR assertion node that continues with
// |on_success|.  The node is obtained from |on_success|'s allocator via
// newInfallible.
static AssertionNode* NotInSurrogatePair(RegExpNode* on_success)
{
    return on_success->alloc()
                     ->newInfallible<AssertionNode>(NOT_IN_SURROGATE_PAIR, on_success);
}
|
||||
virtual void Accept(NodeVisitor* visitor);
|
||||
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
|
||||
virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
|
||||
|
@ -1374,6 +1374,21 @@ UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool igno
|
||||
return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false, false);
|
||||
}
|
||||
|
||||
// Build the tree used for a back reference when the unicode flag is set: the
// back reference |atom| followed by a NOT_IN_SURROGATE_PAIR assertion.
//
// The assertion is needed because a captured string may end with a standalone
// lead surrogate; that lead surrogate must not match a lead surrogate in the
// input that is paired with a corresponding trail surrogate.
static inline RegExpTree*
UnicodeBackReferenceAtom(LifoAlloc* alloc, RegExpTree* atom)
{
    RegExpBuilder* sequence = alloc->newInfallible<RegExpBuilder>(alloc);
    sequence->AddAtom(atom);

    RegExpAssertion* not_in_pair =
        alloc->newInfallible<RegExpAssertion>(RegExpAssertion::NOT_IN_SURROGATE_PAIR);
    sequence->AddAssertion(not_in_pair);

    return sequence->ToRegExp();
}
|
||||
|
||||
// Disjunction ::
|
||||
// Alternative
|
||||
// Alternative | Disjunction
|
||||
@ -1575,6 +1590,9 @@ RegExpParser<CharT>::ParseDisjunction()
|
||||
break;
|
||||
}
|
||||
RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture);
|
||||
if (unicode_)
|
||||
builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom));
|
||||
else
|
||||
builder->AddAtom(atom);
|
||||
break;
|
||||
}
|
||||
|
39
js/src/tests/ecma_6/RegExp/unicode-back-reference.js
Normal file
39
js/src/tests/ecma_6/RegExp/unicode-back-reference.js
Normal file
@ -0,0 +1,39 @@
|
||||
var BUGNUMBER = 1135377;
var summary = "Implement RegExp unicode flag -- back reference should not match lead surrogate that has corresponding trail surrogate.";

print(BUGNUMBER + ": " + summary);

// Execute /foo(.+)bar\1/u on |input| and check the result.  |expected| is
// either null (the pattern must not match) or the expected result array
// [whole match, capture 1].
function checkBackReference(input, expected) {
    var result = /foo(.+)bar\1/u.exec(input);
    if (expected === null)
        assertEq(result, null);
    else
        assertEqArray(result, expected);
}

// The captured text ends with a non-surrogate character.
checkBackReference("fooAbarA\uDC00", ["fooAbarA", "A"]);
checkBackReference("fooAbarA\uD834", ["fooAbarA", "A"]);
checkBackReference("fooAbarAA", ["fooAbarA", "A"]);
checkBackReference("fooAbarA", ["fooAbarA", "A"]);

// The captured text ends with a lead surrogate: it must not match a lead
// surrogate that is immediately followed by a trail surrogate.
checkBackReference("foo\uD834bar\uD834\uDC00", null);
checkBackReference("foo\uD834bar\uD834\uD834", ["foo\uD834bar\uD834", "\uD834"]);
checkBackReference("foo\uD834bar\uD834A", ["foo\uD834bar\uD834", "\uD834"]);
checkBackReference("foo\uD834bar\uD834", ["foo\uD834bar\uD834", "\uD834"]);

// The captured text ends with a trail surrogate.
checkBackReference("foo\uDC00bar\uDC00\uDC00", ["foo\uDC00bar\uDC00", "\uDC00"]);
checkBackReference("foo\uDC00bar\uDC00\uD834", ["foo\uDC00bar\uDC00", "\uDC00"]);
checkBackReference("foo\uDC00bar\uDC00A", ["foo\uDC00bar\uDC00", "\uDC00"]);
checkBackReference("foo\uDC00bar\uDC00", ["foo\uDC00bar\uDC00", "\uDC00"]);

// The pattern must not match only part of a surrogate pair.
assertEq(/^(.+)\1$/u.exec("\uDC00foobar\uD834\uDC00foobar\uD834"), null);

if (typeof reportCompare === "function")
    reportCompare(true, true);
|
Loading…
Reference in New Issue
Block a user