From 122e41a28c4c66f53d5e92694c37fe6b0ce3c78a Mon Sep 17 00:00:00 2001 From: Tooru Fujisawa Date: Sat, 19 Dec 2015 04:51:21 +0900 Subject: [PATCH] #393, Bug 1135377 - Part 11: Support back reference with unicode flag. r=till, f=anba --- js/src/irregexp/RegExpAST.h | 3 +- js/src/irregexp/RegExpEngine.cpp | 39 +++++++++++++++++++ js/src/irregexp/RegExpEngine.h | 7 +++- js/src/irregexp/RegExpParser.cpp | 20 +++++++++- .../ecma_6/RegExp/unicode-back-reference.js | 39 +++++++++++++++++++ 5 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 js/src/tests/ecma_6/RegExp/unicode-back-reference.js diff --git a/js/src/irregexp/RegExpAST.h b/js/src/irregexp/RegExpAST.h index 1bafe91d1..ae76f64d4 100644 --- a/js/src/irregexp/RegExpAST.h +++ b/js/src/irregexp/RegExpAST.h @@ -139,7 +139,8 @@ class RegExpAssertion : public RegExpTree { END_OF_INPUT, BOUNDARY, NON_BOUNDARY, - NOT_AFTER_LEAD_SURROGATE + NOT_AFTER_LEAD_SURROGATE, + NOT_IN_SURROGATE_PAIR }; explicit RegExpAssertion(AssertionType type) : assertion_type_(type) { } virtual void* Accept(RegExpVisitor* visitor, void* data); diff --git a/js/src/irregexp/RegExpEngine.cpp b/js/src/irregexp/RegExpEngine.cpp index 82bd34881..dfb29b0c2 100644 --- a/js/src/irregexp/RegExpEngine.cpp +++ b/js/src/irregexp/RegExpEngine.cpp @@ -2187,6 +2187,8 @@ RegExpAssertion::ToNode(RegExpCompiler* compiler, } case NOT_AFTER_LEAD_SURROGATE: return AssertionNode::NotAfterLeadSurrogate(on_success); + case NOT_IN_SURROGATE_PAIR: + return AssertionNode::NotInSurrogatePair(on_success); default: MOZ_CRASH("Bad assertion type"); } @@ -2999,6 +3001,40 @@ EmitNotAfterLeadSurrogate(RegExpCompiler* compiler, RegExpNode* on_success, Trac on_success->Emit(compiler, &new_trace); } +// Assert that the next character is not a trail surrogate that has a +// corresponding lead surrogate. 
+static void
+EmitNotInSurrogatePair(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace)
+{
+    RegExpMacroAssembler* assembler = compiler->macro_assembler();
+    // |ok| is the success exit; CheckPosition presumably takes it at end of input (confirm).
+    jit::Label ok;
+    assembler->CheckPosition(trace->cp_offset(), &ok);
+
+    // We will be loading the next and previous characters into the current
+    // character register, so work on a copy of the trace with that register invalidated.
+    Trace new_trace(*trace);
+    new_trace.InvalidateCurrentCharacter();
+    // Only at offset 0 could the previous character fall before the start of the input.
+    if (new_trace.cp_offset() == 0)
+        assembler->CheckAtStart(&ok);
+
+    // First check if next character is a trail surrogate; if it is not, the assertion holds.
+    assembler->LoadCurrentCharacter(new_trace.cp_offset(), new_trace.backtrack(), false);
+    assembler->CheckCharacterNotInRange(unicode::TrailSurrogateMin, unicode::TrailSurrogateMax,
+                                        &ok);
+
+    // Next check if previous character is a lead surrogate; if so, backtrack.
+    // We already checked that we are not at the start of input so it must be
+    // OK to load the previous character.
+    assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, new_trace.backtrack(), false);
+    assembler->CheckCharacterInRange(unicode::LeadSurrogateMin, unicode::LeadSurrogateMax,
+                                     new_trace.backtrack());
+    // All accepting paths meet here; the continuation is emitted with the copied trace.
+    assembler->Bind(&ok);
+    on_success->Emit(compiler, &new_trace);
+}
+
 // Check for [0-9A-Z_a-z].
static void
 EmitWordCheck(RegExpMacroAssembler* assembler,
@@ -3155,6 +3191,9 @@ AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace)
       case NOT_AFTER_LEAD_SURROGATE:
         EmitNotAfterLeadSurrogate(compiler, on_success(), trace);
         return;
+      case NOT_IN_SURROGATE_PAIR:
+        EmitNotInSurrogatePair(compiler, on_success(), trace);
+        return;
     }
     on_success()->Emit(compiler, trace);
 }
diff --git a/js/src/irregexp/RegExpEngine.h b/js/src/irregexp/RegExpEngine.h
index 356366f61..ca315c2ad 100644
--- a/js/src/irregexp/RegExpEngine.h
+++ b/js/src/irregexp/RegExpEngine.h
@@ -792,7 +792,8 @@ class AssertionNode : public SeqRegExpNode
         AT_BOUNDARY,
         AT_NON_BOUNDARY,
         AFTER_NEWLINE,
-        NOT_AFTER_LEAD_SURROGATE
+        NOT_AFTER_LEAD_SURROGATE,
+        NOT_IN_SURROGATE_PAIR
     };
     AssertionNode(AssertionType t, RegExpNode* on_success)
       : SeqRegExpNode(on_success), assertion_type_(t)
@@ -817,6 +818,10 @@ class AssertionNode : public SeqRegExpNode
         return on_success->alloc()->newInfallible<AssertionNode>(NOT_AFTER_LEAD_SURROGATE,
                                                                  on_success);
     }
+    static AssertionNode* NotInSurrogatePair(RegExpNode* on_success) {
+        return on_success->alloc()->newInfallible<AssertionNode>(NOT_IN_SURROGATE_PAIR,
+                                                                 on_success);
+    }
     virtual void Accept(NodeVisitor* visitor);
     virtual void Emit(RegExpCompiler* compiler, Trace* trace);
     virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp
index f5ecda260..1874368a5 100644
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -1374,6 +1374,21 @@ UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool igno
     return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false, false);
 }
 
+static inline RegExpTree*
+UnicodeBackReferenceAtom(LifoAlloc* alloc, RegExpTree* atom)
+{
+    // If a back reference has a standalone lead surrogate as its last
+    // character, then that lead surrogate shouldn't match lead surrogates that
+    // are paired with a corresponding trail surrogate.
+    RegExpBuilder* builder = alloc->newInfallible<RegExpBuilder>(alloc);
+    // Sequence the back reference with an assertion that the match did not end inside a pair.
+    builder->AddAtom(atom);
+    builder->AddAssertion(alloc->newInfallible<RegExpAssertion>(
+        RegExpAssertion::NOT_IN_SURROGATE_PAIR));
+
+    return builder->ToRegExp();
+}
+
 // Disjunction ::
 //   Alternative
 //   Alternative | Disjunction
@@ -1575,7 +1590,10 @@ RegExpParser::ParseDisjunction()
                         break;
                     }
                     RegExpTree* atom = alloc->newInfallible<RegExpBackReference>(capture);
-                    builder->AddAtom(atom);
+                    if (unicode_)
+                        builder->AddAtom(UnicodeBackReferenceAtom(alloc, atom));
+                    else
+                        builder->AddAtom(atom);
                     break;
                 }
                 if (unicode_)
diff --git a/js/src/tests/ecma_6/RegExp/unicode-back-reference.js b/js/src/tests/ecma_6/RegExp/unicode-back-reference.js
new file mode 100644
index 000000000..2a65432a1
--- /dev/null
+++ b/js/src/tests/ecma_6/RegExp/unicode-back-reference.js
@@ -0,0 +1,39 @@
+var BUGNUMBER = 1135377;
+var summary = "Implement RegExp unicode flag -- back reference should not match lead surrogate that has corresponding trail surrogate.";
+
+print(BUGNUMBER + ": " + summary);
+
+// The last character of back reference is not a surrogate.
+assertEqArray(/foo(.+)bar\1/u.exec("fooAbarA\uDC00"),
+              ["fooAbarA", "A"]);
+assertEqArray(/foo(.+)bar\1/u.exec("fooAbarA\uD834"),
+              ["fooAbarA", "A"]);
+assertEqArray(/foo(.+)bar\1/u.exec("fooAbarAA"),
+              ["fooAbarA", "A"]);
+assertEqArray(/foo(.+)bar\1/u.exec("fooAbarA"),
+              ["fooAbarA", "A"]);
+
+// The last character of back reference is a lead surrogate.
+assertEq(/foo(.+)bar\1/u.exec("foo\uD834bar\uD834\uDC00"), null);
+assertEqArray(/foo(.+)bar\1/u.exec("foo\uD834bar\uD834\uD834"),
+              ["foo\uD834bar\uD834", "\uD834"]);
+assertEqArray(/foo(.+)bar\1/u.exec("foo\uD834bar\uD834A"),
+              ["foo\uD834bar\uD834", "\uD834"]);
+assertEqArray(/foo(.+)bar\1/u.exec("foo\uD834bar\uD834"),
+              ["foo\uD834bar\uD834", "\uD834"]);
+
+// The last character of back reference is a trail surrogate.
+assertEqArray(/foo(.+)bar\1/u.exec("foo\uDC00bar\uDC00\uDC00"),
+              ["foo\uDC00bar\uDC00", "\uDC00"]);
+assertEqArray(/foo(.+)bar\1/u.exec("foo\uDC00bar\uDC00\uD834"),
+              ["foo\uDC00bar\uDC00", "\uDC00"]);
+assertEqArray(/foo(.+)bar\1/u.exec("foo\uDC00bar\uDC00A"),
+              ["foo\uDC00bar\uDC00", "\uDC00"]);
+assertEqArray(/foo(.+)bar\1/u.exec("foo\uDC00bar\uDC00"),
+              ["foo\uDC00bar\uDC00", "\uDC00"]);
+
+// The pattern must not match only part of a surrogate pair.
+assertEq(/^(.+)\1$/u.exec("\uDC00foobar\uD834\uDC00foobar\uD834"), null);
+
+if (typeof reportCompare === "function")
+    reportCompare(true, true);