From 522d06ab342064fecf88cc68e53795561204ecc1 Mon Sep 17 00:00:00 2001
From: Tooru Fujisawa
Date: Sat, 19 Dec 2015 04:51:20 +0900
Subject: [PATCH] #393, Bug 1135377 - Part 10: Decrement index when it points trail surrogate that has corresponding lead surrogate. r=till, f=anba

---
 js/src/builtin/RegExp.cpp                   | 52 +++++++++++++++++++++
 js/src/tests/ecma_6/RegExp/unicode-index.js | 17 +++++++
 2 files changed, 69 insertions(+)
 create mode 100644 js/src/tests/ecma_6/RegExp/unicode-index.js

diff --git a/js/src/builtin/RegExp.cpp b/js/src/builtin/RegExp.cpp
index 909f58a0a..ea9aa17de 100644
--- a/js/src/builtin/RegExp.cpp
+++ b/js/src/builtin/RegExp.cpp
@@ -14,12 +14,14 @@
 #include "jit/InlinableNatives.h"
 #include "vm/RegExpStatics.h"
 #include "vm/StringBuffer.h"
+#include "vm/Unicode.h"
 
 #include "jsobjinlines.h"
 
 #include "vm/NativeObject-inl.h"
 
 using namespace js;
+using namespace js::unicode;
 
 using mozilla::ArrayLength;
 using mozilla::Maybe;
@@ -758,6 +760,29 @@ SetLastIndex(JSContext* cx, Handle<RegExpObject*> reobj, double lastIndex)
     return true;
 }
 
+template <typename CharT>
+static bool
+IsTrailSurrogateWithLeadSurrogateImpl(JSContext* cx, HandleLinearString input, size_t index)
+{
+    JS::AutoCheckCannotGC nogc;
+    MOZ_ASSERT(index > 0 && index < input->length());
+    const CharT* inputChars = input->chars<CharT>(nogc);
+
+    return unicode::IsTrailSurrogate(inputChars[index]) &&
+           unicode::IsLeadSurrogate(inputChars[index - 1]);
+}
+
+static bool
+IsTrailSurrogateWithLeadSurrogate(JSContext* cx, HandleLinearString input, int32_t index)
+{
+    if (index <= 0 || size_t(index) >= input->length())
+        return false;
+
+    return input->hasLatin1Chars()
+           ? IsTrailSurrogateWithLeadSurrogateImpl<Latin1Char>(cx, input, index)
+           : IsTrailSurrogateWithLeadSurrogateImpl<char16_t>(cx, input, index);
+}
+
 /* ES6 final draft 21.2.5.2.2. */
 RegExpRunStatus
 js::ExecuteRegExp(JSContext* cx, HandleObject regexp, HandleString string,
@@ -840,6 +865,33 @@ js::ExecuteRegExp(JSContext* cx, HandleObject regexp, HandleString string,
         return RegExpRunStatus_Success_NotFound;
     }
 
+    /* Steps 12-13. */
+    if (reobj->unicode()) {
+        /*
+         * ES6 21.2.2.2 step 2.
+         *   Let listIndex be the index into Input of the character that was
+         *   obtained from element index of str.
+         *
+         * In the spec, pattern match is performed with decoded Unicode code
+         * points, but our implementation performs it with UTF-16 encoded
+         * string. In step 2, we should decrement searchIndex (index) if it
+         * points the trail surrogate that has corresponding lead surrogate.
+         *
+         *   var r = /\uD83D\uDC38/ug;
+         *   r.lastIndex = 1;
+         *   var str = "\uD83D\uDC38";
+         *   var result = r.exec(str); // pattern match starts from index 0
+         *   print(result.index);      // prints 0
+         *
+         * Note: this doesn't match the current spec text and result in
+         * different values for `result.index` under certain conditions.
+         * However, the spec will change to match our implementation's
+         * behavior. See https://github.com/tc39/ecma262/issues/128.
+         */
+        if (IsTrailSurrogateWithLeadSurrogate(cx, input, searchIndex))
+            searchIndex--;
+    }
+
     /* Step 14-29. */
     RegExpRunStatus status = ExecuteRegExpImpl(cx, res, *re, input, searchIndex, matches);
     if (status == RegExpRunStatus_Error)
diff --git a/js/src/tests/ecma_6/RegExp/unicode-index.js b/js/src/tests/ecma_6/RegExp/unicode-index.js
new file mode 100644
index 000000000..a4b2eb203
--- /dev/null
+++ b/js/src/tests/ecma_6/RegExp/unicode-index.js
@@ -0,0 +1,17 @@
+var BUGNUMBER = 1135377;
+var summary = "Implement RegExp unicode flag -- Pattern match should start from lead surrogate when lastIndex points corresponding trail surrogate.";
+
+print(BUGNUMBER + ": " + summary);
+
+var r = /\uD83D\uDC38/ug;
+r.lastIndex = 1;
+var str = "\uD83D\uDC38";
+var result = r.exec(str);
+assertEq(result.length, 1);
+assertEq(result[0], "\uD83D\uDC38");
+
+// This does not match to ES6 spec, but the spec will be changed.
+assertEq(result.index, 0);
+
+if (typeof reportCompare === "function")
+    reportCompare(true, true);