From c05db4075d27189cb653cea2553c48333d5a879e Mon Sep 17 00:00:00 2001
From: Tooru Fujisawa <arai_a@mac.com>
Date: Fri, 7 Aug 2015 08:12:51 +0900
Subject: [PATCH] #393, Bug 1135377 - Part 8: Disallow extended pattern in
 RegExp with unicode flag. r=till, f=anba

---
 js/src/irregexp/RegExpParser.cpp              |  95 ++++++++++++--
 js/src/js.msg                                 |   4 +
 .../RegExp/unicode-disallow-extended.js       | 117 ++++++++++++++++++
 js/src/vm/Xdr.h                               |   4 +-
 4 files changed, 207 insertions(+), 13 deletions(-)
 create mode 100644 js/src/tests/ecma_6/RegExp/unicode-disallow-extended.js
diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp
index 52ee9942c..f5ecda260 100644
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -405,6 +405,31 @@ NegativeLookahead(LifoAlloc* alloc, char16_t from, char16_t to)
     return alloc->newInfallible<RegExpLookahead>(RangeAtom(alloc, from, to), false, 0, 0);
 }
 
+// Reports whether |c| is a character that a unicode-mode pattern may
+// still escape with a plain backslash (an identity escape).
+//
+// The accepted set is the pattern punctuation ^ $ \ . * + ? ( ) [ ] { } |
+// together with '/'. Any other value yields false.
+//
+// Used by the unicode-mode escape checks to reject extended identity
+// escapes.
+static bool
+IsSyntaxCharacter(widechar c)
+{
+    // Assertion anchors and the escape introducer itself.
+    const bool isAnchorOrEscape = c == '^' || c == '$' || c == '\\';
+
+    // Wildcard, quantifiers, alternation and the pattern delimiter.
+    const bool isOperator = c == '.' || c == '*' || c == '+' || c == '?' ||
+                            c == '|' || c == '/';
+
+    // Group, class and interval-quantifier brackets.
+    const bool isBracket = c == '(' || c == ')' || c == '[' || c == ']' ||
+                           c == '{' || c == '}';
+
+    return isAnchorOrEscape || isOperator || isBracket;
+}
+
 #ifdef DEBUG
 // Currently only used in an assert.kASSERT.
 static bool
@@ -459,16 +484,23 @@ RegExpParser<CharT>::ParseClassCharacterEscape(widechar* code)
         widechar controlLetter = Next();
         widechar letter = controlLetter & ~('A' ^ 'a');
         // For compatibility with JSC, inside a character class
-        // we also accept digits and underscore as control characters.
-        if ((controlLetter >= '0' && controlLetter <= '9') ||
-            controlLetter == '_' ||
-            (letter >= 'A' && letter <= 'Z')) {
+        // we also accept digits and underscore as control characters,
+        // but only in non-unicode mode
+        if ((!unicode_ &&
+             ((controlLetter >= '0' && controlLetter <= '9') ||
+              controlLetter == '_')) ||
+            (letter >= 'A' && letter <= 'Z'))
+        {
             Advance(2);
             // Control letters mapped to ASCII control characters in the range
             // 0x00-0x1f.
             *code = controlLetter & 0x1f;
             return true;
         }
+        if (unicode_) {
+            ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+            return false;
+        }
         // We match JSC in reading the backslash as a literal
         // character instead of as starting an escape.
         *code = '\\';
@@ -476,9 +508,30 @@ RegExpParser<CharT>::ParseClassCharacterEscape(widechar* code)
       }
       case '0': case '1': case '2': case '3': case '4': case '5':
       case '6': case '7':
-        // For compatibility, we interpret a decimal escape that isn't
-        // a back reference (and therefore either \0 or not valid according
-        // to the specification) as a 1..3 digit octal character code.
+        if (unicode_) {
+            if (current() == '0') {
+                // Consume the '0', as the other escape cases do for what
+                // they parse. Leaving it in place would make the class
+                // parser read it again as a literal '0' member, so that
+                // /[\0]/u would also match "0".
+                Advance();
+                // As for \0 outside a class, a decimal digit may not
+                // follow it in unicode mode.
+                if (current() >= '0' && current() <= '9') {
+                    ReportError(JSMSG_INVALID_DECIMAL_ESCAPE);
+                    return false;
+                }
+                *code = 0;
+                return true;
+            }
+            // \1 .. \7 are legacy octal escapes, which unicode mode rejects.
+            ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+            return false;
+        }
+        // For compatibility, outside of unicode mode, we interpret a decimal
+        // escape that isn't a back reference (and therefore either \0 or not
+        // valid according to the specification) as a 1..3 digit octal
+        // character code.
         *code = ParseOctalLiteral();
         return true;
       case 'x': {
@@ -488,8 +529,12 @@ RegExpParser<CharT>::ParseClassCharacterEscape(widechar* code)
             *code = value;
             return true;
         }
+        if (unicode_) {
+            ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+            return false;
+        }
         // If \x is not followed by a two-digit hexadecimal, treat it
-        // as an identity escape.
+        // as an identity escape in non-unicode mode.
         *code = 'x';
         return true;
       }
@@ -527,10 +572,14 @@ RegExpParser<CharT>::ParseClassCharacterEscape(widechar* code)
         return true;
       }
       default: {
-        // Extended identity escape. We accept any character that hasn't
-        // been matched by a more specific case, not just the subset required
-        // by the ECMAScript specification.
+        // Extended identity escape (non-unicode only). We accept any character
+        // that hasn't been matched by a more specific case, not just the subset
+        // required by the ECMAScript specification.
         widechar result = current();
+        if (unicode_ && result != '-' && !IsSyntaxCharacter(result)) {
+            ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
+            return false;
+        }
         Advance();
         *code = result;
         return true;
@@ -1388,6 +1437,8 @@ RegExpParser<CharT>::ParseDisjunction()
                                                    capture_index);
             }
             builder->AddAtom(body);
+            if (unicode_ && (group_type == POSITIVE_LOOKAHEAD || group_type == NEGATIVE_LOOKAHEAD))
+                continue;
             // For compatability with JSC and ES3, we allow quantifiers after
             // lookaheads, and break in all cases.
             break;
@@ -1527,6 +1578,8 @@ RegExpParser<CharT>::ParseDisjunction()
                     builder->AddAtom(atom);
                     break;
                 }
+                if (unicode_)
+                    return ReportError(JSMSG_BACK_REF_OUT_OF_RANGE);
                 widechar first_digit = Next();
                 if (first_digit == '8' || first_digit == '9') {
                     // Treat as identity escape
@@ -1537,6 +1590,14 @@ RegExpParser<CharT>::ParseDisjunction()
               }
                 // FALLTHROUGH
               case '0': {
+                if (unicode_) {
+                    Advance(2);
+                    if (IsDecimalDigit(current()))
+                        return ReportError(JSMSG_INVALID_DECIMAL_ESCAPE);
+                    builder->AddCharacter(0);
+                    break;
+                }
+
                 Advance();
                 size_t octal = ParseOctalLiteral();
                 builder->AddCharacter(octal);
@@ -1571,6 +1632,8 @@ RegExpParser<CharT>::ParseDisjunction()
                 // Convert lower case letters to uppercase.
                 widechar letter = controlLetter & ~('a' ^ 'A');
                 if (letter < 'A' || 'Z' < letter) {
+                    if (unicode_)
+                        return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
                     // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
                     // This is outside the specification. We match JSC in
                     // reading the backslash as a literal character instead
@@ -1588,6 +1651,8 @@ RegExpParser<CharT>::ParseDisjunction()
                 if (ParseHexEscape(2, &value)) {
                     builder->AddCharacter(value);
                 } else {
+                    if (unicode_)
+                        return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
                     builder->AddCharacter('x');
                 }
                 break;
@@ -1639,12 +1704,16 @@ RegExpParser<CharT>::ParseDisjunction()
               }
               default:
                 // Identity escape.
+                if (unicode_ && !IsSyntaxCharacter(Next()))
+                    return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE);
                 builder->AddCharacter(Next());
                 Advance(2);
                 break;
             }
             break;
           case '{': {
+            if (unicode_)
+                return ReportError(JSMSG_RAW_BRACE_IN_REGEP);
             int dummy;
             if (ParseIntervalQuantifier(&dummy, &dummy))
                 return ReportError(JSMSG_NOTHING_TO_REPEAT);
@@ -1661,6 +1730,10 @@ RegExpParser<CharT>::ParseDisjunction()
                         builder->AddAtom(LeadSurrogateAtom(alloc, c));
                     else if (unicode::IsTrailSurrogate(c))
                         builder->AddAtom(TrailSurrogateAtom(alloc, c));
+                    else if (c == ']')
+                        return ReportError(JSMSG_RAW_BRACKET_IN_REGEP);
+                    else if (c == '}')
+                        return ReportError(JSMSG_RAW_BRACE_IN_REGEP);
                     else
                         builder->AddCharacter(c);
                     Advance();
diff --git a/js/src/js.msg b/js/src/js.msg
index 5b163ef3a..a8e94a693 100644
--- a/js/src/js.msg
+++ b/js/src/js.msg
@@ -447,8 +447,10 @@ MSG_DEF(JSMSG_INVALID_TIME_ZONE,       1, JSEXN_RANGEERR, "invalid time zone in
 MSG_DEF(JSMSG_UNDEFINED_CURRENCY,      0, JSEXN_TYPEERR, "undefined currency in NumberFormat() with currency style")
 
 // RegExp
+MSG_DEF(JSMSG_BACK_REF_OUT_OF_RANGE,   0, JSEXN_SYNTAXERR, "back reference out of range in regular expression")
 MSG_DEF(JSMSG_BAD_CLASS_RANGE,         0, JSEXN_SYNTAXERR, "invalid range in character class")
 MSG_DEF(JSMSG_ESCAPE_AT_END_OF_REGEXP, 0, JSEXN_SYNTAXERR, "\\ at end of pattern")
+MSG_DEF(JSMSG_INVALID_DECIMAL_ESCAPE, 0, JSEXN_SYNTAXERR, "invalid decimal escape in regular expression")
 MSG_DEF(JSMSG_INVALID_GROUP,           0, JSEXN_SYNTAXERR, "invalid regexp group")
 MSG_DEF(JSMSG_INVALID_IDENTITY_ESCAPE, 0, JSEXN_SYNTAXERR, "invalid identity escape in regular expression")
 MSG_DEF(JSMSG_INVALID_UNICODE_ESCAPE,  0, JSEXN_SYNTAXERR, "invalid unicode escape in regular expression")
@@ -457,6 +459,8 @@ MSG_DEF(JSMSG_NEWREGEXP_FLAGGED,       0, JSEXN_TYPEERR, "can't supply flags whe
 MSG_DEF(JSMSG_NOTHING_TO_REPEAT,       0, JSEXN_SYNTAXERR, "nothing to repeat")
 MSG_DEF(JSMSG_NUMBERS_OUT_OF_ORDER,    0, JSEXN_SYNTAXERR, "numbers out of order in {} quantifier.")
 MSG_DEF(JSMSG_RANGE_WITH_CLASS_ESCAPE, 0, JSEXN_SYNTAXERR, "character class escape cannot be used in class range in regular expression")
+MSG_DEF(JSMSG_RAW_BRACE_IN_REGEP,      0, JSEXN_SYNTAXERR, "raw brace is not allowed in regular expression with unicode flag")
+MSG_DEF(JSMSG_RAW_BRACKET_IN_REGEP,    0, JSEXN_SYNTAXERR, "raw bracket is not allowed in regular expression with unicode flag")
 MSG_DEF(JSMSG_TOO_MANY_PARENS,         0, JSEXN_INTERNALERR, "too many parentheses in regular expression")
 MSG_DEF(JSMSG_UNICODE_OVERFLOW,        0, JSEXN_SYNTAXERR, "unicode codepoint should not be greater than 0x10FFFF in regular expression")
 MSG_DEF(JSMSG_UNMATCHED_RIGHT_PAREN,   0, JSEXN_SYNTAXERR, "unmatched ) in regular expression")
diff --git a/js/src/tests/ecma_6/RegExp/unicode-disallow-extended.js b/js/src/tests/ecma_6/RegExp/unicode-disallow-extended.js
new file mode 100644
index 000000000..d1f775fac
--- /dev/null
+++ b/js/src/tests/ecma_6/RegExp/unicode-disallow-extended.js
@@ -0,0 +1,117 @@
+var BUGNUMBER = 1135377;
+var summary = "Implement RegExp unicode flag -- disallow extended patterns.";
+
+print(BUGNUMBER + ": " + summary);
+
+// IdentityEscape: with /u only syntax characters (plus "-" inside a class) may be backslash-escaped.
+
+assertEqArray(/\^\$\\\.\*\+\?\(\)\[\]\{\}\|/u.exec("^$\\.*+?()[]{}|"),
+              ["^$\\.*+?()[]{}|"]);
+assertThrowsInstanceOf(() => eval(`/\\A/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\-/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\U{10}/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\U0000/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\uD83D\\U0000/u`), SyntaxError);
+
+assertEqArray(/[\^\$\\\.\*\+\?\(\)\[\]\{\}\|]+/u.exec("^$\\.*+?()[]{}|"),
+              ["^$\\.*+?()[]{}|"]);
+assertThrowsInstanceOf(() => eval(`/[\\A]/u`), SyntaxError);
+assertEqArray(/[A\-Z]+/u.exec("a-zABC"),
+              ["-"]);
+assertThrowsInstanceOf(() => eval(`/[\\U{10}]/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/[\\U0000]/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/[\\uD83D\\U0000]/u`), SyntaxError);
+
+// PatternCharacter: a raw "{", "}" or "]" outside a character class is a SyntaxError with /u.
+assertThrowsInstanceOf(() => eval(`/{}/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/{/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/}/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/]/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/{0}/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/{1,}/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/{1,2}/u`), SyntaxError);
+
+// QuantifiableAssertion: lookaheads still match with /u, but applying a quantifier to one throws.
+assertEqArray(/.B(?=A)/u.exec("cBaCBA"),
+              ["CB"]);
+assertEqArray(/.B(?!A)/u.exec("CBAcBa"),
+              ["cB"]);
+assertEqArray(/.B(?:A)/u.exec("cBaCBA"),
+              ["CBA"]);
+assertEqArray(/.B(A)/u.exec("cBaCBA"),
+              ["CBA", "A"]);
+
+assertThrowsInstanceOf(() => eval(`/.B(?=A)+/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/.B(?!A)+/u`), SyntaxError);
+assertEqArray(/.B(?:A)+/u.exec("cBaCBA"),
+              ["CBA"]);
+assertEqArray(/.B(A)+/u.exec("cBaCBA"),
+              ["CBA", "A"]);
+
+// ControlLetter: with /u, \c must be followed by an ASCII letter; a bare \c, digits and "_" throw.
+assertEqArray(/\cA/u.exec("\u0001"),
+              ["\u0001"]);
+assertEqArray(/\cZ/u.exec("\u001a"),
+              ["\u001a"]);
+assertEqArray(/\ca/u.exec("\u0001"),
+              ["\u0001"]);
+assertEqArray(/\cz/u.exec("\u001a"),
+              ["\u001a"]);
+
+assertEqArray(/[\cA]/u.exec("\u0001"),
+              ["\u0001"]);
+assertEqArray(/[\cZ]/u.exec("\u001a"),
+              ["\u001a"]);
+assertEqArray(/[\ca]/u.exec("\u0001"),
+              ["\u0001"]);
+assertEqArray(/[\cz]/u.exec("\u001a"),
+              ["\u001a"]);
+
+assertThrowsInstanceOf(() => eval(`/\\c/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\c1/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\c_/u`), SyntaxError);
+
+assertThrowsInstanceOf(() => eval(`/[\\c]/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/[\\c1]/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/[\\c_]/u`), SyntaxError);
+
+// HexEscapeSequence: \x not followed by two hex digits is a SyntaxError with /u.
+assertThrowsInstanceOf(() => eval(`/\\x/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\x0/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\x1/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\x1G/u`), SyntaxError);
+
+// LegacyOctalEscapeSequence: octal escapes such as \52 and \052 are rejected with /u.
+assertThrowsInstanceOf(() => eval(`/\\52/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\052/u`), SyntaxError);
+
+// DecimalEscape: \0 (not followed by a digit) and in-range back references work; the rest throw.
+assertEqArray(/\0/u.exec("\0"),
+              ["\0"]);
+assertEqArray(/[\0]/u.exec("\0"),
+              ["\0"]);
+assertEqArray(/\0A/u.exec("\0A"),
+              ["\0A"]);
+assertEqArray(/\0G/u.exec("\0G"),
+              ["\0G"]);
+assertEqArray(/(A.)\1/u.exec("ABACABAB"),
+              ["ABAB", "AB"]);
+assertEqArray(/(A.)(B.)(C.)(D.)(E.)(F.)(G.)(H.)(I.)(J.)(K.)\10/u.exec("A1B2C3D4E5F6G7H8I9JaKbJa"),
+              ["A1B2C3D4E5F6G7H8I9JaKbJa", "A1", "B2", "C3", "D4", "E5", "F6", "G7", "H8", "I9", "Ja", "Kb"]);
+
+assertThrowsInstanceOf(() => eval(`/\\00/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\01/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\09/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\1/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\2/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\3/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\4/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\5/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\6/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\7/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\8/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\9/u`), SyntaxError);
+assertThrowsInstanceOf(() => eval(`/\\10/u`), SyntaxError);
+
+if (typeof reportCompare === "function")
+    reportCompare(true, true);
diff --git a/js/src/vm/Xdr.h b/js/src/vm/Xdr.h
index b22324cdf..0b3262408 100644
--- a/js/src/vm/Xdr.h
+++ b/js/src/vm/Xdr.h
@@ -29,11 +29,11 @@ namespace js {
  *
  *  https://developer.mozilla.org/en-US/docs/SpiderMonkey/Internals/Bytecode
  */
-static const uint32_t XDR_BYTECODE_VERSION_SUBTRAHEND = 334;
+static const uint32_t XDR_BYTECODE_VERSION_SUBTRAHEND = 335;
 static const uint32_t XDR_BYTECODE_VERSION =
     uint32_t(0xb973c0de - XDR_BYTECODE_VERSION_SUBTRAHEND);
 
-static_assert(JSErr_Limit == 429,
+static_assert(JSErr_Limit == 433,
               "GREETINGS, POTENTIAL SUBTRAHEND INCREMENTER! If you added or "
               "removed MSG_DEFs from js.msg, you should increment "
               "XDR_BYTECODE_VERSION_SUBTRAHEND and update this assertion's "