Improve RegExp compatibility with web browsers (#3339)

The modification adds support for parsing RegExps like /A{/. That is: if the iterator is invalid, it should be treated as a normal character. This behaviour is defined in the ES2015 standard, Annex B.1.4 (Regular Expressions Patterns). This only works if `JERRY_REGEXP_STRICT_MODE` is disabled (set to zero). JerryScript-DCO-1.0-Signed-off-by: Peter Gal pgal.u-szeged@partner.samsung.com
2019-11-21 10:59:21 +01:00
parent 7f6f562adb
commit aadfa13c38
2 changed files with 222 additions and 14 deletions
@@ -630,11 +630,22 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
    {
      return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp token."));
    }
+    case LIT_CHAR_NULL:
+    {
+      out_token_p->type = RE_TOK_EOF;
+      break;
+    }
    case LIT_CHAR_LEFT_BRACE:
    {
 #if ENABLED (JERRY_REGEXP_STRICT_MODE)
      return ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp token."));
 #else /* !ENABLED (JERRY_REGEXP_STRICT_MODE) */
+
+      /* Make sure that the current '{' does not start an iterator.
+       *
+       * E.g: /\s+{3,4}/ should fail as there is nothing to iterate.
+       *     However /\s+{3,4/ should be valid in web compatibility mode.
+       */
      const lit_utf8_byte_t *input_curr_p = parser_ctx_p->input_curr_p;

      lit_utf8_decr (&parser_ctx_p->input_curr_p);
@@ -648,9 +659,25 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
      ecma_free_value (JERRY_CONTEXT (error_value));

      parser_ctx_p->input_curr_p = input_curr_p;
-
+      /* It was not an iterator, continue the parsing. */
+#endif /* ENABLED (JERRY_REGEXP_STRICT_MODE) */
+      /* FALLTHRU */
+    }
+    default:
+    {
      out_token_p->type = RE_TOK_CHAR;
      out_token_p->value = ch;
+#if ENABLED (JERRY_REGEXP_STRICT_MODE)
+      ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
+#else
      /* In case of compatibility mode try the following:
+       * 1. Try parsing an iterator after the character.
+       * 2.a. If no error is reported: it was an iterator so return an empty value.
+       * 2.b. If there was an error: it was not an iterator thus return the current position
+       *      to the start of the iterator parsing and set the return value to the empty value.
+       * 3. The next 're_parse_next_token' call will handle the further parsing of characters.
+       */
+      const lit_utf8_byte_t *input_curr_p = parser_ctx_p->input_curr_p;
      ret_value = re_parse_iterator (parser_ctx_p, out_token_p);

      if (!ecma_is_value_empty (ret_value))
@@ -659,19 +686,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
        parser_ctx_p->input_curr_p = input_curr_p;
        ret_value = ECMA_VALUE_EMPTY;
      }
-#endif /* ENABLED (JERRY_REGEXP_STRICT_MODE) */
-      break;
-    }
-    case LIT_CHAR_NULL:
-    {
-      out_token_p->type = RE_TOK_EOF;
-      break;
-    }
-    default:
-    {
-      out_token_p->type = RE_TOK_CHAR;
-      out_token_p->value = ch;
-      ret_value = re_parse_iterator (parser_ctx_p, out_token_p);
+#endif
      break;
    }
  }