Rework RegExp engine and add support for proper unicode matching (#3746)

This change includes several bugfixes, general improvements, and support for additional features.
- Added full support for web compatibility syntax defined in Annex B
- Implemented parsing and matching patterns in unicode mode
- Fixed capture results when iterating with nested capturing groups
- Significantly reduced regexp bytecode size
- Reduced stack usage during regexp execution
- Improved matching performance

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai <dbatyai@inf.u-szeged.hu>
2020-05-26 15:28:54 +02:00
parent 908240ba62
commit 8f76a1f382
30 changed files with 3641 additions and 2647 deletions
@@ -1467,7 +1467,7 @@ ecma_gc_run (void)

 #if ENABLED (JERRY_BUILTIN_REGEXP)
  /* Free RegExp bytecodes stored in cache */
-  re_cache_gc_run ();
+  re_cache_gc ();
 #endif /* ENABLED (JERRY_BUILTIN_REGEXP) */
 } /* ecma_gc_run */

@@ -2362,8 +2362,7 @@ ecma_string_trim_helper (const lit_utf8_byte_t **utf8_str_p, /**< [in, out] curr
  {
    read_size = lit_read_code_unit_from_utf8 (current_p, &ch);

-    if (!lit_char_is_white_space (ch)
-        && !lit_char_is_line_terminator (ch))
+    if (!lit_char_is_white_space (ch))
    {
      nonws_start_p = current_p;
      break;
@@ -2378,8 +2377,7 @@ ecma_string_trim_helper (const lit_utf8_byte_t **utf8_str_p, /**< [in, out] curr
  {
    read_size = lit_read_prev_code_unit_from_utf8 (current_p, &ch);

-    if (!lit_char_is_white_space (ch)
-        && !lit_char_is_line_terminator (ch))
+    if (!lit_char_is_white_space (ch))
    {
      break;
    }
@@ -223,13 +223,13 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
      continue;
    }

-    ecma_char_t decoded_byte;
-
-    if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
+    uint32_t hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
+    if (hex_value == UINT32_MAX)
    {
      return ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value."));
    }

+    ecma_char_t decoded_byte = (ecma_char_t) hex_value;
    input_char_p += URI_ENCODED_BYTE_SIZE;

    if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
@@ -272,20 +272,18 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
    /* Input decode. */
    if (*input_char_p != '%')
    {
-      *output_char_p = *input_char_p;
-      output_char_p++;
-      input_char_p++;
+      *output_char_p++ = *input_char_p++;
      continue;
    }

-    ecma_char_t decoded_byte;
-
-    if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
+    uint32_t hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
+    if (hex_value == UINT32_MAX)
    {
      ret_value = ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value."));
      break;
    }

+    ecma_char_t decoded_byte = (ecma_char_t) hex_value;
    input_char_p += URI_ENCODED_BYTE_SIZE;

    if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
@@ -337,17 +335,16 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
        }
        else
        {
-          ecma_char_t chr;
+          hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);

-          if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &chr)
-              || ((chr & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER))
+          if (hex_value == UINT32_MAX || (hex_value & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
          {
            is_valid = false;
            break;
          }

-          octets[i] = (lit_utf8_byte_t) chr;
          input_char_p += URI_ENCODED_BYTE_SIZE;
+          octets[i] = (lit_utf8_byte_t) hex_value;
        }
      }

@@ -174,18 +174,13 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
        }
        case LIT_CHAR_LOWERCASE_U:
        {
-          if ((end_p - current_p <= ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH))
+          uint32_t hex_value = lit_char_hex_lookup (current_p + 1, end_p, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH);
+          if (hex_value == UINT32_MAX)
          {
            goto invalid_string;
          }

-          ecma_char_t code_unit;
-          if (!(lit_read_code_unit_from_hex (current_p + 1, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH, &code_unit)))
-          {
-            goto invalid_string;
-          }
-
-          ecma_stringbuilder_append_char (&result_builder, code_unit);
+          ecma_stringbuilder_append_char (&result_builder, (ecma_char_t) hex_value);
          current_p += ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH + 1;
          break;
        }
@@ -505,12 +505,10 @@ ecma_instantiate_builtin (ecma_builtin_id_t obj_builtin_id) /**< built-in id */

      ext_object_p->u.class_prop.class_id = LIT_MAGIC_STRING_REGEXP_UL;

-      const re_compiled_code_t *bc_p = NULL;
-      ecma_value_t ret_value = re_compile_bytecode (&bc_p,
-                                                    ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP),
-                                                    RE_FLAG_EMPTY);
+      re_compiled_code_t *bc_p = re_compile_bytecode (ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP),
+                                                      RE_FLAG_EMPTY);

-      JERRY_ASSERT (ecma_is_value_empty (ret_value));
+      JERRY_ASSERT (bc_p != NULL);

      ECMA_SET_INTERNAL_VALUE_POINTER (ext_object_p->u.class_prop.u.value, bc_p);

@@ -44,30 +44,73 @@ typedef enum
 } ecma_regexp_flags_t;

 /**
- * Structure for storing capturing group results
+ * Class escapes
+ */
+typedef enum
+{
+  RE_ESCAPE__START,                   /**< escapes start */
+  RE_ESCAPE_DIGIT = RE_ESCAPE__START, /**< digit */
+  RE_ESCAPE_NOT_DIGIT,                /**< not digit */
+  RE_ESCAPE_WORD_CHAR,                /**< word char */
+  RE_ESCAPE_NOT_WORD_CHAR,            /**< not word char */
+  RE_ESCAPE_WHITESPACE,               /**< whitespace */
+  RE_ESCAPE_NOT_WHITESPACE,           /**< not whitespace */
+  RE_ESCAPE__COUNT,                   /**< escape count */
+} ecma_class_escape_t;
+
+/**
+ * Character class flags escape count mask size.
+ */
+#define RE_CLASS_ESCAPE_COUNT_MASK_SIZE (3u)
+
+/**
+ * Character class flags escape count mask.
+ */
+#define RE_CLASS_ESCAPE_COUNT_MASK ((1 << RE_CLASS_ESCAPE_COUNT_MASK_SIZE) - 1u)
+
+/**
+ * Character class flags that are present in the upper bits of the class flags byte, while the 3 least significant bits
+ * hold a value that contains the number of class escapes present in the character class.
+ */
+typedef enum
+{
+  RE_CLASS_HAS_CHARS = (1 << 5),    /**< contains individual characters */
+  RE_CLASS_HAS_RANGES = (1 << 6),   /**< contains character ranges */
+  RE_CLASS_INVERT = (1 << 7),       /**< inverted */
+} ecma_char_class_flags_t;
+
+/**
+ * Structure for matching capturing groups and storing their result
+ */
+typedef struct
+{
+  const lit_utf8_byte_t *begin_p; /**< capture start pointer */
+  const lit_utf8_byte_t *end_p;   /**< capture end pointer */
+  const uint8_t *bc_p;            /**< group bytecode pointer */
+  uint32_t iterator;              /**< iteration counter */
+  uint32_t subcapture_count;      /**< number of nested capturing groups */
+} ecma_regexp_capture_t;
+
+/**
+ * Structure for matching non-capturing groups
 */
 typedef struct
 {
  const lit_utf8_byte_t *begin_p; /**< substring start pointer */
-  const lit_utf8_byte_t *end_p;   /**< substring end pointer */
-} ecma_regexp_capture_t;
+  const uint8_t *bc_p;            /**< group bytecode pointer */
+  uint32_t iterator;              /**< iteration counter */
+  uint32_t subcapture_start;      /**< first nested capturing group index */
+  uint32_t subcapture_count;      /**< number of nested capturing groups */
+} ecma_regexp_non_capture_t;

 /**
 * Check if an ecma_regexp_capture_t contains a defined capture
 */
-#define ECMA_RE_IS_CAPTURE_DEFINED(c) ((c)->begin_p != NULL && (c)->end_p >= (c)->begin_p)
+#define ECMA_RE_IS_CAPTURE_DEFINED(c) ((c)->begin_p != NULL)

 ecma_value_t
 ecma_regexp_get_capture_value (const ecma_regexp_capture_t *const capture_p);

-/**
- * Structure for storing non-capturing group results
- */
-typedef struct
-{
-  const lit_utf8_byte_t *str_p; /**< string pointer */
-} ecma_regexp_non_capture_t;
-
 #if (JERRY_STACK_LIMIT != 0)
 /**
 * Value used ase result when stack limit is reached
@@ -82,27 +125,38 @@ typedef struct
 #define ECMA_RE_STACK_LIMIT_REACHED(p) (false)
 #endif /* JERRY_STACK_LIMIT != 0 */

+/**
+ * Offset applied to qmax when encoded into the bytecode.
+ *
+ * It's common for qmax to be Infinity, which is represented as UINT32_MAX. By applying the offset we are able to store
+ * it in a single byte as zero.
+ */
+#define RE_QMAX_OFFSET 1
+
 /**
 * RegExp executor context
 */
 typedef struct
 {
-  const lit_utf8_byte_t *input_end_p;          /**< end of input string */
  const lit_utf8_byte_t *input_start_p;        /**< start of input string */
+  const lit_utf8_byte_t *input_end_p;          /**< end of input string */
  uint32_t captures_count;                     /**< number of capture groups */
-  ecma_regexp_capture_t *captures_p;           /**< capturing groups */
  uint32_t non_captures_count;                 /**< number of non-capture groups */
+  ecma_regexp_capture_t *captures_p;           /**< capturing groups */
  ecma_regexp_non_capture_t *non_captures_p;   /**< non-capturing groups */
-  uint32_t *iterations_p;                      /**< number of iterations */
  uint16_t flags;                              /**< RegExp flags */
+  uint8_t char_size;                           /**< size of encoded characters */
 } ecma_regexp_ctx_t;

+#if ENABLED (JERRY_ES2015)
+lit_code_point_t ecma_regexp_unicode_advance (const lit_utf8_byte_t **str_p, const lit_utf8_byte_t *end_p);
+#endif /* ENABLED (JERRY_ES2015) */
+
 ecma_object_t *ecma_op_regexp_alloc (ecma_object_t *new_target_obj_p);
 ecma_value_t ecma_regexp_exec_helper (ecma_object_t *regexp_object_p,
                                      ecma_string_t *input_string_p);
 ecma_string_t *ecma_regexp_read_pattern_str_helper (ecma_value_t pattern_arg);
-lit_code_point_t ecma_regexp_canonicalize (lit_code_point_t ch, bool is_ignorecase);
-lit_code_point_t ecma_regexp_canonicalize_char (lit_code_point_t ch);
+lit_code_point_t ecma_regexp_canonicalize_char (lit_code_point_t ch, bool unicode);
 ecma_value_t ecma_regexp_parse_flags (ecma_string_t *flags_str_p, uint16_t *flags_p);
 void ecma_regexp_create_and_initialize_props (ecma_object_t *re_object_p,
                                              ecma_string_t *source_p,