Rework RegExp engine and add support for proper unicode matching (#3746)

This change includes several bugfixes, general improvements, and support for additional features:

- Added full support for web compatibility syntax defined in Annex B
- Implemented parsing and matching patterns in unicode mode
- Fixed capture results when iterating with nested capturing groups
- Significantly reduced regexp bytecode size
- Reduced stack usage during regexp execution
- Improved matching performance

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu
2020-05-26 15:28:54 +02:00
parent 908240ba62
commit 8f76a1f382
30 changed files with 3641 additions and 2647 deletions
@@ -103,31 +103,32 @@ search_char_in_interval_array (ecma_char_t c,               /**< code unit */
 } /* search_char_in_interval_array */

 /**
- * Check if specified character is one of the Whitespace characters including those
- * that fall into "Space, Separator" ("Zs") Unicode character category.
+ * Check if specified character is one of the Whitespace characters including those that fall into
+ * "Space, Separator" ("Zs") Unicode character category or one of the Line Terminator characters.
 *
 * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 2,
 *         false - otherwise
 */
 bool
-lit_char_is_white_space (ecma_char_t c) /**< code unit */
+lit_char_is_white_space (lit_code_point_t c) /**< code point */
 {
  if (c <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
  {
-    return (c == LIT_CHAR_TAB
-            || c == LIT_CHAR_VTAB
-            || c == LIT_CHAR_FF
-            || c == LIT_CHAR_SP);
+    return (c == LIT_CHAR_SP || (c >= LIT_CHAR_TAB && c <= LIT_CHAR_CR));
  }
  else
  {
-    return (c == LIT_CHAR_NBSP
-            || c == LIT_CHAR_BOM
-            || (c >= lit_unicode_separator_char_interval_sps[0]
-                && c <= lit_unicode_separator_char_interval_sps[0] + lit_unicode_separator_char_interval_lengths[0])
-            || search_char_in_char_array (c,
-                                          lit_unicode_separator_chars,
-                                          NUM_OF_ELEMENTS (lit_unicode_separator_chars)));
+    if (c == LIT_CHAR_NBSP || c == LIT_CHAR_BOM || c == LIT_CHAR_LS || c == LIT_CHAR_PS)
+    {
+      return true;
+    }
+    /* The interval end (start + length) is inclusive, exactly as in the check this code replaces. */
+    return (c <= LIT_UTF16_CODE_UNIT_MAX
+            && ((c >= lit_unicode_separator_char_interval_sps[0]
+                 && c <= lit_unicode_separator_char_interval_sps[0] + lit_unicode_separator_char_interval_lengths[0])
+                || search_char_in_char_array ((ecma_char_t) c,
+                                              lit_unicode_separator_chars,
+                                              NUM_OF_ELEMENTS (lit_unicode_separator_chars))));
  }
 } /* lit_char_is_white_space */

@@ -429,51 +430,72 @@ lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, /**< destination buffer */
 } /* lit_four_byte_utf8_char_to_cesu8 */

 /**
- * Parse the next number_of_characters hexadecimal character,
- * and construct a code unit from them. The buffer must
- * be zero terminated.
+ * Read the next 'lookup' hex digits from a buffer and return their numeric value
 *
- * @return true if decoding was successful, false otherwise
+ * @return UINT32_MAX - if fewer than 'lookup' bytes remain in the buffer or they are not all hex digits
+ *         value of hex number, otherwise
 */
-bool
-lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
-                             lit_utf8_size_t number_of_characters, /**< number of characters to be read */
-                             ecma_char_t *out_code_unit_p) /**< [out] decoded result */
+uint32_t
+lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, /**< buffer */
+                     const lit_utf8_byte_t *const buf_end_p, /**< buffer end */
+                     uint32_t lookup) /**< number of hex digits to read (at most 4) */
 {
-  ecma_char_t code_unit = LIT_CHAR_NULL;
+  JERRY_ASSERT (lookup <= 4);

-  JERRY_ASSERT (number_of_characters >= 2 && number_of_characters <= 4);
-
-  for (lit_utf8_size_t i = 0; i < number_of_characters; i++)
+  if (JERRY_UNLIKELY (buf_p + lookup > buf_end_p))
  {
-    code_unit = (ecma_char_t) (code_unit << 4u);
-
-    if (*buf_p >= LIT_CHAR_ASCII_DIGITS_BEGIN
-        && *buf_p <= LIT_CHAR_ASCII_DIGITS_END)
-    {
-      code_unit |= (ecma_char_t) (*buf_p - LIT_CHAR_ASCII_DIGITS_BEGIN);
-    }
-    else if (*buf_p >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
-             && *buf_p <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
-    {
-      code_unit |= (ecma_char_t) (*buf_p - (LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN - 10));
-    }
-    else if (*buf_p >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN
-             && *buf_p <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_END)
-    {
-      code_unit |= (ecma_char_t) (*buf_p - (LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN - 10));
-    }
-    else
-    {
-      return false;
-    }
-
-    buf_p++;
+    return UINT32_MAX;
  }

-  *out_code_unit_p = code_unit;
-  return true;
-} /* lit_read_code_unit_from_hex */
+  uint32_t value = 0;
+
+  while (lookup--)
+  {
+    lit_utf8_byte_t ch = *buf_p++;
+    if (!lit_char_is_hex_digit (ch))
+    {
+      return UINT32_MAX;
+    }
+
+    value <<= 4;
+    value += lit_char_hex_to_int (ch);
+  }
+
+  JERRY_ASSERT (value <= LIT_UTF16_CODE_UNIT_MAX);
+  return value;
+} /* lit_char_hex_lookup */
+
+/**
+ * Parse a decimal number with the value clamped to UINT32_MAX.
+ * The first character must be a decimal digit; *buffer_p is advanced past the last digit consumed.
+ * @return parsed value, or UINT32_MAX if the number does not fit into 32 bits
+ */
+uint32_t
+lit_parse_decimal (const lit_utf8_byte_t **buffer_p, /**< [in/out] character buffer */
+                   const lit_utf8_byte_t *buffer_end_p) /**< buffer end */
+{
+  const lit_utf8_byte_t *current_p = *buffer_p;
+  JERRY_ASSERT (lit_char_is_decimal_digit (*current_p));
+
+  uint32_t value = (uint32_t) (*current_p++ - LIT_CHAR_0);
+
+  while (current_p < buffer_end_p && lit_char_is_decimal_digit (*current_p))
+  {
+    const uint32_t digit = (uint32_t) (*current_p++ - LIT_CHAR_0);
+    uint32_t new_value = value * 10 + digit;
+    /* Once clamped, 'value > UINT32_MAX / 10' stays true, so the remaining digits are consumed but ignored. */
+    if (JERRY_UNLIKELY (value > UINT32_MAX / 10) || JERRY_UNLIKELY (new_value < value))
+    {
+      value = UINT32_MAX;
+      continue;
+    }
+
+    value = new_value;
+  }
+
+  *buffer_p = current_p;
+  return value;
+} /* lit_parse_decimal */

 /**
 * Check if specified character is a word character (part of IsWordChar abstract operation)
@@ -484,7 +506,7 @@ lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, /**< buffer with char
 *         false - otherwise
 */
 bool
-lit_char_is_word_char (ecma_char_t c) /**< code unit */
+lit_char_is_word_char (lit_code_point_t c) /**< code point */
 {
  return ((c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
          || (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
@@ -18,8 +18,6 @@

 #include "lit-globals.h"

-#define LIT_CHAR_UNDEF ((ecma_char_t) 0xFFFF) /* undefined character */
-
 /*
 * Format control characters (ECMA-262 v5, Table 1)
 */
@@ -37,7 +35,7 @@
 #define LIT_CHAR_NBSP ((ecma_char_t) 0x00A0) /* no-break space */
 /* LIT_CHAR_BOM is defined above */

-bool lit_char_is_white_space (ecma_char_t c);
+bool lit_char_is_white_space (lit_code_point_t c);

 /*
 * Line terminator characters (ECMA-262 v5, Table 3)
@@ -219,10 +217,8 @@ uint32_t lit_char_hex_to_int (ecma_char_t c);
 size_t lit_code_point_to_cesu8_bytes (uint8_t *dst_p, lit_code_point_t code_point);
 size_t lit_code_point_get_cesu8_length (lit_code_point_t code_point);
 void lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, const uint8_t *source_p);
-
-/* read a hex encoded code point from a zero terminated buffer */
-bool lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, lit_utf8_size_t number_of_characters,
-                                  ecma_char_t *out_code_unit_p);
+uint32_t lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, const lit_utf8_byte_t *const buf_end_p, uint32_t lookup);
+uint32_t lit_parse_decimal (const lit_utf8_byte_t **buffer_p, const lit_utf8_byte_t *const buffer_end_p);

 /**
 * Null character
@@ -232,7 +228,7 @@ bool lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, lit_utf8_size_t
 /*
 * Part of IsWordChar abstract operation (ECMA-262 v5, 15.10.2.6, step 3)
 */
-bool lit_char_is_word_char (ecma_char_t c);
+bool lit_char_is_word_char (lit_code_point_t c);

 /*
 * Utility functions for uppercasing / lowercasing
@@ -513,7 +513,7 @@ lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with ch
 *
 * @return next code unit
 */
-ecma_char_t
+ecma_char_t JERRY_ATTR_NOINLINE
 lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
 {
  JERRY_ASSERT (buf_p != NULL);
@@ -529,7 +529,7 @@ lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with cha
 *
 * @return previous code unit
 */
-ecma_char_t
+ecma_char_t JERRY_ATTR_NOINLINE
 lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
 {
  JERRY_ASSERT (buf_p != NULL);
@@ -543,7 +543,7 @@ lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with cha
 /**
 * Increase cesu-8 encoded string pointer by one code unit.
 */
-void
+inline void JERRY_ATTR_ALWAYS_INLINE
 lit_utf8_incr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
 {
  JERRY_ASSERT (*buf_p);