Refactor RegExp builtin (#3136)

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu
2019-10-02 16:55:16 +02:00
parent c3510fc03d
commit c3bb516e4a
20 changed files with 1415 additions and 1337 deletions
@@ -40,26 +40,26 @@
 * @return true - if lookup number of characters ahead are hex digits
 *         false - otherwise
 */
-static bool
+bool
 re_hex_lookup (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */
               uint32_t lookup) /**< size of lookup */
 {
-  bool is_digit = true;
  const lit_utf8_byte_t *curr_p = parser_ctx_p->input_curr_p;

-  for (uint32_t i = 0; is_digit && i < lookup; i++)
+  if (JERRY_UNLIKELY ((int64_t) (parser_ctx_p->input_end_p - curr_p) < (int64_t) lookup))
  {
-    if (curr_p < parser_ctx_p->input_end_p)
-    {
-      is_digit = lit_char_is_hex_digit (*curr_p++);
-    }
-    else
+    return false;
+  }
+
+  for (uint32_t i = 0; i < lookup; i++)
+  {
+    if (!lit_char_is_hex_digit (*curr_p++))
    {
      return false;
    }
  }

-  return is_digit;
+  return true;
 } /* re_hex_lookup */

 /**
@@ -86,7 +86,7 @@ re_parse_non_greedy_char (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser cont
 *
 * @return uint32_t - parsed octal number
 */
-static uint32_t
+uint32_t
 re_parse_octal (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
 {
  uint32_t number = 0;
@@ -110,7 +110,7 @@ re_parse_octal (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
 *
 *         Returned value must be freed with ecma_free_value
 */
-static ecma_value_t
+ecma_value_t
 re_parse_iterator (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */
                   re_token_t *re_token_p) /**< [out] output token */
 {
@@ -253,7 +253,7 @@ static void
 re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
 {
  int char_class_in = 0;
-  parser_ctx_p->num_of_groups = 0;
+  parser_ctx_p->groups_count = 0;
  const lit_utf8_byte_t *curr_p = parser_ctx_p->input_start_p;

  while (curr_p < parser_ctx_p->input_end_p)
@@ -287,7 +287,7 @@ re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser contex
            && *curr_p != LIT_CHAR_QUESTION
            && !char_class_in)
        {
-          parser_ctx_p->num_of_groups++;
+          parser_ctx_p->groups_count++;
        }
        break;
      }
@@ -295,264 +295,6 @@ re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser contex
  }
 } /* re_count_num_of_groups */

-/**
- * Read the input pattern and parse the range of character class
- *
- * @return empty ecma value - if parsed successfully
- *         error ecma value - otherwise
- *
- *         Returned value must be freed with ecma_free_value
- */
-ecma_value_t
-re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
-                     re_char_class_callback append_char_class, /**< callback function,
-                                                                *   which adds the char-ranges
-                                                                *   to the bytecode */
-                     void *re_ctx_p, /**< regexp compiler context */
-                     re_token_t *out_token_p) /**< [out] output token */
-{
-  re_token_type_t token_type = ((re_compiler_ctx_t *) re_ctx_p)->current_token.type;
-  out_token_p->qmax = out_token_p->qmin = 1;
-  ecma_char_t start = LIT_CHAR_UNDEF;
-  bool is_range = false;
-  parser_ctx_p->num_of_classes = 0;
-
-  const ecma_char_t prev_char = lit_utf8_peek_prev (parser_ctx_p->input_curr_p);
-  if (prev_char != LIT_CHAR_LEFT_SQUARE && prev_char != LIT_CHAR_CIRCUMFLEX)
-  {
-    lit_utf8_decr (&parser_ctx_p->input_curr_p);
-    lit_utf8_decr (&parser_ctx_p->input_curr_p);
-  }
-
-  do
-  {
-    if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
-    {
-      return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string"));
-    }
-
-    ecma_char_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p);
-
-    if (ch == LIT_CHAR_RIGHT_SQUARE)
-    {
-      if (start != LIT_CHAR_UNDEF)
-      {
-        append_char_class (re_ctx_p, start, start);
-      }
-      break;
-    }
-    else if (ch == LIT_CHAR_MINUS)
-    {
-      if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
-      {
-        return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '-'"));
-      }
-
-      if (start != LIT_CHAR_UNDEF
-          && !is_range
-          && *parser_ctx_p->input_curr_p != LIT_CHAR_RIGHT_SQUARE)
-      {
-        is_range = true;
-        continue;
-      }
-    }
-    else if (ch == LIT_CHAR_BACKSLASH)
-    {
-      if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p)
-      {
-        return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\'"));
-      }
-
-      ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p);
-
-      if (ch == LIT_CHAR_LOWERCASE_B)
-      {
-        ch = LIT_CHAR_BS;
-      }
-      else if (ch == LIT_CHAR_LOWERCASE_F)
-      {
-        ch = LIT_CHAR_FF;
-      }
-      else if (ch == LIT_CHAR_LOWERCASE_N)
-      {
-        ch = LIT_CHAR_LF;
-      }
-      else if (ch == LIT_CHAR_LOWERCASE_T)
-      {
-        ch = LIT_CHAR_TAB;
-      }
-      else if (ch == LIT_CHAR_LOWERCASE_R)
-      {
-        ch = LIT_CHAR_CR;
-      }
-      else if (ch == LIT_CHAR_LOWERCASE_V)
-      {
-        ch = LIT_CHAR_VTAB;
-      }
-      else if (ch == LIT_CHAR_LOWERCASE_C)
-      {
-        if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p)
-        {
-          ch = *parser_ctx_p->input_curr_p;
-
-          if ((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
-              || (ch >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
-              || (ch >= LIT_CHAR_0 && ch <= LIT_CHAR_9))
-          {
-            /* See ECMA-262 v5, 15.10.2.10 (Point 3) */
-            ch = (ch % 32);
-            parser_ctx_p->input_curr_p++;
-          }
-          else
-          {
-            ch = LIT_CHAR_LOWERCASE_C;
-          }
-        }
-      }
-      else if (ch == LIT_CHAR_LOWERCASE_X && re_hex_lookup (parser_ctx_p, 2))
-      {
-        ecma_char_t code_unit;
-
-        if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 2, &code_unit))
-        {
-          return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\x'"));
-        }
-
-        parser_ctx_p->input_curr_p += 2;
-        if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p
-            && is_range == false
-            && lit_utf8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS)
-        {
-          start = code_unit;
-          continue;
-        }
-
-        ch = code_unit;
-      }
-      else if (ch == LIT_CHAR_LOWERCASE_U && re_hex_lookup (parser_ctx_p, 4))
-      {
-        ecma_char_t code_unit;
-
-        if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 4, &code_unit))
-        {
-          return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\u'"));
-        }
-
-        parser_ctx_p->input_curr_p += 4;
-        if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p
-            && is_range == false
-            && lit_utf8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS)
-        {
-          start = code_unit;
-          continue;
-        }
-
-        ch = code_unit;
-      }
-      else if (ch == LIT_CHAR_LOWERCASE_D)
-      {
-        /* See ECMA-262 v5, 15.10.2.12 */
-        append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_BEGIN, LIT_CHAR_ASCII_DIGITS_END);
-        ch = LIT_CHAR_UNDEF;
-      }
-      else if (ch == LIT_CHAR_UPPERCASE_D)
-      {
-        /* See ECMA-262 v5, 15.10.2.12 */
-        append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_ASCII_DIGITS_BEGIN - 1);
-        append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_END + 1, LIT_UTF16_CODE_UNIT_MAX);
-        ch = LIT_CHAR_UNDEF;
-      }
-      else if (ch == LIT_CHAR_LOWERCASE_S)
-      {
-        /* See ECMA-262 v5, 15.10.2.12 */
-        append_char_class (re_ctx_p, LIT_CHAR_TAB, LIT_CHAR_CR);
-        append_char_class (re_ctx_p, LIT_CHAR_SP, LIT_CHAR_SP);
-        append_char_class (re_ctx_p, LIT_CHAR_NBSP, LIT_CHAR_NBSP);
-        append_char_class (re_ctx_p, 0x1680UL, 0x1680UL); /* Ogham Space Mark */
-        append_char_class (re_ctx_p, 0x180EUL, 0x180EUL); /* Mongolian Vowel Separator */
-        append_char_class (re_ctx_p, 0x2000UL, 0x200AUL); /* En Quad - Hair Space */
-        append_char_class (re_ctx_p, LIT_CHAR_LS, LIT_CHAR_PS);
-        append_char_class (re_ctx_p, 0x202FUL, 0x202FUL); /* Narrow No-Break Space */
-        append_char_class (re_ctx_p, 0x205FUL, 0x205FUL); /* Medium Mathematical Space */
-        append_char_class (re_ctx_p, 0x3000UL, 0x3000UL); /* Ideographic Space */
-        append_char_class (re_ctx_p, LIT_CHAR_BOM, LIT_CHAR_BOM);
-        ch = LIT_CHAR_UNDEF;
-      }
-      else if (ch == LIT_CHAR_UPPERCASE_S)
-      {
-        /* See ECMA-262 v5, 15.10.2.12 */
-        append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_TAB - 1);
-        append_char_class (re_ctx_p, LIT_CHAR_CR + 1, LIT_CHAR_SP - 1);
-        append_char_class (re_ctx_p, LIT_CHAR_SP + 1, LIT_CHAR_NBSP - 1);
-        append_char_class (re_ctx_p, LIT_CHAR_NBSP + 1, 0x167FUL);
-        append_char_class (re_ctx_p, 0x1681UL, 0x180DUL);
-        append_char_class (re_ctx_p, 0x180FUL, 0x1FFFUL);
-        append_char_class (re_ctx_p, 0x200BUL, LIT_CHAR_LS - 1);
-        append_char_class (re_ctx_p, LIT_CHAR_PS + 1, 0x202EUL);
-        append_char_class (re_ctx_p, 0x2030UL, 0x205EUL);
-        append_char_class (re_ctx_p, 0x2060UL, 0x2FFFUL);
-        append_char_class (re_ctx_p, 0x3001UL, LIT_CHAR_BOM - 1);
-        append_char_class (re_ctx_p, LIT_CHAR_BOM + 1, LIT_UTF16_CODE_UNIT_MAX);
-        ch = LIT_CHAR_UNDEF;
-      }
-      else if (ch == LIT_CHAR_LOWERCASE_W)
-      {
-        /* See ECMA-262 v5, 15.10.2.12 */
-        append_char_class (re_ctx_p, LIT_CHAR_0, LIT_CHAR_9);
-        append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_A, LIT_CHAR_UPPERCASE_Z);
-        append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE, LIT_CHAR_UNDERSCORE);
-        append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_A, LIT_CHAR_LOWERCASE_Z);
-        ch = LIT_CHAR_UNDEF;
-      }
-      else if (ch == LIT_CHAR_UPPERCASE_W)
-      {
-        /* See ECMA-262 v5, 15.10.2.12 */
-        append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_0 - 1);
-        append_char_class (re_ctx_p, LIT_CHAR_9 + 1, LIT_CHAR_UPPERCASE_A - 1);
-        append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_Z + 1, LIT_CHAR_UNDERSCORE - 1);
-        append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE + 1, LIT_CHAR_LOWERCASE_A - 1);
-        append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_Z + 1, LIT_UTF16_CODE_UNIT_MAX);
-        ch = LIT_CHAR_UNDEF;
-      }
-      else if (lit_char_is_octal_digit ((ecma_char_t) ch)
-               && ch != LIT_CHAR_0)
-      {
-        lit_utf8_decr (&parser_ctx_p->input_curr_p);
-        ch = (ecma_char_t) re_parse_octal (parser_ctx_p);
-      }
-    } /* ch == LIT_CHAR_BACKSLASH */
-
-    if (start != LIT_CHAR_UNDEF)
-    {
-      if (is_range)
-      {
-        if (start > ch)
-        {
-          return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, wrong order"));
-        }
-        else
-        {
-          append_char_class (re_ctx_p, start, ch);
-          start = LIT_CHAR_UNDEF;
-          is_range = false;
-        }
-      }
-      else
-      {
-        append_char_class (re_ctx_p, start, start);
-        start = ch;
-      }
-    }
-    else
-    {
-      start = ch;
-    }
-  }
-  while (token_type == RE_TOK_START_CHAR_CLASS || token_type == RE_TOK_START_INV_CHAR_CLASS);
-
-  return re_parse_iterator (parser_ctx_p, out_token_p);
-} /* re_parse_char_class */
-
 /**
 * Read the input pattern and parse the next token for the RegExp compiler
 *
@@ -730,12 +472,12 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
        }
        else
        {
-          if (parser_ctx_p->num_of_groups == -1)
+          if (parser_ctx_p->groups_count == -1)
          {
            re_count_num_of_groups (parser_ctx_p);
          }

-          if (parser_ctx_p->num_of_groups)
+          if (parser_ctx_p->groups_count)
          {
            parser_ctx_p->input_curr_p--;
            uint32_t number = 0;
@@ -765,7 +507,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
            }
            while (true);

-            if ((int) number <= parser_ctx_p->num_of_groups)
+            if ((int) number <= parser_ctx_p->groups_count)
            {
              out_token_p->type = RE_TOK_BACKREFERENCE;
            }