Rework RegExp engine and add support for proper unicode matching (#3746)

This change includes several bugfixes, general improvements, and support for additional features.

- Added full support for web compatibility syntax defined in Annex B
- Implemented parsing and matching patterns in unicode mode
- Fixed capture results when iterating with nested capturing groups
- Significantly reduced regexp bytecode size
- Reduced stack usage during regexp execution
- Improved matching performance

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu
2020-05-26 15:28:54 +02:00
parent 908240ba62
commit 8f76a1f382
30 changed files with 3641 additions and 2647 deletions
@@ -18,45 +18,18 @@

 #if ENABLED (JERRY_BUILTIN_REGEXP)

+#include "re-compiler-context.h"
+
 /** \addtogroup parser Parser
 * @{
 *
 * \addtogroup regexparser Regular expression
 * @{
 *
- * \addtogroup regexparser_bytecode Bytecode
+ * \addtogroup regexparser_parser Parser
 * @{
 */

-/**
- * RegExp token type definitions
- */
-typedef enum
-{
-  RE_TOK_EOF,                        /**< EOF */
-  RE_TOK_BACKREFERENCE,              /**< "\[0..9]" */
-  RE_TOK_CHAR,                       /**< any character */
-  RE_TOK_ALTERNATIVE,                /**< "|" */
-  RE_TOK_ASSERT_START,               /**< "^" */
-  RE_TOK_ASSERT_END,                 /**< "$" */
-  RE_TOK_PERIOD,                     /**< "." */
-  RE_TOK_START_CAPTURE_GROUP,        /**< "(" */
-  RE_TOK_START_NON_CAPTURE_GROUP,    /**< "(?:" */
-  RE_TOK_END_GROUP,                  /**< ")" */
-  RE_TOK_ASSERT_START_POS_LOOKAHEAD, /**< "(?=" */
-  RE_TOK_ASSERT_START_NEG_LOOKAHEAD, /**< "(?!" */
-  RE_TOK_ASSERT_WORD_BOUNDARY,       /**< "\b" */
-  RE_TOK_ASSERT_NOT_WORD_BOUNDARY,   /**< "\B" */
-  RE_TOK_DIGIT,                      /**< "\d" */
-  RE_TOK_NOT_DIGIT,                  /**< "\D" */
-  RE_TOK_WHITE,                      /**< "\s" */
-  RE_TOK_NOT_WHITE,                  /**< "\S" */
-  RE_TOK_WORD_CHAR,                  /**< "\w" */
-  RE_TOK_NOT_WORD_CHAR,              /**< "\W" */
-  RE_TOK_START_CHAR_CLASS,           /**< "[ ]" */
-  RE_TOK_START_INV_CHAR_CLASS,       /**< "[^ ]" */
-} re_token_type_t;
-
 /**
 * @}
 *
@@ -65,43 +38,16 @@ typedef enum
 */

 /**
- * RegExp constant of infinite
+ * Value used for infinite quantifier.
 */
-#define RE_ITERATOR_INFINITE ((uint32_t) - 1)
+#define RE_INFINITY UINT32_MAX

 /**
- * Maximum number of decimal escape digits
+ * Maximum decimal value of an octal escape
 */
-#define RE_MAX_RE_DECESC_DIGITS 9
+#define RE_MAX_OCTAL_VALUE 0xff

-/**
- * RegExp token type
- */
-typedef struct
-{
-  re_token_type_t type;   /**< type of the token */
-  uint32_t value;         /**< value of the token */
-  uint32_t qmin;          /**< minimum number of token iterations */
-  uint32_t qmax;          /**< maximum number of token iterations */
-  bool greedy;            /**< type of iteration */
-} re_token_t;
-
-/**
-  * RegExp parser context
-  */
-typedef struct
-{
-  const lit_utf8_byte_t *input_start_p; /**< start of input pattern */
-  const lit_utf8_byte_t *input_curr_p;  /**< current position in input pattern */
-  const lit_utf8_byte_t *input_end_p;   /**< end of input pattern */
-  int groups_count;                     /**< number of groups */
-  uint32_t classes_count;               /**< number of character classes */
-} re_parser_ctx_t;
-
-bool re_hex_lookup (re_parser_ctx_t *parser_ctx_p, uint32_t lookup);
-uint32_t re_parse_octal (re_parser_ctx_t *parser_ctx_p);
-ecma_value_t re_parse_iterator (re_parser_ctx_t *parser_ctx_p, re_token_t *re_token_p);
-ecma_value_t re_parse_next_token (re_parser_ctx_t *parser_ctx_p, re_token_t *out_token_p);
+ecma_value_t re_parse_alternative (re_compiler_ctx_t *re_ctx_p, bool expect_eof);

 /**
 * @}