Rework RegExp engine and add support for proper unicode matching (#3746)

This change includes several bugfixes, general improvements, and support
for additional features.
- Added full support for web compatibility syntax defined in Annex B
- Implemented parsing and matching patterns in unicode mode
- Fixed capture results when iterating with nested capturing groups
- Significantly reduced regexp bytecode size
- Reduced stack usage during regexp execution
- Improved matching performance

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu
This commit is contained in:
Dániel Bátyai
2020-05-26 15:28:54 +02:00
committed by GitHub
parent 908240ba62
commit 8f76a1f382
30 changed files with 3641 additions and 2647 deletions
+8 -62
View File
@@ -18,45 +18,18 @@
#if ENABLED (JERRY_BUILTIN_REGEXP)
#include "re-compiler-context.h"
/** \addtogroup parser Parser
* @{
*
* \addtogroup regexparser Regular expression
* @{
*
* \addtogroup regexparser_bytecode Bytecode
* \addtogroup regexparser_parser Parser
* @{
*/
/**
 * Kinds of tokens recognised in a RegExp pattern.
 *
 * Enumerators are numbered consecutively starting from zero.
 */
typedef enum
{
  /** EOF */
  RE_TOK_EOF = 0,
  /** backreference, "\[0..9]" */
  RE_TOK_BACKREFERENCE,
  /** any character */
  RE_TOK_CHAR,
  /** alternative, "|" */
  RE_TOK_ALTERNATIVE,
  /** start assertion, "^" */
  RE_TOK_ASSERT_START,
  /** end assertion, "$" */
  RE_TOK_ASSERT_END,
  /** period, "." */
  RE_TOK_PERIOD,
  /** start of a capturing group, "(" */
  RE_TOK_START_CAPTURE_GROUP,
  /** start of a non-capturing group, "(?:" */
  RE_TOK_START_NON_CAPTURE_GROUP,
  /** end of a group, ")" */
  RE_TOK_END_GROUP,
  /** start of a positive lookahead, "(?=" */
  RE_TOK_ASSERT_START_POS_LOOKAHEAD,
  /** start of a negative lookahead, "(?!" */
  RE_TOK_ASSERT_START_NEG_LOOKAHEAD,
  /** word boundary assertion, "\b" */
  RE_TOK_ASSERT_WORD_BOUNDARY,
  /** not-word-boundary assertion, "\B" */
  RE_TOK_ASSERT_NOT_WORD_BOUNDARY,
  /** digit, "\d" */
  RE_TOK_DIGIT,
  /** not digit, "\D" */
  RE_TOK_NOT_DIGIT,
  /** white, "\s" */
  RE_TOK_WHITE,
  /** not white, "\S" */
  RE_TOK_NOT_WHITE,
  /** word character, "\w" */
  RE_TOK_WORD_CHAR,
  /** not word character, "\W" */
  RE_TOK_NOT_WORD_CHAR,
  /** start of a character class, "[ ]" */
  RE_TOK_START_CHAR_CLASS,
  /** start of an inverted character class, "[^ ]" */
  RE_TOK_START_INV_CHAR_CLASS
} re_token_type_t;
/**
* @}
*
@@ -65,43 +38,16 @@ typedef enum
*/
/**
* Value used for an infinite (unbounded) quantifier.
*
* Both macros below expand to the same value: the largest uint32_t.
*/
#define RE_ITERATOR_INFINITE ((uint32_t) - 1)
#define RE_INFINITY UINT32_MAX
/**
* Limits for numeric escapes:
* - RE_MAX_RE_DECESC_DIGITS: maximum number of decimal escape digits
* - RE_MAX_OCTAL_VALUE: maximum decimal value of an octal escape (0xff == 255)
*/
#define RE_MAX_RE_DECESC_DIGITS 9
#define RE_MAX_OCTAL_VALUE 0xff
/**
* RegExp token: the token's type and value together with its
* quantifier bounds (qmin, qmax) and iteration type (greedy).
*/
typedef struct
{
re_token_type_t type; /**< type of the token */
uint32_t value; /**< value of the token */
uint32_t qmin; /**< minimum number of token iterations */
uint32_t qmax; /**< maximum number of token iterations */
bool greedy; /**< type of iteration
              *   NOTE(review): presumably true means greedy matching - confirm */
} re_token_t;
/**
* RegExp parser context: the bounds of and current position in the input
* pattern, plus running counts of groups and character classes.
*/
typedef struct
{
const lit_utf8_byte_t *input_start_p; /**< start of input pattern */
const lit_utf8_byte_t *input_curr_p; /**< current position in input pattern */
const lit_utf8_byte_t *input_end_p; /**< end of input pattern */
int groups_count; /**< number of groups
                   *   NOTE(review): signed int, unlike classes_count (uint32_t) */
uint32_t classes_count; /**< number of character classes */
} re_parser_ctx_t;
/*
 * RegExp parser interface. Only the declarations are visible here; the notes
 * below are inferred from names and signatures and should be confirmed
 * against the implementations.
 */

/* NOTE(review): presumably checks for `lookup` hex digits in the pattern - confirm. */
bool re_hex_lookup (re_parser_ctx_t *parser_ctx_p, uint32_t lookup);

/* NOTE(review): presumably parses an octal escape and returns its value - confirm. */
uint32_t re_parse_octal (re_parser_ctx_t *parser_ctx_p);

/* NOTE(review): presumably parses a quantifier into re_token_p (qmin/qmax/greedy);
 * the ecma_value_t return presumably reports errors - confirm. */
ecma_value_t re_parse_iterator (re_parser_ctx_t *parser_ctx_p, re_token_t *re_token_p);

/* NOTE(review): presumably reads the next token of the pattern into out_token_p - confirm. */
ecma_value_t re_parse_next_token (re_parser_ctx_t *parser_ctx_p, re_token_t *out_token_p);

/* Takes a compiler context (re_compiler_ctx_t) rather than a parser context.
 * NOTE(review): the meaning of expect_eof is not visible from this header - confirm. */
ecma_value_t re_parse_alternative (re_compiler_ctx_t *re_ctx_p, bool expect_eof);
/**
* @}