Rework RegExp engine and add support for proper unicode matching (#3746)
This change includes several bugfixes, general improvements, and support for additional features.
- Added full support for web compatibility syntax defined in Annex B
- Implemented parsing and matching patterns in unicode mode
- Fixed capture results when iterating with nested capturing groups
- Significantly reduced regexp bytecode size
- Reduced stack usage during regexp execution
- Improved matching performance

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu
This commit is contained in:
@@ -19,6 +19,7 @@
|
||||
#if ENABLED (JERRY_BUILTIN_REGEXP)
|
||||
|
||||
#include "ecma-globals.h"
|
||||
#include "re-compiler-context.h"
|
||||
|
||||
/** \addtogroup parser Parser
|
||||
* @{
|
||||
@@ -40,43 +41,57 @@
|
||||
*/
|
||||
#define RE_FLAGS_MASK 0x3F
|
||||
|
||||
/**
|
||||
* Maximum value that can be encoded in the RegExp bytecode as a single byte.
|
||||
*/
|
||||
#define RE_VALUE_1BYTE_MAX 0xFE
|
||||
|
||||
/**
|
||||
* Marker that signals that the actual value is encoded in the following 4 bytes in the bytecode.
|
||||
*/
|
||||
#define RE_VALUE_4BYTE_MARKER 0xFF
|
||||
|
||||
/**
|
||||
* RegExp opcodes
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
RE_OP_EOF,
|
||||
/* Group opcode order is important, because RE_IS_CAPTURE_GROUP is based on it.
|
||||
* Change it carefully. Capture opcodes should be at first.
|
||||
*/
|
||||
RE_OP_CAPTURE_GROUP_START, /**< group start */
|
||||
RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START, /**< greedy zero group start */
|
||||
RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START, /**< non-greedy zero group start */
|
||||
RE_OP_CAPTURE_GREEDY_GROUP_END, /**< greedy group end */
|
||||
RE_OP_CAPTURE_NON_GREEDY_GROUP_END, /**< non-greedy group end */
|
||||
RE_OP_NON_CAPTURE_GROUP_START, /**< non-capture group start */
|
||||
RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START, /**< non-capture greedy zero group start */
|
||||
RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START, /**< non-capture non-greedy zero group start */
|
||||
RE_OP_NON_CAPTURE_GREEDY_GROUP_END, /**< non-capture greedy group end */
|
||||
RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END, /**< non-capture non-greedy group end */
|
||||
RE_OP_EOF, /**< end of pattern */
|
||||
|
||||
RE_OP_ALTERNATIVE_START, /**< start of alternatives */
|
||||
RE_OP_ALTERNATIVE_NEXT, /**< next alternative */
|
||||
RE_OP_NO_ALTERNATIVE, /**< no alternative */
|
||||
|
||||
RE_OP_CAPTURING_GROUP_START, /**< start of a capturing group */
|
||||
RE_OP_NON_CAPTURING_GROUP_START, /**< start of a non-capturing group */
|
||||
|
||||
RE_OP_GREEDY_CAPTURING_GROUP_END, /**< end of a greedy capturing group */
|
||||
RE_OP_GREEDY_NON_CAPTURING_GROUP_END, /**< end of a greedy non-capturing group */
|
||||
RE_OP_LAZY_CAPTURING_GROUP_END, /**< end of a lazy capturing group */
|
||||
RE_OP_LAZY_NON_CAPTURING_GROUP_END, /**< end of a lazy non-capturing group */
|
||||
|
||||
RE_OP_MATCH, /**< match */
|
||||
RE_OP_CHAR, /**< any character */
|
||||
RE_OP_SAVE_AT_START, /**< save at start */
|
||||
RE_OP_SAVE_AND_MATCH, /**< save and match */
|
||||
RE_OP_PERIOD, /**< "." */
|
||||
RE_OP_ALTERNATIVE, /**< "|" */
|
||||
RE_OP_GREEDY_ITERATOR, /**< greedy iterator */
|
||||
RE_OP_NON_GREEDY_ITERATOR, /**< non-greedy iterator */
|
||||
RE_OP_ASSERT_START, /**< "^" */
|
||||
RE_OP_ASSERT_END, /**< "$" */
|
||||
RE_OP_ASSERT_WORD_BOUNDARY, /**< "\b" */
|
||||
RE_OP_ASSERT_NOT_WORD_BOUNDARY, /**< "\B" */
|
||||
RE_OP_LOOKAHEAD_POS, /**< lookahead pos */
|
||||
RE_OP_LOOKAHEAD_NEG, /**< lookahead neg */
|
||||
RE_OP_BACKREFERENCE, /**< "\[0..9]" */
|
||||
RE_OP_CHAR_CLASS, /**< "[ ]" */
|
||||
RE_OP_INV_CHAR_CLASS /**< "[^ ]" */
|
||||
RE_OP_LAZY_ITERATOR, /**< lazy iterator */
|
||||
RE_OP_ITERATOR_END, /**< end of an iterator */
|
||||
|
||||
RE_OP_BACKREFERENCE, /**< backreference */
|
||||
|
||||
RE_OP_ASSERT_LINE_START, /**< line start assertion */
|
||||
RE_OP_ASSERT_LINE_END, /**< line end assertion */
|
||||
RE_OP_ASSERT_WORD_BOUNDARY, /**< word boundary assertion */
|
||||
RE_OP_ASSERT_NOT_WORD_BOUNDARY, /**< not word boundary assertion */
|
||||
RE_OP_ASSERT_LOOKAHEAD_POS, /**< positive lookahead assertion */
|
||||
RE_OP_ASSERT_LOOKAHEAD_NEG, /**< negative lookahead assertion */
|
||||
RE_OP_ASSERT_END, /**< end of an assertion */
|
||||
|
||||
RE_OP_CLASS_ESCAPE, /**< class escape */
|
||||
RE_OP_CHAR_CLASS, /**< character class */
|
||||
#if ENABLED (JERRY_ES2015)
|
||||
RE_OP_UNICODE_PERIOD, /**< period in full unicode mode */
|
||||
#endif /* ENABLED (JERRY_ES2015) */
|
||||
RE_OP_PERIOD, /**< period in non-unicode mode */
|
||||
RE_OP_CHAR, /**< any code point */
|
||||
RE_OP_BYTE, /**< 1-byte utf8 character */
|
||||
} re_opcode_t;
|
||||
|
||||
/**
|
||||
@@ -85,42 +100,31 @@ typedef enum
|
||||
typedef struct
|
||||
{
|
||||
ecma_compiled_code_t header; /**< compiled code header */
|
||||
uint32_t captures_count; /**< number of capturing groups */
|
||||
uint32_t non_captures_count; /**< number of non-capturing groups */
|
||||
ecma_value_t source; /**< original RegExp pattern */
|
||||
uint32_t captures_count; /**< number of capturing brackets */
|
||||
uint32_t non_captures_count; /**< number of non capturing brackets */
|
||||
} re_compiled_code_t;
|
||||
|
||||
/**
|
||||
* Context of RegExp bytecode container
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
uint8_t *block_start_p; /**< start of bytecode block */
|
||||
uint8_t *block_end_p; /**< end of bytecode block */
|
||||
uint8_t *current_p; /**< current position in bytecode */
|
||||
} re_bytecode_ctx_t;
|
||||
void re_initialize_regexp_bytecode (re_compiler_ctx_t *re_ctx_p);
|
||||
uint32_t re_bytecode_size (re_compiler_ctx_t *re_ctx_p);
|
||||
|
||||
void re_append_opcode (re_compiler_ctx_t *re_ctx_p, const re_opcode_t opcode);
|
||||
void re_append_byte (re_compiler_ctx_t *re_ctx_p, const uint8_t byte);
|
||||
void re_append_char (re_compiler_ctx_t *re_ctx_p, const lit_code_point_t cp);
|
||||
void re_append_value (re_compiler_ctx_t *re_ctx_p, const uint32_t value);
|
||||
|
||||
void re_insert_opcode (re_compiler_ctx_t *re_ctx_p, const uint32_t offset, const re_opcode_t opcode);
|
||||
void re_insert_byte (re_compiler_ctx_t *re_ctx_p, const uint32_t offset, const uint8_t byte);
|
||||
void re_insert_char (re_compiler_ctx_t *re_ctx_p, const uint32_t offset, const lit_code_point_t cp);
|
||||
void re_insert_value (re_compiler_ctx_t *re_ctx_p, const uint32_t offset, const uint32_t value);
|
||||
|
||||
re_opcode_t re_get_opcode (const uint8_t **bc_p);
|
||||
ecma_char_t re_get_char (const uint8_t **bc_p);
|
||||
uint8_t re_get_byte (const uint8_t **bc_p);
|
||||
lit_code_point_t re_get_char (const uint8_t **bc_p, bool unicode);
|
||||
uint32_t re_get_value (const uint8_t **bc_p);
|
||||
uint32_t JERRY_ATTR_PURE re_get_bytecode_length (re_bytecode_ctx_t *bc_ctx_p);
|
||||
|
||||
void re_initialize_regexp_bytecode (re_bytecode_ctx_t *bc_ctx_p);
|
||||
|
||||
void re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, const re_opcode_t opcode);
|
||||
void re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, const uint32_t value);
|
||||
void re_append_char (re_bytecode_ctx_t *bc_ctx_p, const ecma_char_t input_char);
|
||||
void re_append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, uint32_t value);
|
||||
|
||||
void re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, const uint32_t offset, const re_opcode_t opcode);
|
||||
void re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, const uint32_t offset, const uint32_t value);
|
||||
void re_bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p,
|
||||
const size_t offset,
|
||||
const uint8_t *bytecode_p,
|
||||
const size_t length);
|
||||
|
||||
#if ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE)
|
||||
void re_dump_bytecode (re_bytecode_ctx_t *bc_ctx);
|
||||
void re_dump_bytecode (re_compiler_ctx_t *bc_ctx);
|
||||
#endif /* ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) */
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user