Rework RegExp engine and add support for proper unicode matching (#3746)

This change includes several bugfixes, general improvements, and support
for additional features.
- Added full support for web compatibility syntax defined in Annex B
- Implemented parsing and matching patterns in unicode mode
- Fixed capture results when iterating with nested capturing groups
- Significantly reduced regexp bytecode size
- Reduced stack usage during regexp execution
- Improved matching performance

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu
This commit is contained in:
Dániel Bátyai
2020-05-26 15:28:54 +02:00
committed by GitHub
parent 908240ba62
commit 8f76a1f382
30 changed files with 3641 additions and 2647 deletions
+4 -8
View File
@@ -18,8 +18,6 @@
#include "lit-globals.h"
#define LIT_CHAR_UNDEF ((ecma_char_t) 0xFFFF) /* undefined character */
/*
* Format control characters (ECMA-262 v5, Table 1)
*/
@@ -37,7 +35,7 @@
#define LIT_CHAR_NBSP ((ecma_char_t) 0x00A0) /* no-break space */
/* LIT_CHAR_BOM is defined above */
bool lit_char_is_white_space (ecma_char_t c);
bool lit_char_is_white_space (lit_code_point_t c);
/*
* Line terminator characters (ECMA-262 v5, Table 3)
@@ -219,10 +217,8 @@ uint32_t lit_char_hex_to_int (ecma_char_t c);
size_t lit_code_point_to_cesu8_bytes (uint8_t *dst_p, lit_code_point_t code_point);
size_t lit_code_point_get_cesu8_length (lit_code_point_t code_point);
void lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, const uint8_t *source_p);
/* read a hex encoded code point from a zero terminated buffer */
bool lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, lit_utf8_size_t number_of_characters,
ecma_char_t *out_code_unit_p);
uint32_t lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, const lit_utf8_byte_t *const buf_end_p, uint32_t lookup);
uint32_t lit_parse_decimal (const lit_utf8_byte_t **buffer_p, const lit_utf8_byte_t *const buffer_end_p);
/**
* Null character
@@ -232,7 +228,7 @@ bool lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, lit_utf8_size_t
/*
* Part of IsWordChar abstract operation (ECMA-262 v5, 15.10.2.6, step 3)
*/
bool lit_char_is_word_char (ecma_char_t c);
bool lit_char_is_word_char (lit_code_point_t c);
/*
* Utility functions for uppercasing / lowercasing