Rework RegExp engine and add support for proper unicode matching (#3746)

This change includes several bugfixes, general improvements, and support for additional features.

- Added full support for web compatibility syntax defined in Annex B
- Implemented parsing and matching patterns in unicode mode
- Fixed capture results when iterating with nested capturing groups
- Significantly reduced regexp bytecode size
- Reduced stack usage during regexp execution
- Improved matching performance

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu
2020-05-26 15:28:54 +02:00
parent 908240ba62
commit 8f76a1f382
30 changed files with 3641 additions and 2647 deletions
@@ -18,8 +18,6 @@

 #include "lit-globals.h"

-#define LIT_CHAR_UNDEF ((ecma_char_t) 0xFFFF) /* undefined character */
-
 /*
 * Format control characters (ECMA-262 v5, Table 1)
 */
@@ -37,7 +35,7 @@
 #define LIT_CHAR_NBSP ((ecma_char_t) 0x00A0) /* no-break space */
 /* LIT_CHAR_BOM is defined above */

-bool lit_char_is_white_space (ecma_char_t c);
+bool lit_char_is_white_space (lit_code_point_t c);

 /*
 * Line terminator characters (ECMA-262 v5, Table 3)
@@ -219,10 +217,8 @@ uint32_t lit_char_hex_to_int (ecma_char_t c);
 size_t lit_code_point_to_cesu8_bytes (uint8_t *dst_p, lit_code_point_t code_point);
 size_t lit_code_point_get_cesu8_length (lit_code_point_t code_point);
 void lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, const uint8_t *source_p);
-
-/* read a hex encoded code point from a zero terminated buffer */
-bool lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, lit_utf8_size_t number_of_characters,
-                                  ecma_char_t *out_code_unit_p);
+uint32_t lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, const lit_utf8_byte_t *const buf_end_p, uint32_t lookup);
+uint32_t lit_parse_decimal (const lit_utf8_byte_t **buffer_p, const lit_utf8_byte_t *const buffer_end_p);

 /**
 * Null character
@@ -232,7 +228,7 @@ bool lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, lit_utf8_size_t
 /*
 * Part of IsWordChar abstract operation (ECMA-262 v5, 15.10.2.6, step 3)
 */
-bool lit_char_is_word_char (ecma_char_t c);
+bool lit_char_is_word_char (lit_code_point_t c);

 /*
 * Utility functions for uppercasing / lowercasing