Rework RegExp engine and add support for proper unicode matching (#3746)
This change includes several bugfixes, general improvements, and support for additional features. - Added full support for web compatibility syntax defined in Annex B - Implemented parsing and matching patterns in unicode mode - Fixed capture results when iterating with nested capturing groups - Significantly reduced regexp bytecode size - Reduced stack usage during regexp execution - Improved matching performance JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu
This commit is contained in:
@@ -223,13 +223,13 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
|
||||
continue;
|
||||
}
|
||||
|
||||
ecma_char_t decoded_byte;
|
||||
|
||||
if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
|
||||
uint32_t hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
|
||||
if (hex_value == UINT32_MAX)
|
||||
{
|
||||
return ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value."));
|
||||
}
|
||||
|
||||
ecma_char_t decoded_byte = (ecma_char_t) hex_value;
|
||||
input_char_p += URI_ENCODED_BYTE_SIZE;
|
||||
|
||||
if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
||||
@@ -272,20 +272,18 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
|
||||
/* Input decode. */
|
||||
if (*input_char_p != '%')
|
||||
{
|
||||
*output_char_p = *input_char_p;
|
||||
output_char_p++;
|
||||
input_char_p++;
|
||||
*output_char_p++ = *input_char_p++;
|
||||
continue;
|
||||
}
|
||||
|
||||
ecma_char_t decoded_byte;
|
||||
|
||||
if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
|
||||
uint32_t hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
|
||||
if (hex_value == UINT32_MAX)
|
||||
{
|
||||
ret_value = ecma_raise_uri_error (ECMA_ERR_MSG ("Invalid hexadecimal value."));
|
||||
break;
|
||||
}
|
||||
|
||||
ecma_char_t decoded_byte = (ecma_char_t) hex_value;
|
||||
input_char_p += URI_ENCODED_BYTE_SIZE;
|
||||
|
||||
if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
||||
@@ -337,17 +335,16 @@ ecma_builtin_global_object_decode_uri_helper (lit_utf8_byte_t *input_start_p, /*
|
||||
}
|
||||
else
|
||||
{
|
||||
ecma_char_t chr;
|
||||
hex_value = lit_char_hex_lookup (input_char_p + 1, input_end_p, 2);
|
||||
|
||||
if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &chr)
|
||||
|| ((chr & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER))
|
||||
if (hex_value == UINT32_MAX || (hex_value & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
|
||||
{
|
||||
is_valid = false;
|
||||
break;
|
||||
}
|
||||
|
||||
octets[i] = (lit_utf8_byte_t) chr;
|
||||
input_char_p += URI_ENCODED_BYTE_SIZE;
|
||||
octets[i] = (lit_utf8_byte_t) hex_value;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -174,18 +174,13 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
|
||||
}
|
||||
case LIT_CHAR_LOWERCASE_U:
|
||||
{
|
||||
if ((end_p - current_p <= ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH))
|
||||
uint32_t hex_value = lit_char_hex_lookup (current_p + 1, end_p, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH);
|
||||
if (hex_value == UINT32_MAX)
|
||||
{
|
||||
goto invalid_string;
|
||||
}
|
||||
|
||||
ecma_char_t code_unit;
|
||||
if (!(lit_read_code_unit_from_hex (current_p + 1, ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH, &code_unit)))
|
||||
{
|
||||
goto invalid_string;
|
||||
}
|
||||
|
||||
ecma_stringbuilder_append_char (&result_builder, code_unit);
|
||||
ecma_stringbuilder_append_char (&result_builder, (ecma_char_t) hex_value);
|
||||
current_p += ECMA_JSON_HEX_ESCAPE_SEQUENCE_LENGTH + 1;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -505,12 +505,10 @@ ecma_instantiate_builtin (ecma_builtin_id_t obj_builtin_id) /**< built-in id */
|
||||
|
||||
ext_object_p->u.class_prop.class_id = LIT_MAGIC_STRING_REGEXP_UL;
|
||||
|
||||
const re_compiled_code_t *bc_p = NULL;
|
||||
ecma_value_t ret_value = re_compile_bytecode (&bc_p,
|
||||
ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP),
|
||||
RE_FLAG_EMPTY);
|
||||
re_compiled_code_t *bc_p = re_compile_bytecode (ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP),
|
||||
RE_FLAG_EMPTY);
|
||||
|
||||
JERRY_ASSERT (ecma_is_value_empty (ret_value));
|
||||
JERRY_ASSERT (bc_p != NULL);
|
||||
|
||||
ECMA_SET_INTERNAL_VALUE_POINTER (ext_object_p->u.class_prop.u.value, bc_p);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user