Implement \u{hex} support. (#3447)
A large rework because surrogate pairs must be combined. Currently only the range 0x10C80..0x10CF2 is accepted as valid identifier characters from the non-basic planes. JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg.u-szeged@partner.samsung.com
This commit is contained in:
committed by
Dániel Bátyai
parent
1db16c3a1c
commit
40d930d62c
@@ -461,16 +461,9 @@ ecma_new_ecma_string_from_utf8_converted_to_cesu8 (const lit_utf8_byte_t *string
|
||||
if ((string_p[pos] & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
|
||||
{
|
||||
/* Processing 4 byte unicode sequence. Always converted to two 3 byte long sequence. */
|
||||
uint32_t character = ((((uint32_t) string_p[pos++]) & 0x7) << 18);
|
||||
character |= ((((uint32_t) string_p[pos++]) & LIT_UTF8_LAST_6_BITS_MASK) << 12);
|
||||
character |= ((((uint32_t) string_p[pos++]) & LIT_UTF8_LAST_6_BITS_MASK) << 6);
|
||||
character |= (((uint32_t) string_p[pos++]) & LIT_UTF8_LAST_6_BITS_MASK);
|
||||
|
||||
JERRY_ASSERT (character >= 0x10000);
|
||||
character -= 0x10000;
|
||||
|
||||
data_p += lit_char_to_utf8_bytes (data_p, (ecma_char_t) (0xd800 | (character >> 10)));
|
||||
data_p += lit_char_to_utf8_bytes (data_p, (ecma_char_t) (0xdc00 | (character & LIT_UTF16_LAST_10_BITS_MASK)));
|
||||
lit_four_byte_utf8_char_to_cesu8 (data_p, string_p + pos);
|
||||
data_p += 3 * 2;
|
||||
pos += 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -2683,10 +2676,10 @@ void
|
||||
ecma_stringbuilder_append_char (ecma_stringbuilder_t *builder_p, /**< string builder */
|
||||
const ecma_char_t c) /**< ecma char */
|
||||
{
|
||||
const lit_utf8_size_t size = (lit_utf8_size_t) lit_char_get_utf8_length (c);
|
||||
const lit_utf8_size_t size = (lit_utf8_size_t) lit_code_point_get_cesu8_length (c);
|
||||
lit_utf8_byte_t *dest_p = ecma_stringbuilder_grow (builder_p, size);
|
||||
|
||||
lit_char_to_utf8_bytes (dest_p, c);
|
||||
lit_code_point_to_cesu8_bytes (dest_p, c);
|
||||
} /* ecma_stringbuilder_append_char */
|
||||
|
||||
/**
|
||||
|
||||
@@ -61,7 +61,7 @@ ecma_date_parse_date_chars (const lit_utf8_byte_t **str_p, /**< pointer to the c
|
||||
|
||||
while (num_of_chars--)
|
||||
{
|
||||
if (*str_p >= str_end_p || !lit_char_is_decimal_digit (lit_utf8_read_next (str_p)))
|
||||
if (*str_p >= str_end_p || !lit_char_is_decimal_digit (lit_cesu8_read_next (str_p)))
|
||||
{
|
||||
return ecma_number_make_nan ();
|
||||
}
|
||||
|
||||
@@ -150,7 +150,7 @@ ecma_builtin_global_object_parse_int (const lit_utf8_byte_t *string_buff, /**< r
|
||||
int sign = 1;
|
||||
|
||||
/* 4. */
|
||||
ecma_char_t current = lit_utf8_read_next (&string_curr_p);
|
||||
ecma_char_t current = lit_cesu8_read_next (&string_curr_p);
|
||||
if (current == LIT_CHAR_MINUS)
|
||||
{
|
||||
sign = -1;
|
||||
@@ -162,7 +162,7 @@ ecma_builtin_global_object_parse_int (const lit_utf8_byte_t *string_buff, /**< r
|
||||
start_p = string_curr_p;
|
||||
if (string_curr_p < string_end_p)
|
||||
{
|
||||
current = lit_utf8_read_next (&string_curr_p);
|
||||
current = lit_cesu8_read_next (&string_curr_p);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -970,7 +970,7 @@ ecma_builtin_global_object_escape (lit_utf8_byte_t *input_start_p, /**< routine'
|
||||
|
||||
while (input_curr_p < input_end_p)
|
||||
{
|
||||
ecma_char_t chr = lit_utf8_read_next (&input_curr_p);
|
||||
ecma_char_t chr = lit_cesu8_read_next (&input_curr_p);
|
||||
|
||||
if (chr <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
||||
{
|
||||
@@ -1005,7 +1005,7 @@ ecma_builtin_global_object_escape (lit_utf8_byte_t *input_start_p, /**< routine'
|
||||
|
||||
while (input_curr_p < input_end_p)
|
||||
{
|
||||
ecma_char_t chr = lit_utf8_read_next (&input_curr_p);
|
||||
ecma_char_t chr = lit_cesu8_read_next (&input_curr_p);
|
||||
|
||||
if (chr <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
||||
{
|
||||
@@ -1091,7 +1091,7 @@ ecma_builtin_global_object_unescape (lit_utf8_byte_t *input_start_p, /**< routin
|
||||
while (input_curr_p < input_end_p)
|
||||
{
|
||||
/* 6. */
|
||||
ecma_char_t chr = lit_utf8_read_next (&input_curr_p);
|
||||
ecma_char_t chr = lit_cesu8_read_next (&input_curr_p);
|
||||
|
||||
/* 7-8. */
|
||||
if (status == 0 && chr == LIT_CHAR_PERCENT)
|
||||
|
||||
@@ -713,7 +713,7 @@ ecma_builtin_helper_string_find_index (ecma_string_t *original_str_p, /**< index
|
||||
|
||||
/* iterate original string and try to match at each position */
|
||||
bool searching = true;
|
||||
ecma_char_t first_char = lit_utf8_read_next (&search_str_curr_p);
|
||||
ecma_char_t first_char = lit_cesu8_read_next (&search_str_curr_p);
|
||||
while (searching)
|
||||
{
|
||||
/* match as long as possible */
|
||||
@@ -722,14 +722,14 @@ ecma_builtin_helper_string_find_index (ecma_string_t *original_str_p, /**< index
|
||||
|
||||
if (match_len < search_len &&
|
||||
index + match_len < original_len &&
|
||||
lit_utf8_read_next (&original_str_curr_p) == first_char)
|
||||
lit_cesu8_read_next (&original_str_curr_p) == first_char)
|
||||
{
|
||||
const lit_utf8_byte_t *nested_search_str_curr_p = search_str_curr_p;
|
||||
match_len++;
|
||||
|
||||
while (match_len < search_len &&
|
||||
index + match_len < original_len &&
|
||||
lit_utf8_read_next (&original_str_curr_p) == lit_utf8_read_next (&nested_search_str_curr_p))
|
||||
lit_cesu8_read_next (&original_str_curr_p) == lit_cesu8_read_next (&nested_search_str_curr_p))
|
||||
{
|
||||
match_len++;
|
||||
}
|
||||
|
||||
@@ -1155,7 +1155,7 @@ ecma_builtin_string_prototype_object_conversion_helper (ecma_string_t *input_str
|
||||
|
||||
while (input_str_curr_p < input_str_end_p)
|
||||
{
|
||||
ecma_char_t character = lit_utf8_read_next (&input_str_curr_p);
|
||||
ecma_char_t character = lit_cesu8_read_next (&input_str_curr_p);
|
||||
ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH];
|
||||
ecma_length_t character_length;
|
||||
lit_utf8_byte_t utf8_byte_buffer[LIT_CESU8_MAX_BYTES_IN_CODE_POINT];
|
||||
@@ -1194,7 +1194,7 @@ ecma_builtin_string_prototype_object_conversion_helper (ecma_string_t *input_str
|
||||
|
||||
while (input_str_curr_p < input_str_end_p)
|
||||
{
|
||||
ecma_char_t character = lit_utf8_read_next (&input_str_curr_p);
|
||||
ecma_char_t character = lit_cesu8_read_next (&input_str_curr_p);
|
||||
ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH];
|
||||
ecma_length_t character_length;
|
||||
|
||||
|
||||
@@ -220,11 +220,11 @@ ecma_regexp_unicode_advance (const lit_utf8_byte_t **str_p, /**< reference to st
|
||||
JERRY_ASSERT (str_p != NULL);
|
||||
const lit_utf8_byte_t *current_p = *str_p;
|
||||
|
||||
lit_code_point_t ch = lit_utf8_read_next (&current_p);
|
||||
lit_code_point_t ch = lit_cesu8_read_next (&current_p);
|
||||
if (lit_is_code_point_utf16_high_surrogate ((ecma_char_t) ch)
|
||||
&& current_p < end_p)
|
||||
{
|
||||
const ecma_char_t next_ch = lit_utf8_peek_next (current_p);
|
||||
const ecma_char_t next_ch = lit_cesu8_peek_next (current_p);
|
||||
if (lit_is_code_point_utf16_low_surrogate (next_ch))
|
||||
{
|
||||
lit_utf8_incr (&current_p);
|
||||
@@ -425,14 +425,14 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
|
||||
|
||||
const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE;
|
||||
lit_code_point_t ch1 = re_get_char (&bc_p); /* Already canonicalized. */
|
||||
lit_code_point_t ch2 = lit_utf8_read_next (&str_curr_p);
|
||||
lit_code_point_t ch2 = lit_cesu8_read_next (&str_curr_p);
|
||||
|
||||
#if ENABLED (JERRY_ES2015)
|
||||
if (re_ctx_p->flags & RE_FLAG_UNICODE
|
||||
&& lit_is_code_point_utf16_high_surrogate (ch2)
|
||||
&& str_curr_p < re_ctx_p->input_end_p)
|
||||
{
|
||||
const ecma_char_t next_ch = lit_utf8_peek_next (str_curr_p);
|
||||
const ecma_char_t next_ch = lit_cesu8_peek_next (str_curr_p);
|
||||
if (lit_is_code_point_utf16_low_surrogate (next_ch))
|
||||
{
|
||||
lit_utf8_incr (&str_curr_p);
|
||||
@@ -460,7 +460,7 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
|
||||
return NULL; /* fail */
|
||||
}
|
||||
|
||||
const ecma_char_t ch = lit_utf8_read_next (&str_curr_p);
|
||||
const ecma_char_t ch = lit_cesu8_read_next (&str_curr_p);
|
||||
JERRY_TRACE_MSG ("Period matching '.' to %u: ", (unsigned int) ch);
|
||||
|
||||
if (lit_char_is_line_terminator (ch))
|
||||
@@ -474,7 +474,7 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
|
||||
&& lit_is_code_point_utf16_high_surrogate (ch)
|
||||
&& str_curr_p < re_ctx_p->input_end_p)
|
||||
{
|
||||
const ecma_char_t next_ch = lit_utf8_peek_next (str_curr_p);
|
||||
const ecma_char_t next_ch = lit_cesu8_peek_next (str_curr_p);
|
||||
if (lit_is_code_point_utf16_low_surrogate (next_ch))
|
||||
{
|
||||
lit_utf8_incr (&str_curr_p);
|
||||
@@ -501,7 +501,7 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
|
||||
return NULL; /* fail */
|
||||
}
|
||||
|
||||
if (lit_char_is_line_terminator (lit_utf8_peek_prev (str_curr_p)))
|
||||
if (lit_char_is_line_terminator (lit_cesu8_peek_prev (str_curr_p)))
|
||||
{
|
||||
JERRY_TRACE_MSG ("match\n");
|
||||
break; /* tail merge */
|
||||
@@ -526,7 +526,7 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
|
||||
return NULL; /* fail */
|
||||
}
|
||||
|
||||
if (lit_char_is_line_terminator (lit_utf8_peek_next (str_curr_p)))
|
||||
if (lit_char_is_line_terminator (lit_cesu8_peek_next (str_curr_p)))
|
||||
{
|
||||
JERRY_TRACE_MSG ("match\n");
|
||||
break; /* tail merge */
|
||||
@@ -539,10 +539,10 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
|
||||
case RE_OP_ASSERT_NOT_WORD_BOUNDARY:
|
||||
{
|
||||
const bool is_wordchar_left = ((str_curr_p > re_ctx_p->input_start_p)
|
||||
&& lit_char_is_word_char (lit_utf8_peek_prev (str_curr_p)));
|
||||
&& lit_char_is_word_char (lit_cesu8_peek_prev (str_curr_p)));
|
||||
|
||||
const bool is_wordchar_right = ((str_curr_p < re_ctx_p->input_end_p)
|
||||
&& lit_char_is_word_char (lit_utf8_peek_next (str_curr_p)));
|
||||
&& lit_char_is_word_char (lit_cesu8_peek_next (str_curr_p)));
|
||||
|
||||
if (op == RE_OP_ASSERT_WORD_BOUNDARY)
|
||||
{
|
||||
@@ -659,7 +659,7 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
|
||||
else
|
||||
{
|
||||
#endif /* ENABLED (JERRY_ES2015) */
|
||||
const ecma_char_t curr_ch = (ecma_char_t) ecma_regexp_canonicalize (lit_utf8_read_next (&str_curr_p),
|
||||
const ecma_char_t curr_ch = (ecma_char_t) ecma_regexp_canonicalize (lit_cesu8_read_next (&str_curr_p),
|
||||
is_ignorecase);
|
||||
|
||||
while (range_count-- > 0)
|
||||
@@ -1115,7 +1115,7 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
|
||||
break;
|
||||
}
|
||||
|
||||
lit_utf8_read_prev (&str_curr_p);
|
||||
lit_cesu8_read_prev (&str_curr_p);
|
||||
iter_count--;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user