Support Unicode supplementary planes (#3928)

JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik frobert@inf.u-szeged.hu
This commit is contained in:
Robert Fancsik
2020-07-06 14:21:13 +02:00
committed by GitHub
parent 7353b253ab
commit c1e90da0b4
16 changed files with 1105 additions and 861 deletions
+14 -4
View File
@@ -2605,6 +2605,19 @@ ecma_stringbuilder_append_raw (ecma_stringbuilder_t *builder_p, /**< string buil
memcpy (dest_p, data_p, data_size);
} /* ecma_stringbuilder_append_raw */
/**
* Append a codepoint to a string builder
*/
void
ecma_stringbuilder_append_codepoint (ecma_stringbuilder_t *builder_p, /**< string builder */
lit_code_point_t cp) /**< code point */
{
const lit_utf8_size_t size = (lit_utf8_size_t) lit_code_point_get_cesu8_length (cp);
lit_utf8_byte_t *dest_p = ecma_stringbuilder_grow (builder_p, size);
lit_code_point_to_cesu8_bytes (dest_p, cp);
} /* ecma_stringbuilder_append_codepoint */
/**
* Append an ecma_char_t to a string builder
*/
@@ -2612,10 +2625,7 @@ void
ecma_stringbuilder_append_char (ecma_stringbuilder_t *builder_p, /**< string builder */
const ecma_char_t c) /**< ecma char */
{
const lit_utf8_size_t size = (lit_utf8_size_t) lit_code_point_get_cesu8_length (c);
lit_utf8_byte_t *dest_p = ecma_stringbuilder_grow (builder_p, size);
lit_code_point_to_cesu8_bytes (dest_p, c);
ecma_stringbuilder_append_codepoint (builder_p, c);
} /* ecma_stringbuilder_append_char */
/**
+1
View File
@@ -393,6 +393,7 @@ void ecma_stringbuilder_append_magic (ecma_stringbuilder_t *builder_p, const lit
void ecma_stringbuilder_append_raw (ecma_stringbuilder_t *builder_p,
const lit_utf8_byte_t *data_p,
const lit_utf8_size_t data_size);
void ecma_stringbuilder_append_codepoint (ecma_stringbuilder_t *builder_p, lit_code_point_t cp);
void ecma_stringbuilder_append_char (ecma_stringbuilder_t *builder_p, const ecma_char_t c);
void ecma_stringbuilder_append_byte (ecma_stringbuilder_t *builder_p, const lit_utf8_byte_t);
ecma_string_t *ecma_stringbuilder_finalize (ecma_stringbuilder_t *builder_p);
@@ -988,96 +988,42 @@ ecma_builtin_string_prototype_object_conversion_helper (ecma_string_t *input_str
bool lower_case) /**< convert to lower (true)
* or upper (false) case */
{
ecma_value_t ret_value = ECMA_VALUE_EMPTY;
ecma_stringbuilder_t builder = ecma_stringbuilder_create ();
/* 3. */
ECMA_STRING_TO_UTF8_STRING (input_string_p, input_start_p, input_start_size);
/*
* The URI encoding has two major phases: first we compute
* the length of the lower case string, then we encode it.
*/
lit_utf8_size_t output_length = 0;
const lit_utf8_byte_t *input_str_curr_p = input_start_p;
const lit_utf8_byte_t *input_curr_p = input_start_p;
const lit_utf8_byte_t *input_str_end_p = input_start_p + input_start_size;
while (input_str_curr_p < input_str_end_p)
while (input_curr_p < input_str_end_p)
{
ecma_char_t character = lit_cesu8_read_next (&input_str_curr_p);
ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH];
ecma_length_t character_length;
lit_utf8_byte_t utf8_byte_buffer[LIT_CESU8_MAX_BYTES_IN_CODE_POINT];
lit_code_point_t cp = lit_cesu8_read_next (&input_curr_p);
#if ENABLED (JERRY_ESNEXT)
if (lit_is_code_point_utf16_high_surrogate (cp))
{
const ecma_char_t next_ch = lit_cesu8_peek_next (input_curr_p);
if (lit_is_code_point_utf16_low_surrogate (next_ch))
{
cp = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) cp, next_ch);
input_curr_p += LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
}
}
#endif /* ENABLED (JERRY_ESNEXT) */
if (lower_case)
{
character_length = lit_char_to_lower_case (character,
character_buffer,
LIT_MAXIMUM_OTHER_CASE_LENGTH);
lit_char_to_lower_case (cp, &builder);
}
else
{
character_length = lit_char_to_upper_case (character,
character_buffer,
LIT_MAXIMUM_OTHER_CASE_LENGTH);
}
JERRY_ASSERT (character_length >= 1 && character_length <= LIT_MAXIMUM_OTHER_CASE_LENGTH);
for (ecma_length_t i = 0; i < character_length; i++)
{
output_length += lit_code_unit_to_utf8 (character_buffer[i], utf8_byte_buffer);
lit_char_to_upper_case (cp, &builder);
}
}
/* Second phase. */
JMEM_DEFINE_LOCAL_ARRAY (output_start_p,
output_length,
lit_utf8_byte_t);
lit_utf8_byte_t *output_char_p = output_start_p;
/* Encoding the output. */
input_str_curr_p = input_start_p;
while (input_str_curr_p < input_str_end_p)
{
ecma_char_t character = lit_cesu8_read_next (&input_str_curr_p);
ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH];
ecma_length_t character_length;
if (lower_case)
{
character_length = lit_char_to_lower_case (character,
character_buffer,
LIT_MAXIMUM_OTHER_CASE_LENGTH);
}
else
{
character_length = lit_char_to_upper_case (character,
character_buffer,
LIT_MAXIMUM_OTHER_CASE_LENGTH);
}
JERRY_ASSERT (character_length >= 1 && character_length <= LIT_MAXIMUM_OTHER_CASE_LENGTH);
for (ecma_length_t i = 0; i < character_length; i++)
{
output_char_p += lit_code_unit_to_utf8 (character_buffer[i], output_char_p);
}
}
JERRY_ASSERT (output_start_p + output_length == output_char_p);
ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_length);
ret_value = ecma_make_string_value (output_string_p);
JMEM_FINALIZE_LOCAL_ARRAY (output_start_p);
ECMA_FINALIZE_UTF8_STRING (input_start_p, input_start_size);
return ret_value;
return ecma_make_string_value (ecma_stringbuilder_finalize (&builder));
} /* ecma_builtin_string_prototype_object_conversion_helper */
/**
@@ -413,23 +413,13 @@ ecma_regexp_canonicalize_char (lit_code_point_t ch, /**< character */
return ch;
}
#if ENABLED (JERRY_ESNEXT)
/* TODO: Implement case folding for code points in the upper planes. */
if (JERRY_UNLIKELY (ch > LIT_UTF16_CODE_UNIT_MAX))
{
return ch;
}
#endif /* ENABLED (JERRY_ESNEXT) */
lit_code_point_t cu = lit_char_to_upper_case (ch, NULL);
ecma_char_t u[LIT_MAXIMUM_OTHER_CASE_LENGTH];
const ecma_length_t size = lit_char_to_upper_case ((ecma_char_t) ch, u, LIT_MAXIMUM_OTHER_CASE_LENGTH);
if (size != 1)
if (cu == LIT_MULTIPLE_CU)
{
return ch;
}
const ecma_char_t cu = u[0];
if (cu <= LIT_UTF8_1_BYTE_CODE_POINT_MAX && !unicode)
{
/* 6. */