From c1e90da0b4830109e61013dac09412c9387b20be Mon Sep 17 00:00:00 2001 From: Robert Fancsik Date: Mon, 6 Jul 2020 14:21:13 +0200 Subject: [PATCH] Support Unicode supplementary planes (#3928) JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik frobert@inf.u-szeged.hu --- jerry-core/ecma/base/ecma-helpers-string.c | 18 +- jerry-core/ecma/base/ecma-helpers.h | 1 + .../ecma-builtin-string-prototype.c | 92 +-- .../ecma/operations/ecma-regexp-object.c | 14 +- jerry-core/lit/lit-char-helpers.c | 602 +++++++++------- jerry-core/lit/lit-char-helpers.h | 19 +- .../lit/lit-unicode-conversions-sup.inc.h | 30 + jerry-core/lit/lit-unicode-conversions.inc.h | 28 +- jerry-core/lit/lit-unicode-ranges-sup.inc.h | 129 ++++ jerry-core/lit/lit-unicode-ranges.inc.h | 213 +++--- .../string-upper-lower-case-conversion.js | 66 ++ .../es.next/unicode-escape-identifiers.js | 32 + .../string-upper-lower-case-conversion.js | 20 + .../string-upper-lower-case-conversion.js | 7 - tests/test262-es6-excludelist.xml | 16 +- tools/gen-unicode.py | 679 ++++++++---------- 16 files changed, 1105 insertions(+), 861 deletions(-) create mode 100644 jerry-core/lit/lit-unicode-conversions-sup.inc.h create mode 100644 jerry-core/lit/lit-unicode-ranges-sup.inc.h create mode 100644 tests/jerry/es.next/string-upper-lower-case-conversion.js create mode 100644 tests/jerry/es.next/unicode-escape-identifiers.js create mode 100644 tests/jerry/es5.1/string-upper-lower-case-conversion.js diff --git a/jerry-core/ecma/base/ecma-helpers-string.c b/jerry-core/ecma/base/ecma-helpers-string.c index 457fd7fe9..144a20a8a 100644 --- a/jerry-core/ecma/base/ecma-helpers-string.c +++ b/jerry-core/ecma/base/ecma-helpers-string.c @@ -2605,6 +2605,19 @@ ecma_stringbuilder_append_raw (ecma_stringbuilder_t *builder_p, /**< string buil memcpy (dest_p, data_p, data_size); } /* ecma_stringbuilder_append_raw */ +/** + * Append a codepoint to a string builder + */ +void +ecma_stringbuilder_append_codepoint (ecma_stringbuilder_t *builder_p, 
/**< string builder */ + lit_code_point_t cp) /**< code point */ +{ + const lit_utf8_size_t size = (lit_utf8_size_t) lit_code_point_get_cesu8_length (cp); + lit_utf8_byte_t *dest_p = ecma_stringbuilder_grow (builder_p, size); + + lit_code_point_to_cesu8_bytes (dest_p, cp); +} /* ecma_stringbuilder_append_codepoint */ + /** * Append an ecma_char_t to a string builder */ @@ -2612,10 +2625,7 @@ void ecma_stringbuilder_append_char (ecma_stringbuilder_t *builder_p, /**< string builder */ const ecma_char_t c) /**< ecma char */ { - const lit_utf8_size_t size = (lit_utf8_size_t) lit_code_point_get_cesu8_length (c); - lit_utf8_byte_t *dest_p = ecma_stringbuilder_grow (builder_p, size); - - lit_code_point_to_cesu8_bytes (dest_p, c); + ecma_stringbuilder_append_codepoint (builder_p, c); } /* ecma_stringbuilder_append_char */ /** diff --git a/jerry-core/ecma/base/ecma-helpers.h b/jerry-core/ecma/base/ecma-helpers.h index 100e9911d..82d4017a3 100644 --- a/jerry-core/ecma/base/ecma-helpers.h +++ b/jerry-core/ecma/base/ecma-helpers.h @@ -393,6 +393,7 @@ void ecma_stringbuilder_append_magic (ecma_stringbuilder_t *builder_p, const lit void ecma_stringbuilder_append_raw (ecma_stringbuilder_t *builder_p, const lit_utf8_byte_t *data_p, const lit_utf8_size_t data_size); +void ecma_stringbuilder_append_codepoint (ecma_stringbuilder_t *builder_p, lit_code_point_t cp); void ecma_stringbuilder_append_char (ecma_stringbuilder_t *builder_p, const ecma_char_t c); void ecma_stringbuilder_append_byte (ecma_stringbuilder_t *builder_p, const lit_utf8_byte_t); ecma_string_t *ecma_stringbuilder_finalize (ecma_stringbuilder_t *builder_p); diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.c b/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.c index c0b73ce3f..4f54da141 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.c +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.c @@ -988,96 +988,42 @@ 
ecma_builtin_string_prototype_object_conversion_helper (ecma_string_t *input_str bool lower_case) /**< convert to lower (true) * or upper (false) case */ { - ecma_value_t ret_value = ECMA_VALUE_EMPTY; + ecma_stringbuilder_t builder = ecma_stringbuilder_create (); - /* 3. */ ECMA_STRING_TO_UTF8_STRING (input_string_p, input_start_p, input_start_size); - /* - * The URI encoding has two major phases: first we compute - * the length of the lower case string, then we encode it. - */ - - lit_utf8_size_t output_length = 0; - const lit_utf8_byte_t *input_str_curr_p = input_start_p; + const lit_utf8_byte_t *input_curr_p = input_start_p; const lit_utf8_byte_t *input_str_end_p = input_start_p + input_start_size; - while (input_str_curr_p < input_str_end_p) + while (input_curr_p < input_str_end_p) { - ecma_char_t character = lit_cesu8_read_next (&input_str_curr_p); - ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH]; - ecma_length_t character_length; - lit_utf8_byte_t utf8_byte_buffer[LIT_CESU8_MAX_BYTES_IN_CODE_POINT]; + lit_code_point_t cp = lit_cesu8_read_next (&input_curr_p); + +#if ENABLED (JERRY_ESNEXT) + if (lit_is_code_point_utf16_high_surrogate (cp)) + { + const ecma_char_t next_ch = lit_cesu8_peek_next (input_curr_p); + if (lit_is_code_point_utf16_low_surrogate (next_ch)) + { + cp = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) cp, next_ch); + input_curr_p += LIT_UTF8_MAX_BYTES_IN_CODE_UNIT; + } + } +#endif /* ENABLED (JERRY_ESNEXT) */ if (lower_case) { - character_length = lit_char_to_lower_case (character, - character_buffer, - LIT_MAXIMUM_OTHER_CASE_LENGTH); + lit_char_to_lower_case (cp, &builder); } else { - character_length = lit_char_to_upper_case (character, - character_buffer, - LIT_MAXIMUM_OTHER_CASE_LENGTH); - } - - JERRY_ASSERT (character_length >= 1 && character_length <= LIT_MAXIMUM_OTHER_CASE_LENGTH); - - for (ecma_length_t i = 0; i < character_length; i++) - { - output_length += lit_code_unit_to_utf8 (character_buffer[i], 
utf8_byte_buffer); + lit_char_to_upper_case (cp, &builder); } } - /* Second phase. */ - - JMEM_DEFINE_LOCAL_ARRAY (output_start_p, - output_length, - lit_utf8_byte_t); - - lit_utf8_byte_t *output_char_p = output_start_p; - - /* Encoding the output. */ - input_str_curr_p = input_start_p; - - while (input_str_curr_p < input_str_end_p) - { - ecma_char_t character = lit_cesu8_read_next (&input_str_curr_p); - ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH]; - ecma_length_t character_length; - - if (lower_case) - { - character_length = lit_char_to_lower_case (character, - character_buffer, - LIT_MAXIMUM_OTHER_CASE_LENGTH); - } - else - { - character_length = lit_char_to_upper_case (character, - character_buffer, - LIT_MAXIMUM_OTHER_CASE_LENGTH); - } - - JERRY_ASSERT (character_length >= 1 && character_length <= LIT_MAXIMUM_OTHER_CASE_LENGTH); - - for (ecma_length_t i = 0; i < character_length; i++) - { - output_char_p += lit_code_unit_to_utf8 (character_buffer[i], output_char_p); - } - } - - JERRY_ASSERT (output_start_p + output_length == output_char_p); - - ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_length); - - ret_value = ecma_make_string_value (output_string_p); - - JMEM_FINALIZE_LOCAL_ARRAY (output_start_p); ECMA_FINALIZE_UTF8_STRING (input_start_p, input_start_size); - return ret_value; + return ecma_make_string_value (ecma_stringbuilder_finalize (&builder)); } /* ecma_builtin_string_prototype_object_conversion_helper */ /** diff --git a/jerry-core/ecma/operations/ecma-regexp-object.c b/jerry-core/ecma/operations/ecma-regexp-object.c index 7e8b35737..b279b608b 100644 --- a/jerry-core/ecma/operations/ecma-regexp-object.c +++ b/jerry-core/ecma/operations/ecma-regexp-object.c @@ -413,23 +413,13 @@ ecma_regexp_canonicalize_char (lit_code_point_t ch, /**< character */ return ch; } -#if ENABLED (JERRY_ESNEXT) - /* TODO: Implement case folding for code points in the upper planes. 
*/ - if (JERRY_UNLIKELY (ch > LIT_UTF16_CODE_UNIT_MAX)) - { - return ch; - } -#endif /* ENABLED (JERRY_ESNEXT) */ + lit_code_point_t cu = lit_char_to_upper_case (ch, NULL); - ecma_char_t u[LIT_MAXIMUM_OTHER_CASE_LENGTH]; - const ecma_length_t size = lit_char_to_upper_case ((ecma_char_t) ch, u, LIT_MAXIMUM_OTHER_CASE_LENGTH); - - if (size != 1) + if (cu == LIT_MULTIPLE_CU) { return ch; } - const ecma_char_t cu = u[0]; if (cu <= LIT_UTF8_1_BYTE_CODE_POINT_MAX && !unicode) { /* 6. */ diff --git a/jerry-core/lit/lit-char-helpers.c b/jerry-core/lit/lit-char-helpers.c index fd733eed1..6ecab2d13 100644 --- a/jerry-core/lit/lit-char-helpers.c +++ b/jerry-core/lit/lit-char-helpers.c @@ -14,12 +14,15 @@ */ #include "config.h" +#include "ecma-helpers.h" #include "lit-char-helpers.h" #include "lit-unicode-ranges.inc.h" +#include "lit-unicode-ranges-sup.inc.h" #include "lit-strings.h" #if ENABLED (JERRY_UNICODE_CASE_CONVERSION) #include "lit-unicode-conversions.inc.h" +#include "lit-unicode-conversions-sup.inc.h" #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ #define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof ((array)[0])) @@ -31,36 +34,43 @@ * @return true - if the character is in the given array * false - otherwise */ -static bool -search_char_in_char_array (ecma_char_t c, /**< code unit */ - const ecma_char_t *array, /**< array */ - int size_of_array) /**< length of the array */ -{ - int bottom = 0; - int top = size_of_array - 1; +#define LIT_SEARCH_CHAR_IN_ARRAY_FN(function_name, char_type, array_type) \ +static bool \ +function_name (char_type c, /**< code unit */ \ + const array_type *array, /**< array */ \ + int size_of_array) /**< length of the array */\ +{ \ + int bottom = 0; \ + int top = size_of_array - 1; \ + \ + while (bottom <= top) \ + { \ + int middle = (bottom + top) / 2; \ + char_type current = array[middle]; \ + \ + if (current == c) \ + { \ + return true; \ + } \ + \ + if (c < current) \ + { \ + top = middle - 1; \ + } \ + else \ + { \ + bottom = 
middle + 1; \ + } \ + } \ + \ + return false; \ +} /* __function_name */ - while (bottom <= top) - { - int middle = (bottom + top) / 2; - ecma_char_t current = array[middle]; +LIT_SEARCH_CHAR_IN_ARRAY_FN (lit_search_char_in_array, ecma_char_t, uint16_t) - if (current == c) - { - return true; - } - - if (c < current) - { - top = middle - 1; - } - else - { - bottom = middle + 1; - } - } - - return false; -} /* search_char_in_char_array */ +#if ENABLED (JERRY_ESNEXT) +LIT_SEARCH_CHAR_IN_ARRAY_FN (lit_search_codepoint_in_array, lit_code_point_t, uint32_t) +#endif /* ENABLED (JERRY_ESNEXT) */ /** * Binary search algorithm that searches a character in the given intervals. @@ -70,37 +80,44 @@ search_char_in_char_array (ecma_char_t c, /**< code unit */ * @return true - if the the character is included (inclusively) in one of the intervals in the given array * false - otherwise */ -static bool -search_char_in_interval_array (ecma_char_t c, /**< code unit */ - const ecma_char_t *array_sp, /**< array of interval starting points */ - const uint8_t *lengths, /**< array of interval lengths */ - int size_of_array) /**< length of the array */ -{ - int bottom = 0; - int top = size_of_array - 1; +#define LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN(function_name, char_type, array_type, interval_type) \ +static bool \ +function_name (char_type c, /**< code unit */ \ + const array_type *array_sp, /**< array of interval starting points */ \ + const interval_type *lengths, /**< array of interval lengths */ \ + int size_of_array) /**< length of the array */ \ +{ \ + int bottom = 0; \ + int top = size_of_array - 1; \ + \ + while (bottom <= top) \ + { \ + int middle = (bottom + top) / 2; \ + char_type current_sp = array_sp[middle]; \ + \ + if (current_sp <= c && c <= current_sp + lengths[middle]) \ + { \ + return true; \ + } \ + \ + if (c > current_sp) \ + { \ + bottom = middle + 1; \ + } \ + else \ + { \ + top = middle - 1; \ + } \ + } \ + \ + return false; \ +} /* function_name */ - while 
(bottom <= top) - { - int middle = (bottom + top) / 2; - ecma_char_t current_sp = array_sp[middle]; +LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN (lit_search_char_in_interval_array, ecma_char_t, uint16_t, uint8_t) - if (current_sp <= c && c <= current_sp + lengths[middle]) - { - return true; - } - - if (c > current_sp) - { - bottom = middle + 1; - } - else - { - top = middle - 1; - } - } - - return false; -} /* search_char_in_interval_array */ +#if ENABLED (JERRY_ESNEXT) +LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN (lit_search_codepoint_in_interval_array, lit_code_point_t, uint32_t, uint16_t) +#endif /* ENABLED (JERRY_ESNEXT) */ /** * Check if specified character is one of the Whitespace characters including those that fall into @@ -116,20 +133,18 @@ lit_char_is_white_space (lit_code_point_t c) /**< code point */ { return (c == LIT_CHAR_SP || (c >= LIT_CHAR_TAB && c <= LIT_CHAR_CR)); } - else - { - if (c == LIT_CHAR_NBSP || c == LIT_CHAR_BOM || c == LIT_CHAR_LS || c == LIT_CHAR_PS) - { - return true; - } - return (c <= LIT_UTF16_CODE_UNIT_MAX - && ((c >= lit_unicode_separator_char_interval_sps[0] - && c < lit_unicode_separator_char_interval_sps[0] + lit_unicode_separator_char_interval_lengths[0]) - || search_char_in_char_array ((ecma_char_t) c, - lit_unicode_separator_chars, - NUM_OF_ELEMENTS (lit_unicode_separator_chars)))); + if (c == LIT_CHAR_NBSP || c == LIT_CHAR_BOM || c == LIT_CHAR_LS || c == LIT_CHAR_PS) + { + return true; } + + return (c <= LIT_UTF16_CODE_UNIT_MAX + && ((c >= lit_unicode_white_space_interval_starts[0] + && c < lit_unicode_white_space_interval_starts[0] + lit_unicode_white_space_interval_lengths[0]) + || lit_search_char_in_array ((ecma_char_t) c, + lit_unicode_white_space_chars, + NUM_OF_ELEMENTS (lit_unicode_white_space_chars)))); } /* lit_char_is_white_space */ /** @@ -148,58 +163,84 @@ lit_char_is_line_terminator (ecma_char_t c) /**< code unit */ } /* lit_char_is_line_terminator */ /** - * Check if specified character is a unicode letter - * - * Note: - 
* Unicode letter is a character, included into one of the following categories: - * - Uppercase letter (Lu); - * - Lowercase letter (Ll); - * - Titlecase letter (Lt); - * - Modifier letter (Lm); - * - Other letter (Lo); - * - Letter number (Nl). + * Check if specified character is a Unicode ID_Start * * See also: - * ECMA-262 v5, 7.6 + * ECMA-262 v1, 11.6: UnicodeIDStart * - * @return true - if specified character falls into one of the listed categories, + * @return true - if the codepoint has Unicode property "ID_Start" * false - otherwise */ static bool -lit_char_is_unicode_letter (ecma_char_t c) /**< code unit */ +lit_char_is_unicode_id_start (lit_code_point_t code_point) /**< code unit */ { - return (search_char_in_interval_array (c, - lit_unicode_letter_interval_sps, - lit_unicode_letter_interval_lengths, - NUM_OF_ELEMENTS (lit_unicode_letter_interval_sps)) - || search_char_in_char_array (c, lit_unicode_letter_chars, NUM_OF_ELEMENTS (lit_unicode_letter_chars))); -} /* lit_char_is_unicode_letter */ +#if ENABLED (JERRY_ESNEXT) + if (JERRY_UNLIKELY (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN)) + { + return (lit_search_codepoint_in_interval_array (code_point, + lit_unicode_id_start_interval_starts_sup, + lit_unicode_id_start_interval_lengths_sup, + NUM_OF_ELEMENTS (lit_unicode_id_start_interval_starts_sup)) + || lit_search_codepoint_in_array (code_point, + lit_unicode_id_start_chars_sup, + NUM_OF_ELEMENTS (lit_unicode_id_start_chars_sup))); + } +#else /* !ENABLED (JERRY_ESNEXT) */ + JERRY_ASSERT (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN); +#endif /* ENABLED (JERRY_ESNEXT) */ + + ecma_char_t c = (ecma_char_t) code_point; + + return (lit_search_char_in_interval_array (c, + lit_unicode_id_start_interval_starts, + lit_unicode_id_start_interval_lengths, + NUM_OF_ELEMENTS (lit_unicode_id_start_interval_starts)) + || lit_search_char_in_array (c, lit_unicode_id_start_chars, NUM_OF_ELEMENTS (lit_unicode_id_start_chars))); +} /* lit_char_is_unicode_id_start */ /** - * 
Check if specified character is a non-letter character and can be used as a - * non-first character of an identifier. - * These characters coverd by the following unicode categories: - * - digit (Nd) - * - punctuation mark (Mn, Mc) - * - connector punctuation (Pc) + * Check if specified character is a Unicode ID_Continue * * See also: - * ECMA-262 v5, 7.6 + * ECMA-262 v1, 11.6: UnicodeIDContinue * - * @return true - if specified character falls into one of the listed categories, + * @return true - if the codepoint has Unicode property "ID_Continue" * false - otherwise */ static bool -lit_char_is_unicode_non_letter_ident_part (ecma_char_t c) /**< code unit */ +lit_char_is_unicode_id_continue (lit_code_point_t code_point) /**< code unit */ { - return (search_char_in_interval_array (c, - lit_unicode_non_letter_ident_part_interval_sps, - lit_unicode_non_letter_ident_part_interval_lengths, - NUM_OF_ELEMENTS (lit_unicode_non_letter_ident_part_interval_sps)) - || search_char_in_char_array (c, - lit_unicode_non_letter_ident_part_chars, - NUM_OF_ELEMENTS (lit_unicode_non_letter_ident_part_chars))); -} /* lit_char_is_unicode_non_letter_ident_part */ + /* Each ID_Start codepoint is ID_Continue as well. 
*/ + if (lit_char_is_unicode_id_start (code_point)) + { + return true; + } + +#if ENABLED (JERRY_ESNEXT) + if (JERRY_UNLIKELY (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN)) + { + return (lit_search_codepoint_in_interval_array (code_point, + lit_unicode_id_continue_interval_starts_sup, + lit_unicode_id_continue_interval_lengths_sup, + NUM_OF_ELEMENTS (lit_unicode_id_continue_interval_starts_sup)) + || lit_search_codepoint_in_array (code_point, + lit_unicode_id_continue_chars_sup, + NUM_OF_ELEMENTS (lit_unicode_id_continue_chars_sup))); + } +#else /* !ENABLED (JERRY_ESNEXT) */ + JERRY_ASSERT (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN); +#endif /* ENABLED (JERRY_ESNEXT) */ + + ecma_char_t c = (ecma_char_t) code_point; + + return (lit_search_char_in_interval_array (c, + lit_unicode_id_continue_interval_starts, + lit_unicode_id_continue_interval_lengths, + NUM_OF_ELEMENTS (lit_unicode_id_continue_interval_starts)) + || lit_search_char_in_array (c, + lit_unicode_id_continue_chars, + NUM_OF_ELEMENTS (lit_unicode_id_continue_chars))); +} /* lit_char_is_unicode_id_continue */ /** * Checks whether the character is a valid identifier start. @@ -218,17 +259,7 @@ lit_code_point_is_identifier_start (lit_code_point_t code_point) /**< code point || code_point == LIT_CHAR_UNDERSCORE); } -#if ENABLED (JERRY_ESNEXT) - if (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN) - { - /* TODO: detect these ranges correctly. 
*/ - return (code_point >= 0x10C80 && code_point <= 0x10CF2); - } -#else /* !ENABLED (JERRY_ESNEXT) */ - JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MIN); -#endif /* ENABLED (JERRY_ESNEXT) */ - - return lit_char_is_unicode_letter ((ecma_char_t) code_point); + return lit_char_is_unicode_id_start (code_point); } /* lit_code_point_is_identifier_start */ /** @@ -249,18 +280,7 @@ lit_code_point_is_identifier_part (lit_code_point_t code_point) /**< code point || code_point == LIT_CHAR_UNDERSCORE); } -#if ENABLED (JERRY_ESNEXT) - if (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN) - { - /* TODO: detect these ranges correctly. */ - return (code_point >= 0x10C80 && code_point <= 0x10CF2); - } -#else /* !ENABLED (JERRY_ESNEXT) */ - JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MIN); -#endif /* ENABLED (JERRY_ESNEXT) */ - - return (lit_char_is_unicode_letter ((ecma_char_t) code_point) - || lit_char_is_unicode_non_letter_ident_part ((ecma_char_t) code_point)); + return lit_char_is_unicode_id_continue (code_point); } /* lit_code_point_is_identifier_part */ /** @@ -519,16 +539,27 @@ lit_char_is_word_char (lit_code_point_t c) /**< code point */ /** * Check if the specified character is in one of those tables which contain bidirectional conversions. * - * @return the mapped character sequence of an ecma character, if it's in the table. - * 0 - otherwise. + * @return codepoint of the converted character if it is found in the tables + * LIT_INVALID_CP - otherwise. */ -static ecma_length_t -search_in_bidirectional_conversion_tables (ecma_char_t character, /**< code unit */ - ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */ - bool is_lowercase) /**< is lowercase conversion */ +static lit_code_point_t +lit_search_in_bidirectional_conversion_tables (lit_code_point_t cp, /**< code point */ + bool is_lowercase) /**< is lowercase conversion */ { - /* 1, Check if the specified character is part of the lit_character_case_ranges table.
*/ - int number_of_case_ranges = NUM_OF_ELEMENTS (lit_character_case_ranges); + /* 1, Check if the specified character is part of the lit_unicode_character_case_ranges_{sup} table. */ + int number_of_case_ranges; +#if ENABLED (JERRY_ESNEXT) + bool is_supplementary = cp > LIT_UTF16_CODE_UNIT_MAX; + if (is_supplementary) + { + number_of_case_ranges = NUM_OF_ELEMENTS (lit_unicode_character_case_ranges_sup); + } + else +#endif /* ENABLED (JERRY_ESNEXT) */ + { + number_of_case_ranges = NUM_OF_ELEMENTS (lit_unicode_character_case_ranges); + } + int conv_counter = 0; for (int i = 0; i < number_of_case_ranges; i++) @@ -538,54 +569,92 @@ search_in_bidirectional_conversion_tables (ecma_char_t character, /**< co conv_counter++; } - int range_length = lit_character_case_range_lengths[conv_counter]; - ecma_char_t start_point = lit_character_case_ranges[i]; + size_t range_length; + lit_code_point_t start_point; +#if ENABLED (JERRY_ESNEXT) + if (is_supplementary) + { + range_length = lit_unicode_character_case_range_lengths_sup[conv_counter]; + start_point = lit_unicode_character_case_ranges_sup[i]; + } + else +#endif /* ENABLED (JERRY_ESNEXT) */ + { + range_length = lit_unicode_character_case_range_lengths[conv_counter]; + start_point = lit_unicode_character_case_ranges[i]; + } - if (start_point > character || character >= start_point + range_length) + if (start_point > cp || cp >= start_point + range_length) { continue; } - int char_dist = character - start_point; - + uint32_t char_dist = (uint32_t) cp - start_point; + int offset; if (i % 2 == 0) { - output_buffer_p[0] = is_lowercase ? (ecma_char_t) (lit_character_case_ranges[i + 1] + char_dist) : character; + if (!is_lowercase) + { + return cp; + } + + offset = i + 1; } else { - output_buffer_p[0] = is_lowercase ? 
character : (ecma_char_t) (lit_character_case_ranges[i - 1] + char_dist); + if (is_lowercase) + { + return cp; + } + + offset = i - 1; } - return 1; +#if ENABLED (JERRY_ESNEXT) + if (is_supplementary) + { + start_point = lit_unicode_character_case_ranges_sup[offset]; + } + else +#endif /* ENABLED (JERRY_ESNEXT) */ + { + start_point = lit_unicode_character_case_ranges[offset]; + } + + return (lit_code_point_t) (start_point + char_dist); } + /* Note: After this point based on the latest unicode standard(13.0.0.6) no conversion characters are + defined for supplementary planes */ +#if ENABLED (JERRY_ESNEXT) + if (is_supplementary) + { + return cp; + } +#endif /* ENABLED (JERRY_ESNEXT) */ + /* 2, Check if the specified character is part of the character_pair_ranges table. */ int bottom = 0; - int top = NUM_OF_ELEMENTS (lit_character_pair_ranges) - 1; + int top = NUM_OF_ELEMENTS (lit_unicode_character_pair_ranges) - 1; while (bottom <= top) { int middle = (bottom + top) / 2; - ecma_char_t current_sp = lit_character_pair_ranges[middle]; + lit_code_point_t current_sp = lit_unicode_character_pair_ranges[middle]; - if (current_sp <= character && character < current_sp + lit_character_pair_range_lengths[middle]) + if (current_sp <= cp && cp < current_sp + lit_unicode_character_pair_range_lengths[middle]) { - int char_dist = character - current_sp; + uint32_t char_dist = (uint32_t) (cp - current_sp); - if ((character - current_sp) % 2 == 0) + if ((cp - current_sp) % 2 == 0) { - output_buffer_p[0] = is_lowercase ? (ecma_char_t) (current_sp + char_dist + 1) : character; - } - else - { - output_buffer_p[0] = is_lowercase ? character : (ecma_char_t) (current_sp + char_dist - 1); + return is_lowercase ? (lit_code_point_t) (current_sp + char_dist + 1) : cp; } - return 1; + return is_lowercase ? 
cp : (lit_code_point_t) (current_sp + char_dist - 1); } - if (character > current_sp) + if (cp > current_sp) { bottom = middle + 1; } @@ -596,41 +665,37 @@ search_in_bidirectional_conversion_tables (ecma_char_t character, /**< co } /* 3, Check if the specified character is part of the character_pairs table. */ - int number_of_character_pairs = NUM_OF_ELEMENTS (lit_character_pairs); + int number_of_character_pairs = NUM_OF_ELEMENTS (lit_unicode_character_pairs); for (int i = 0; i < number_of_character_pairs; i++) { - if (character != lit_character_pairs[i]) + if (cp != lit_unicode_character_pairs[i]) { continue; } if (i % 2 == 0) { - output_buffer_p[0] = is_lowercase ? lit_character_pairs[i + 1] : character; - } - else - { - output_buffer_p[0] = is_lowercase ? character : lit_character_pairs[i - 1]; + return is_lowercase ? lit_unicode_character_pairs[i + 1] : cp; } - return 1; + return is_lowercase ? cp : lit_unicode_character_pairs[i - 1]; } - return 0; -} /* search_in_bidirectional_conversion_tables */ + return LIT_INVALID_CP; +} /* lit_search_in_bidirectional_conversion_tables */ /** * Check if the specified character is in the given conversion table. * - * @return the mapped character sequence of an ecma character, if it's in the table. - * 0 - otherwise. 
+ * @return LIT_MULTIPLE_CU if the converted character consists of more than a single code unit + * converted code point - otherwise */ -static ecma_length_t -search_in_conversion_table (ecma_char_t character, /**< code unit */ - ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */ - const ecma_char_t *array, /**< array */ - const uint8_t *counters) /**< case_values counter */ +static lit_code_point_t +lit_search_in_conversion_table (ecma_char_t character, /**< code unit */ + ecma_stringbuilder_t *builder_p, /**< string builder */ + const ecma_char_t *array, /**< array */ + const uint8_t *counters) /**< case_values counter */ { int end_point = 0; @@ -653,28 +718,21 @@ search_in_conversion_table (ecma_char_t character, /**< code unit */ if (current == character) { - ecma_length_t char_sequence = 1; - - switch (size_of_case_value) + if (builder_p != NULL) { - case 3: + ecma_stringbuilder_append_char (builder_p, array[middle + 1]); + + if (size_of_case_value > 1) { - output_buffer_p[2] = array[middle + 3]; - char_sequence++; - /* FALLTHRU */ + ecma_stringbuilder_append_char (builder_p, array[middle + 2]); } - case 2: + if (size_of_case_value > 2) { - output_buffer_p[1] = array[middle + 2]; - char_sequence++; - /* FALLTHRU */ - } - default: - { - output_buffer_p[0] = array[middle + 1]; - return char_sequence; + ecma_stringbuilder_append_char (builder_p, array[middle + 3]); } } + + return size_of_case_value == 1 ? array[middle + 1]: LIT_MULTIPLE_CU; } if (character < current) @@ -688,127 +746,151 @@ search_in_conversion_table (ecma_char_t character, /**< code unit */ } } - return 0; -} /* search_in_conversion_table */ + if (builder_p != NULL) + { + ecma_stringbuilder_append_char (builder_p, character); + } + + return (lit_code_point_t) character; +} /* lit_search_in_conversion_table */ #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ /** - * Returns the lowercase character sequence of an ecma character.
+ * Append the converted lowercase codeunit sequence of a given codepoint into the stringbuilder if it is present. * - * Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters. - * - * @return the length of the lowercase character sequence - * which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH. + * @return LIT_MULTIPLE_CU if the converted codepoint consists of more than a single code unit + * converted code point - otherwise */ -ecma_length_t -lit_char_to_lower_case (ecma_char_t character, /**< input character value */ - ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */ - ecma_length_t buffer_size) /**< buffer size */ +lit_code_point_t +lit_char_to_lower_case (lit_code_point_t cp, /**< code point */ + ecma_stringbuilder_t *builder_p) /**< string builder */ { - JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH); - - if (character >= LIT_CHAR_UPPERCASE_A && character <= LIT_CHAR_UPPERCASE_Z) + if (cp >= LIT_CHAR_UPPERCASE_A && cp <= LIT_CHAR_UPPERCASE_Z) { - output_buffer_p[0] = (ecma_char_t) (character + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A)); - return 1; + lit_utf8_byte_t lowercase_char = (lit_utf8_byte_t) (cp + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A)); + + if (builder_p != NULL) + { + ecma_stringbuilder_append_byte (builder_p, lowercase_char); + } + + return lowercase_char; } #if ENABLED (JERRY_UNICODE_CASE_CONVERSION) + lit_code_point_t lowercase_cp = lit_search_in_bidirectional_conversion_tables (cp, true); - ecma_length_t lowercase_sequence = search_in_bidirectional_conversion_tables (character, output_buffer_p, true); - - if (lowercase_sequence != 0) + if (lowercase_cp != LIT_INVALID_CP) { - return lowercase_sequence; + if (builder_p != NULL) + { + ecma_stringbuilder_append_codepoint (builder_p, lowercase_cp); + } + + return lowercase_cp; } - int num_of_lowercase_ranges = NUM_OF_ELEMENTS (lit_lower_case_ranges); + JERRY_ASSERT (cp < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
+ + int num_of_lowercase_ranges = NUM_OF_ELEMENTS (lit_unicode_lower_case_ranges); for (int i = 0, j = 0; i < num_of_lowercase_ranges; i += 2, j++) { - int range_length = lit_lower_case_range_lengths[j] - 1; - ecma_char_t start_point = lit_lower_case_ranges[i]; + JERRY_ASSERT (lit_unicode_lower_case_range_lengths[j] > 0); + uint32_t range_length = (uint32_t) (lit_unicode_lower_case_range_lengths[j] - 1); + lit_code_point_t start_point = lit_unicode_lower_case_ranges[i]; - if (start_point <= character && character <= start_point + range_length) + if (start_point <= cp && cp <= start_point + range_length) { - output_buffer_p[0] = (ecma_char_t) (lit_lower_case_ranges[i + 1] + (character - start_point)); - return 1; + lowercase_cp = lit_unicode_lower_case_ranges[i + 1] + (cp - start_point); + if (builder_p != NULL) + { + ecma_stringbuilder_append_codepoint (builder_p, lowercase_cp); + } + + return lowercase_cp; } } - lowercase_sequence = search_in_conversion_table (character, - output_buffer_p, - lit_lower_case_conversions, - lit_lower_case_conversion_counters); - - if (lowercase_sequence != 0) + return lit_search_in_conversion_table ((ecma_char_t) cp, + builder_p, + lit_unicode_lower_case_conversions, + lit_unicode_lower_case_conversion_counters); +#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ + if (builder_p != NULL) { - return lowercase_sequence; + ecma_stringbuilder_append_codepoint (builder_p, cp); } + return cp; #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ - - output_buffer_p[0] = character; - return 1; } /* lit_char_to_lower_case */ /** - * Returns the uppercase character sequence of an ecma character. + * Append the converted uppercase codeunit sequence of a given codepoint into the stringbuilder if it is present. * - * Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters. - * - * @return the length of the uppercase character sequence - * which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH.
+ * @return LIT_MULTIPLE_CU if the converted codepoint consists of more than a single code unit + * converted code point - otherwise */ -ecma_length_t -lit_char_to_upper_case (ecma_char_t character, /**< input character value */ - ecma_char_t *output_buffer_p, /**< buffer for the result characters */ - ecma_length_t buffer_size) /**< buffer size */ +lit_code_point_t +lit_char_to_upper_case (lit_code_point_t cp, /**< code point */ + ecma_stringbuilder_t *builder_p) /**< string builder */ { - JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH); - - if (character >= LIT_CHAR_LOWERCASE_A && character <= LIT_CHAR_LOWERCASE_Z) + if (cp >= LIT_CHAR_LOWERCASE_A && cp <= LIT_CHAR_LOWERCASE_Z) { - output_buffer_p[0] = (ecma_char_t) (character - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A)); - return 1; + lit_utf8_byte_t uppercase_char = (lit_utf8_byte_t) (cp - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A)); + + if (builder_p != NULL) + { + ecma_stringbuilder_append_byte (builder_p, uppercase_char); + } + + return uppercase_char; } #if ENABLED (JERRY_UNICODE_CASE_CONVERSION) + lit_code_point_t uppercase_cp = lit_search_in_bidirectional_conversion_tables (cp, false); - ecma_length_t uppercase_sequence = search_in_bidirectional_conversion_tables (character, output_buffer_p, false); - - if (uppercase_sequence != 0) + if (uppercase_cp != LIT_INVALID_CP) { - return uppercase_sequence; + if (builder_p != NULL) + { + ecma_stringbuilder_append_codepoint (builder_p, uppercase_cp); + } + + return uppercase_cp; } - int num_of_upper_case_special_ranges = NUM_OF_ELEMENTS (lit_upper_case_special_ranges); + int num_of_upper_case_special_ranges = NUM_OF_ELEMENTS (lit_unicode_upper_case_special_ranges); for (int i = 0, j = 0; i < num_of_upper_case_special_ranges; i += 3, j++) { - int range_length = lit_upper_case_special_range_lengths[j]; - ecma_char_t start_point = lit_upper_case_special_ranges[i]; + uint32_t range_length = lit_unicode_upper_case_special_range_lengths[j]; + ecma_char_t
start_point = lit_unicode_upper_case_special_ranges[i]; - if (start_point <= character && character <= start_point + range_length) + if (start_point <= cp && cp <= start_point + range_length) { - output_buffer_p[0] = (ecma_char_t) (lit_upper_case_special_ranges[i + 1] + (character - start_point)); - output_buffer_p[1] = (ecma_char_t) (lit_upper_case_special_ranges[i + 2]); - return 2; + if (builder_p != NULL) + { + uppercase_cp = lit_unicode_upper_case_special_ranges[i + 1] + (cp - start_point); + ecma_stringbuilder_append_codepoint (builder_p, uppercase_cp); + ecma_stringbuilder_append_codepoint (builder_p, lit_unicode_upper_case_special_ranges[i + 2]); + } + + return LIT_MULTIPLE_CU; } } - uppercase_sequence = search_in_conversion_table (character, - output_buffer_p, - lit_upper_case_conversions, - lit_upper_case_conversion_counters); - - if (uppercase_sequence != 0) + return lit_search_in_conversion_table ((ecma_char_t) cp, + builder_p, + lit_unicode_upper_case_conversions, + lit_unicode_upper_case_conversion_counters); +#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ + if (builder_p != NULL) { - return uppercase_sequence; + ecma_stringbuilder_append_codepoint (builder_p, cp); } + return cp; #endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */ - - output_buffer_p[0] = character; - return 1; } /* lit_char_to_upper_case */ diff --git a/jerry-core/lit/lit-char-helpers.h b/jerry-core/lit/lit-char-helpers.h index 315b8f8da..d3a6dc444 100644 --- a/jerry-core/lit/lit-char-helpers.h +++ b/jerry-core/lit/lit-char-helpers.h @@ -18,6 +18,16 @@ #include "lit-globals.h" +/** + * Invalid character code point + */ +#define LIT_INVALID_CP 0xFFFFFFFF + +/** + * Result of lit_char_to_lower_case/lit_char_to_upper_case consists of more than a single code unit + */ +#define LIT_MULTIPLE_CU 0xFFFFFFFE + /* * Format control characters (ECMA-262 v5, Table 1) */ @@ -234,12 +244,7 @@ bool lit_char_is_word_char (lit_code_point_t c); * Utility functions for uppercasing / lowercasing */
-/** - * Minimum buffer size for lit_char_to_lower_case / lit_char_to_upper_case functions. - */ -#define LIT_MAXIMUM_OTHER_CASE_LENGTH (3) - -ecma_length_t lit_char_to_lower_case (ecma_char_t character, ecma_char_t *output_buffer_p, ecma_length_t buffer_size); -ecma_length_t lit_char_to_upper_case (ecma_char_t character, ecma_char_t *output_buffer_p, ecma_length_t buffer_size); +lit_code_point_t lit_char_to_lower_case (lit_code_point_t cp, ecma_stringbuilder_t *builder_p); +lit_code_point_t lit_char_to_upper_case (lit_code_point_t cp, ecma_stringbuilder_t *builder_p); #endif /* !LIT_CHAR_HELPERS_H */ diff --git a/jerry-core/lit/lit-unicode-conversions-sup.inc.h b/jerry-core/lit/lit-unicode-conversions-sup.inc.h new file mode 100644 index 000000000..65f597d1e --- /dev/null +++ b/jerry-core/lit/lit-unicode-conversions-sup.inc.h @@ -0,0 +1,30 @@ +/* Copyright JS Foundation and other contributors, http://js.foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This file is automatically generated by the gen-unicode.py script + * from UnicodeData.txt and SpecialCasing.txt files. Do not edit! */ + +/* Contains start points of character case ranges (these are bidirectional conversions). 
*/ +static const uint32_t lit_unicode_character_case_ranges_sup[] JERRY_ATTR_CONST_DATA = +{ + 0x010400, 0x010428, 0x0104b0, 0x0104d8, 0x010c80, 0x010cc0, 0x0118a0, 0x0118c0, 0x016e40, 0x016e60, + 0x01e900, 0x01e922 +}; + +/* Interval lengths of start points in `character_case_ranges` table. */ +static const uint16_t lit_unicode_character_case_range_lengths_sup[] JERRY_ATTR_CONST_DATA = +{ + 0x000028, 0x000024, 0x000033, 0x000020, 0x000020, 0x000022 +}; diff --git a/jerry-core/lit/lit-unicode-conversions.inc.h b/jerry-core/lit/lit-unicode-conversions.inc.h index b0b83a045..bf4287d9a 100644 --- a/jerry-core/lit/lit-unicode-conversions.inc.h +++ b/jerry-core/lit/lit-unicode-conversions.inc.h @@ -14,10 +14,10 @@ */ /* This file is automatically generated by the gen-unicode.py script - * from UnicodeData-13.0.0d6.txt and SpecialCasing-13.0.0d1.txt files. Do not edit! */ + * from UnicodeData.txt and SpecialCasing.txt files. Do not edit! */ /* Contains start points of character case ranges (these are bidirectional conversions). */ -static const uint16_t lit_character_case_ranges[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_character_case_ranges[] JERRY_ATTR_CONST_DATA = { 0x00c0, 0x00e0, 0x00d8, 0x00f8, 0x0189, 0x0256, 0x01b1, 0x028a, 0x0388, 0x03ad, 0x038e, 0x03cd, 0x0391, 0x03b1, 0x03a3, 0x03c3, 0x03fd, 0x037b, 0x0400, 0x0450, @@ -30,7 +30,7 @@ static const uint16_t lit_character_case_ranges[] JERRY_ATTR_CONST_DATA = }; /* Interval lengths of start points in `character_case_ranges` table. 
*/ -static const uint8_t lit_character_case_range_lengths[] JERRY_ATTR_CONST_DATA = +static const uint8_t lit_unicode_character_case_range_lengths[] JERRY_ATTR_CONST_DATA = { 0x0017, 0x0007, 0x0002, 0x0002, 0x0003, 0x0002, 0x0011, 0x0009, 0x0003, 0x0010, 0x0020, 0x0026, 0x0026, 0x0050, 0x0006, 0x002b, 0x0003, 0x0008, 0x0006, 0x0008, @@ -39,7 +39,7 @@ static const uint8_t lit_character_case_range_lengths[] JERRY_ATTR_CONST_DATA = }; /* Contains the start points of bidirectional conversion ranges. */ -static const uint16_t lit_character_pair_ranges[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_character_pair_ranges[] JERRY_ATTR_CONST_DATA = { 0x0100, 0x0132, 0x0139, 0x014a, 0x0179, 0x0182, 0x0187, 0x018b, 0x0191, 0x0198, 0x01a0, 0x01a7, 0x01ac, 0x01af, 0x01b3, 0x01b8, 0x01bc, 0x01cd, 0x01de, 0x01f4, @@ -50,7 +50,7 @@ static const uint16_t lit_character_pair_ranges[] JERRY_ATTR_CONST_DATA = }; /* Interval lengths of start points in `character_pair_ranges` table. */ -static const uint8_t lit_character_pair_range_lengths[] JERRY_ATTR_CONST_DATA = +static const uint8_t lit_unicode_character_pair_range_lengths[] JERRY_ATTR_CONST_DATA = { 0x0030, 0x0006, 0x0010, 0x002e, 0x0006, 0x0004, 0x0002, 0x0002, 0x0002, 0x0002, 0x0006, 0x0002, 0x0002, 0x0002, 0x0004, 0x0002, 0x0002, 0x0010, 0x0012, 0x0002, @@ -61,7 +61,7 @@ static const uint8_t lit_character_pair_range_lengths[] JERRY_ATTR_CONST_DATA = }; /* Contains lower/upper case bidirectional conversion pairs. */ -static const uint16_t lit_character_pairs[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_character_pairs[] JERRY_ATTR_CONST_DATA = { 0x0178, 0x00ff, 0x0181, 0x0253, 0x0186, 0x0254, 0x018e, 0x01dd, 0x018f, 0x0259, 0x0190, 0x025b, 0x0193, 0x0260, 0x0194, 0x0263, 0x0196, 0x0269, 0x0197, 0x0268, @@ -81,20 +81,20 @@ static const uint16_t lit_character_pairs[] JERRY_ATTR_CONST_DATA = /* Contains start points of one-to-two uppercase ranges where the second character * is always the same. 
*/ -static const uint16_t lit_upper_case_special_ranges[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_upper_case_special_ranges[] JERRY_ATTR_CONST_DATA = { 0x1f80, 0x1f08, 0x0399, 0x1f88, 0x1f08, 0x0399, 0x1f90, 0x1f28, 0x0399, 0x1f98, 0x1f28, 0x0399, 0x1fa0, 0x1f68, 0x0399, 0x1fa8, 0x1f68, 0x0399 }; /* Interval lengths for start points in `upper_case_special_ranges` table. */ -static const uint8_t lit_upper_case_special_range_lengths[] JERRY_ATTR_CONST_DATA = +static const uint8_t lit_unicode_upper_case_special_range_lengths[] JERRY_ATTR_CONST_DATA = { 0x0007, 0x0007, 0x0007, 0x0007, 0x0007, 0x0007 }; /* Contains start points of lowercase ranges. */ -static const uint16_t lit_lower_case_ranges[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_lower_case_ranges[] JERRY_ATTR_CONST_DATA = { 0x1e96, 0x1e96, 0x1f80, 0x1f80, 0x1f88, 0x1f80, 0x1f90, 0x1f90, 0x1f98, 0x1f90, 0x1fa0, 0x1fa0, 0x1fa8, 0x1fa0, 0x1fb2, 0x1fb2, 0x1fb6, 0x1fb6, 0x1fc2, 0x1fc2, @@ -103,14 +103,14 @@ static const uint16_t lit_lower_case_ranges[] JERRY_ATTR_CONST_DATA = }; /* Interval lengths for start points in `lower_case_ranges` table. */ -static const uint8_t lit_lower_case_range_lengths[] JERRY_ATTR_CONST_DATA = +static const uint8_t lit_unicode_lower_case_range_lengths[] JERRY_ATTR_CONST_DATA = { 0x0005, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0003, 0x0002, 0x0003, 0x0002, 0x0002, 0x0002, 0x0003, 0x0002, 0x0003, 0x0002, 0x0007, 0x0005 }; /* The remaining lowercase conversions. The lowercase variant can be one-to-three character long. 
*/ -static const uint16_t lit_lower_case_conversions[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_lower_case_conversions[] JERRY_ATTR_CONST_DATA = { 0x00df, 0x00df, 0x0149, 0x0149, 0x01c5, 0x01c6, 0x01c8, 0x01c9, 0x01cb, 0x01cc, 0x01f0, 0x01f0, 0x01f2, 0x01f3, 0x0390, 0x0390, 0x03b0, 0x03b0, 0x03f4, 0x03b8, @@ -120,13 +120,13 @@ static const uint16_t lit_lower_case_conversions[] JERRY_ATTR_CONST_DATA = }; /* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */ -static const uint8_t lit_lower_case_conversion_counters[] JERRY_ATTR_CONST_DATA = +static const uint8_t lit_unicode_lower_case_conversion_counters[] JERRY_ATTR_CONST_DATA = { 0x0016, 0x0001, 0x0000 }; /* The remaining uppercase conversions. The uppercase variant can be one-to-three character long. */ -static const uint16_t lit_upper_case_conversions[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_upper_case_conversions[] JERRY_ATTR_CONST_DATA = { 0x00b5, 0x039c, 0x0130, 0x0130, 0x0131, 0x0049, 0x017f, 0x0053, 0x01c5, 0x01c4, 0x01c8, 0x01c7, 0x01cb, 0x01ca, 0x01f2, 0x01f1, 0x0345, 0x0399, 0x03c2, 0x03a3, @@ -157,7 +157,7 @@ static const uint16_t lit_upper_case_conversions[] JERRY_ATTR_CONST_DATA = }; /* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. */ -static const uint8_t lit_upper_case_conversion_counters[] JERRY_ATTR_CONST_DATA = +static const uint8_t lit_unicode_upper_case_conversion_counters[] JERRY_ATTR_CONST_DATA = { 0x001c, 0x002c, 0x0010 }; diff --git a/jerry-core/lit/lit-unicode-ranges-sup.inc.h b/jerry-core/lit/lit-unicode-ranges-sup.inc.h new file mode 100644 index 000000000..6b856948f --- /dev/null +++ b/jerry-core/lit/lit-unicode-ranges-sup.inc.h @@ -0,0 +1,129 @@ +/* Copyright JS Foundation and other contributors, http://js.foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This file is automatically generated by the gen-unicode.py script + * from DerivedCoreProperties.txt. Do not edit! */ + +/** + * Character interval starting points for ID_Start. + */ +static const uint32_t lit_unicode_id_start_interval_starts_sup[] JERRY_ATTR_CONST_DATA = +{ + 0x010000, 0x01000d, 0x010028, 0x01003c, 0x01003f, 0x010050, 0x010080, 0x010140, 0x010280, 0x0102a0, + 0x010300, 0x01032d, 0x010350, 0x010380, 0x0103a0, 0x0103c8, 0x0103d1, 0x010400, 0x0104b0, 0x0104d8, + 0x010500, 0x010530, 0x010600, 0x010740, 0x010760, 0x010800, 0x01080a, 0x010837, 0x01083f, 0x010860, + 0x010880, 0x0108e0, 0x0108f4, 0x010900, 0x010920, 0x010980, 0x0109be, 0x010a10, 0x010a15, 0x010a19, + 0x010a60, 0x010a80, 0x010ac0, 0x010ac9, 0x010b00, 0x010b40, 0x010b60, 0x010b80, 0x010c00, 0x010c80, + 0x010cc0, 0x010d00, 0x010e80, 0x010eb0, 0x010f00, 0x010f30, 0x010fb0, 0x010fe0, 0x011003, 0x011083, + 0x0110d0, 0x011103, 0x011150, 0x011183, 0x0111c1, 0x011200, 0x011213, 0x011280, 0x01128a, 0x01128f, + 0x01129f, 0x0112b0, 0x011305, 0x01130f, 0x011313, 0x01132a, 0x011332, 0x011335, 0x01135d, 0x011400, + 0x011447, 0x01145f, 0x011480, 0x0114c4, 0x011580, 0x0115d8, 0x011600, 0x011680, 0x011700, 0x011800, + 0x0118a0, 0x0118ff, 0x01190c, 0x011915, 0x011918, 0x0119a0, 0x0119aa, 0x011a0b, 0x011a5c, 0x011ac0, + 0x011c00, 0x011c0a, 0x011c72, 0x011d00, 0x011d08, 0x011d0b, 0x011d60, 0x011d67, 0x011d6a, 0x011ee0, + 0x012000, 0x012400, 0x012480, 0x013000, 0x014400, 0x016800, 0x016a40, 0x016ad0, 0x016b00, 0x016b40, + 0x016b63, 0x016b7d, 0x016e40, 0x016f00, 0x016f93, 0x016fe0, 
0x017000, 0x018800, 0x018d00, 0x01b000, + 0x01b150, 0x01b164, 0x01b170, 0x01bc00, 0x01bc70, 0x01bc80, 0x01bc90, 0x01d400, 0x01d456, 0x01d49e, + 0x01d4a5, 0x01d4a9, 0x01d4ae, 0x01d4bd, 0x01d4c5, 0x01d507, 0x01d50d, 0x01d516, 0x01d51e, 0x01d53b, + 0x01d540, 0x01d54a, 0x01d552, 0x01d6a8, 0x01d6c2, 0x01d6dc, 0x01d6fc, 0x01d716, 0x01d736, 0x01d750, + 0x01d770, 0x01d78a, 0x01d7aa, 0x01d7c4, 0x01e100, 0x01e137, 0x01e2c0, 0x01e800, 0x01e900, 0x01ee00, + 0x01ee05, 0x01ee21, 0x01ee29, 0x01ee34, 0x01ee4d, 0x01ee51, 0x01ee61, 0x01ee67, 0x01ee6c, 0x01ee74, + 0x01ee79, 0x01ee80, 0x01ee8b, 0x01eea1, 0x01eea5, 0x01eeab, 0x020000, 0x02a700, 0x02b740, 0x02b820, + 0x02ceb0, 0x02f800, 0x030000 +}; + +/** + * Character interval lengths for ID_Start. + */ +static const uint16_t lit_unicode_id_start_interval_lengths_sup[] JERRY_ATTR_CONST_DATA = +{ + 0x00000b, 0x000019, 0x000012, 0x000001, 0x00000e, 0x00000d, 0x00007a, 0x000034, 0x00001c, 0x000030, + 0x00001f, 0x00001d, 0x000025, 0x00001d, 0x000023, 0x000007, 0x000004, 0x00009d, 0x000023, 0x000023, + 0x000027, 0x000033, 0x000136, 0x000015, 0x000007, 0x000005, 0x00002b, 0x000001, 0x000016, 0x000016, + 0x00001e, 0x000012, 0x000001, 0x000015, 0x000019, 0x000037, 0x000001, 0x000003, 0x000002, 0x00001c, + 0x00001c, 0x00001c, 0x000007, 0x00001b, 0x000035, 0x000015, 0x000012, 0x000011, 0x000048, 0x000032, + 0x000032, 0x000023, 0x000029, 0x000001, 0x00001c, 0x000015, 0x000014, 0x000016, 0x000034, 0x00002c, + 0x000018, 0x000023, 0x000022, 0x00002f, 0x000003, 0x000011, 0x000018, 0x000006, 0x000003, 0x00000e, + 0x000009, 0x00002e, 0x000007, 0x000001, 0x000015, 0x000006, 0x000001, 0x000004, 0x000004, 0x000034, + 0x000003, 0x000002, 0x00002f, 0x000001, 0x00002e, 0x000003, 0x00002f, 0x00002a, 0x00001a, 0x00002b, + 0x00003f, 0x000007, 0x000007, 0x000001, 0x000017, 0x000007, 0x000026, 0x000027, 0x00002d, 0x000038, + 0x000008, 0x000024, 0x00001d, 0x000006, 0x000001, 0x000025, 0x000005, 0x000001, 0x00001f, 0x000012, + 0x000399, 0x00006e, 0x0000c3, 
0x00042e, 0x000246, 0x000238, 0x00001e, 0x00001d, 0x00002f, 0x000003, + 0x000014, 0x000012, 0x00003f, 0x00004a, 0x00000c, 0x000001, 0x0017f7, 0x0004d5, 0x000008, 0x00011e, + 0x000002, 0x000003, 0x00018b, 0x00006a, 0x00000c, 0x000008, 0x000009, 0x000054, 0x000046, 0x000001, + 0x000001, 0x000003, 0x00000b, 0x000006, 0x000040, 0x000003, 0x000007, 0x000006, 0x00001b, 0x000003, + 0x000004, 0x000006, 0x000153, 0x000018, 0x000018, 0x00001e, 0x000018, 0x00001e, 0x000018, 0x00001e, + 0x000018, 0x00001e, 0x000018, 0x000007, 0x00002c, 0x000006, 0x00002b, 0x0000c4, 0x000043, 0x000003, + 0x00001a, 0x000001, 0x000009, 0x000003, 0x000002, 0x000001, 0x000001, 0x000003, 0x000006, 0x000003, + 0x000003, 0x000009, 0x000010, 0x000002, 0x000004, 0x000010, 0x00a6dd, 0x001034, 0x0000dd, 0x001681, + 0x001d30, 0x00021d, 0x00134a +}; + +/** + * Non-interval characters for ID_Start. + */ +static const uint32_t lit_unicode_id_start_chars_sup[] JERRY_ATTR_CONST_DATA = +{ + 0x010808, 0x01083c, 0x010a00, 0x010f27, 0x011144, 0x011147, 0x011176, 0x0111da, 0x0111dc, 0x011288, + 0x01133d, 0x011350, 0x0114c7, 0x011644, 0x0116b8, 0x011909, 0x01193f, 0x011941, 0x0119e1, 0x0119e3, + 0x011a00, 0x011a3a, 0x011a50, 0x011a9d, 0x011c40, 0x011d46, 0x011d98, 0x011fb0, 0x016f50, 0x016fe3, + 0x01d4a2, 0x01d4bb, 0x01d546, 0x01e14e, 0x01e94b, 0x01ee24, 0x01ee27, 0x01ee39, 0x01ee3b, 0x01ee42, + 0x01ee47, 0x01ee49, 0x01ee4b, 0x01ee54, 0x01ee57, 0x01ee59, 0x01ee5b, 0x01ee5d, 0x01ee5f, 0x01ee64, + 0x01ee7e +}; + +/** + * Character interval starting points for ID_Continue. 
+ */ +static const uint32_t lit_unicode_id_continue_interval_starts_sup[] JERRY_ATTR_CONST_DATA = +{ + 0x010376, 0x0104a0, 0x010a01, 0x010a05, 0x010a0c, 0x010a38, 0x010ae5, 0x010d24, 0x010d30, 0x010eab, + 0x010f46, 0x011000, 0x011038, 0x011066, 0x01107f, 0x0110b0, 0x0110f0, 0x011100, 0x011127, 0x011136, + 0x011145, 0x011180, 0x0111b3, 0x0111c9, 0x0111ce, 0x01122c, 0x0112df, 0x0112f0, 0x011300, 0x01133b, + 0x01133e, 0x011347, 0x01134b, 0x011362, 0x011366, 0x011370, 0x011435, 0x011450, 0x0114b0, 0x0114d0, + 0x0115af, 0x0115b8, 0x0115dc, 0x011630, 0x011650, 0x0116ab, 0x0116c0, 0x01171d, 0x011730, 0x01182c, + 0x0118e0, 0x011930, 0x011937, 0x01193b, 0x011942, 0x011950, 0x0119d1, 0x0119da, 0x011a01, 0x011a33, + 0x011a3b, 0x011a51, 0x011a8a, 0x011c2f, 0x011c38, 0x011c50, 0x011c92, 0x011ca9, 0x011d31, 0x011d3c, + 0x011d3f, 0x011d50, 0x011d8a, 0x011d90, 0x011d93, 0x011da0, 0x011ef3, 0x016a60, 0x016af0, 0x016b30, + 0x016b50, 0x016f51, 0x016f8f, 0x016ff0, 0x01bc9d, 0x01d165, 0x01d16d, 0x01d17b, 0x01d185, 0x01d1aa, + 0x01d242, 0x01d7ce, 0x01da00, 0x01da3b, 0x01da9b, 0x01daa1, 0x01e000, 0x01e008, 0x01e01b, 0x01e023, + 0x01e026, 0x01e130, 0x01e140, 0x01e2ec, 0x01e8d0, 0x01e944, 0x01e950, 0x01fbf0, 0x0e0100 +}; + +/** + * Character interval lengths for ID_Continue. 
+ */ +static const uint16_t lit_unicode_id_continue_interval_lengths_sup[] JERRY_ATTR_CONST_DATA = +{ + 0x000004, 0x000009, 0x000002, 0x000001, 0x000003, 0x000002, 0x000001, 0x000003, 0x000009, 0x000001, + 0x00000a, 0x000002, 0x00000e, 0x000009, 0x000003, 0x00000a, 0x000009, 0x000002, 0x00000d, 0x000009, + 0x000001, 0x000002, 0x00000d, 0x000003, 0x00000b, 0x00000b, 0x00000b, 0x000009, 0x000003, 0x000001, + 0x000006, 0x000001, 0x000002, 0x000001, 0x000006, 0x000004, 0x000011, 0x000009, 0x000013, 0x000009, + 0x000006, 0x000008, 0x000001, 0x000010, 0x000009, 0x00000c, 0x000009, 0x00000e, 0x000009, 0x00000e, + 0x000009, 0x000005, 0x000001, 0x000003, 0x000001, 0x000009, 0x000006, 0x000006, 0x000009, 0x000006, + 0x000003, 0x00000a, 0x00000f, 0x000007, 0x000007, 0x000009, 0x000015, 0x00000d, 0x000005, 0x000001, + 0x000006, 0x000009, 0x000004, 0x000001, 0x000004, 0x000009, 0x000003, 0x000009, 0x000004, 0x000006, + 0x000009, 0x000036, 0x000003, 0x000001, 0x000001, 0x000004, 0x000005, 0x000007, 0x000006, 0x000003, + 0x000002, 0x000031, 0x000036, 0x000031, 0x000004, 0x00000e, 0x000006, 0x000010, 0x000006, 0x000001, + 0x000004, 0x000006, 0x000009, 0x00000d, 0x000006, 0x000006, 0x000009, 0x000009, 0x0000ef +}; + +/** + * Non-interval characters for ID_Continue. + */ +static const uint32_t lit_unicode_id_continue_chars_sup[] JERRY_ATTR_CONST_DATA = +{ + 0x0101fd, 0x0102e0, 0x010a3f, 0x011173, 0x01123e, 0x011357, 0x01145e, 0x011940, 0x0119e4, 0x011a47, + 0x011d3a, 0x011d47, 0x016f4f, 0x016fe4, 0x01da75, 0x01da84 +}; diff --git a/jerry-core/lit/lit-unicode-ranges.inc.h b/jerry-core/lit/lit-unicode-ranges.inc.h index 49d402a23..de34c3d5e 100644 --- a/jerry-core/lit/lit-unicode-ranges.inc.h +++ b/jerry-core/lit/lit-unicode-ranges.inc.h @@ -14,15 +14,12 @@ */ /* This file is automatically generated by the gen-unicode.py script - * from UnicodeData-13.0.0d6.txt. Do not edit! */ + * from DerivedCoreProperties.txt. Do not edit! 
*/ /** - * Character interval starting points for the unicode letters. - * - * The characters covered by these intervals are from - * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl + * Character interval starting points for ID_Start. */ -static const uint16_t lit_unicode_letter_interval_sps[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_id_start_interval_starts[] JERRY_ATTR_CONST_DATA = { 0x00c0, 0x00d8, 0x00f8, 0x01f8, 0x02c6, 0x02e0, 0x0370, 0x0376, 0x037a, 0x0388, 0x038e, 0x03a3, 0x03f7, 0x048a, 0x0531, 0x0560, 0x05d0, 0x05ef, 0x0620, 0x066e, @@ -39,46 +36,43 @@ static const uint16_t lit_unicode_letter_interval_sps[] JERRY_ATTR_CONST_DATA = 0x10fc, 0x11fc, 0x124a, 0x1250, 0x125a, 0x1260, 0x128a, 0x1290, 0x12b2, 0x12b8, 0x12c2, 0x12c8, 0x12d8, 0x1312, 0x1318, 0x1380, 0x13a0, 0x13f8, 0x1401, 0x1501, 0x1601, 0x166f, 0x1681, 0x16a0, 0x16ee, 0x1700, 0x170e, 0x1720, 0x1740, 0x1760, - 0x176e, 0x1780, 0x1820, 0x1880, 0x1887, 0x18b0, 0x1900, 0x1950, 0x1970, 0x1980, - 0x19b0, 0x1a00, 0x1a20, 0x1b05, 0x1b45, 0x1b83, 0x1bae, 0x1bba, 0x1c00, 0x1c4d, - 0x1c5a, 0x1c80, 0x1c90, 0x1cbd, 0x1ce9, 0x1cee, 0x1cf5, 0x1d00, 0x1e00, 0x1f00, - 0x1f18, 0x1f20, 0x1f48, 0x1f50, 0x1f5f, 0x1f80, 0x1fb6, 0x1fc2, 0x1fc6, 0x1fd0, - 0x1fd6, 0x1fe0, 0x1ff2, 0x1ff6, 0x2090, 0x210a, 0x2119, 0x212a, 0x212f, 0x213c, - 0x2145, 0x2160, 0x2c00, 0x2c30, 0x2c60, 0x2ceb, 0x2cf2, 0x2d00, 0x2d30, 0x2d80, - 0x2da0, 0x2da8, 0x2db0, 0x2db8, 0x2dc0, 0x2dc8, 0x2dd0, 0x2dd8, 0x3005, 0x3021, - 0x3031, 0x3038, 0x3041, 0x309d, 0x30a1, 0x30fc, 0x3105, 0x3131, 0x31a0, 0x31f0, - 0x3400, 0x3500, 0x3600, 0x3700, 0x3800, 0x3900, 0x3a00, 0x3b00, 0x3c00, 0x3d00, - 0x3e00, 0x3f00, 0x4000, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700, - 0x4800, 0x4900, 0x4a00, 0x4b00, 0x4c00, 0x4d00, 0x4e00, 0x4f00, 0x5000, 0x5100, - 0x5200, 0x5300, 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5a00, 0x5b00, - 0x5c00, 0x5d00, 0x5e00, 0x5f00, 0x6000, 0x6100, 0x6200, 0x6300, 0x6400, 0x6500, - 0x6600, 0x6700, 
0x6800, 0x6900, 0x6a00, 0x6b00, 0x6c00, 0x6d00, 0x6e00, 0x6f00, - 0x7000, 0x7100, 0x7200, 0x7300, 0x7400, 0x7500, 0x7600, 0x7700, 0x7800, 0x7900, - 0x7a00, 0x7b00, 0x7c00, 0x7d00, 0x7e00, 0x7f00, 0x8000, 0x8100, 0x8200, 0x8300, - 0x8400, 0x8500, 0x8600, 0x8700, 0x8800, 0x8900, 0x8a00, 0x8b00, 0x8c00, 0x8d00, - 0x8e00, 0x8f00, 0x9000, 0x9100, 0x9200, 0x9300, 0x9400, 0x9500, 0x9600, 0x9700, - 0x9800, 0x9900, 0x9a00, 0x9b00, 0x9c00, 0x9d00, 0x9e00, 0x9f00, 0xa000, 0xa100, - 0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a, 0xa640, 0xa67f, - 0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7c2, 0xa7f5, 0xa803, 0xa807, 0xa80c, 0xa840, - 0xa882, 0xa8f2, 0xa8fd, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6, 0xa9fa, - 0xaa00, 0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0, 0xaaf2, - 0xab01, 0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xac00, 0xad00, - 0xae00, 0xaf00, 0xb000, 0xb100, 0xb200, 0xb300, 0xb400, 0xb500, 0xb600, 0xb700, - 0xb800, 0xb900, 0xba00, 0xbb00, 0xbc00, 0xbd00, 0xbe00, 0xbf00, 0xc000, 0xc100, - 0xc200, 0xc300, 0xc400, 0xc500, 0xc600, 0xc700, 0xc800, 0xc900, 0xca00, 0xcb00, - 0xcc00, 0xcd00, 0xce00, 0xcf00, 0xd000, 0xd100, 0xd200, 0xd300, 0xd400, 0xd500, - 0xd600, 0xd700, 0xd7b0, 0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, - 0xfb2a, 0xfb38, 0xfb40, 0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, - 0xfe70, 0xfe76, 0xff21, 0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda + 0x176e, 0x1780, 0x1820, 0x1880, 0x18b0, 0x1900, 0x1950, 0x1970, 0x1980, 0x19b0, + 0x1a00, 0x1a20, 0x1b05, 0x1b45, 0x1b83, 0x1bae, 0x1bba, 0x1c00, 0x1c4d, 0x1c5a, + 0x1c80, 0x1c90, 0x1cbd, 0x1ce9, 0x1cee, 0x1cf5, 0x1d00, 0x1e00, 0x1f00, 0x1f18, + 0x1f20, 0x1f48, 0x1f50, 0x1f5f, 0x1f80, 0x1fb6, 0x1fc2, 0x1fc6, 0x1fd0, 0x1fd6, + 0x1fe0, 0x1ff2, 0x1ff6, 0x2090, 0x210a, 0x2118, 0x212a, 0x213c, 0x2145, 0x2160, + 0x2c00, 0x2c30, 0x2c60, 0x2ceb, 0x2cf2, 0x2d00, 0x2d30, 0x2d80, 0x2da0, 0x2da8, + 0x2db0, 0x2db8, 0x2dc0, 0x2dc8, 0x2dd0, 0x2dd8, 0x3005, 
0x3021, 0x3031, 0x3038, + 0x3041, 0x309b, 0x30a1, 0x30fc, 0x3105, 0x3131, 0x31a0, 0x31f0, 0x3400, 0x3500, + 0x3600, 0x3700, 0x3800, 0x3900, 0x3a00, 0x3b00, 0x3c00, 0x3d00, 0x3e00, 0x3f00, + 0x4000, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, + 0x4a00, 0x4b00, 0x4c00, 0x4d00, 0x4e00, 0x4f00, 0x5000, 0x5100, 0x5200, 0x5300, + 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5a00, 0x5b00, 0x5c00, 0x5d00, + 0x5e00, 0x5f00, 0x6000, 0x6100, 0x6200, 0x6300, 0x6400, 0x6500, 0x6600, 0x6700, + 0x6800, 0x6900, 0x6a00, 0x6b00, 0x6c00, 0x6d00, 0x6e00, 0x6f00, 0x7000, 0x7100, + 0x7200, 0x7300, 0x7400, 0x7500, 0x7600, 0x7700, 0x7800, 0x7900, 0x7a00, 0x7b00, + 0x7c00, 0x7d00, 0x7e00, 0x7f00, 0x8000, 0x8100, 0x8200, 0x8300, 0x8400, 0x8500, + 0x8600, 0x8700, 0x8800, 0x8900, 0x8a00, 0x8b00, 0x8c00, 0x8d00, 0x8e00, 0x8f00, + 0x9000, 0x9100, 0x9200, 0x9300, 0x9400, 0x9500, 0x9600, 0x9700, 0x9800, 0x9900, + 0x9a00, 0x9b00, 0x9c00, 0x9d00, 0x9e00, 0x9f00, 0xa000, 0xa100, 0xa200, 0xa300, + 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a, 0xa640, 0xa67f, 0xa6a0, 0xa717, + 0xa722, 0xa78b, 0xa7c2, 0xa7f5, 0xa803, 0xa807, 0xa80c, 0xa840, 0xa882, 0xa8f2, + 0xa8fd, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6, 0xa9fa, 0xaa00, 0xaa40, + 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0, 0xaaf2, 0xab01, 0xab09, + 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xac00, 0xad00, 0xae00, 0xaf00, + 0xb000, 0xb100, 0xb200, 0xb300, 0xb400, 0xb500, 0xb600, 0xb700, 0xb800, 0xb900, + 0xba00, 0xbb00, 0xbc00, 0xbd00, 0xbe00, 0xbf00, 0xc000, 0xc100, 0xc200, 0xc300, + 0xc400, 0xc500, 0xc600, 0xc700, 0xc800, 0xc900, 0xca00, 0xcb00, 0xcc00, 0xcd00, + 0xce00, 0xcf00, 0xd000, 0xd100, 0xd200, 0xd300, 0xd400, 0xd500, 0xd600, 0xd700, + 0xd7b0, 0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a, 0xfb38, + 0xfb40, 0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70, 0xfe76, + 0xff21, 0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda }; /** - * Character lengths for 
the unicode letters. - * - * The characters covered by these intervals are from - * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl + * Character interval lengths for ID_Start. */ -static const uint8_t lit_unicode_letter_interval_lengths[] JERRY_ATTR_CONST_DATA = +static const uint8_t lit_unicode_id_start_interval_lengths[] JERRY_ATTR_CONST_DATA = { 0x0016, 0x001e, 0x00ff, 0x00c9, 0x000b, 0x0004, 0x0004, 0x0001, 0x0003, 0x0002, 0x0013, 0x0052, 0x008a, 0x00a5, 0x0025, 0x0028, 0x001a, 0x0003, 0x002a, 0x0001, @@ -95,17 +89,17 @@ static const uint8_t lit_unicode_letter_interval_lengths[] JERRY_ATTR_CONST_DATA 0x00ff, 0x004c, 0x0003, 0x0006, 0x0003, 0x0028, 0x0003, 0x0020, 0x0003, 0x0006, 0x0003, 0x000e, 0x0038, 0x0003, 0x0042, 0x000f, 0x0055, 0x0005, 0x00ff, 0x00ff, 0x006b, 0x0010, 0x0019, 0x004a, 0x000a, 0x000c, 0x0003, 0x0011, 0x0011, 0x000c, - 0x0002, 0x0033, 0x0058, 0x0004, 0x0021, 0x0045, 0x001e, 0x001d, 0x0004, 0x002b, - 0x0019, 0x0016, 0x0034, 0x002e, 0x0006, 0x001d, 0x0001, 0x002b, 0x0023, 0x0002, - 0x0023, 0x0008, 0x002a, 0x0002, 0x0003, 0x0005, 0x0001, 0x00bf, 0x00ff, 0x0015, - 0x0005, 0x0025, 0x0005, 0x0007, 0x001e, 0x0034, 0x0006, 0x0002, 0x0006, 0x0003, - 0x0005, 0x000c, 0x0002, 0x0006, 0x000c, 0x0009, 0x0004, 0x0003, 0x000a, 0x0003, - 0x0004, 0x0028, 0x002e, 0x002e, 0x0084, 0x0003, 0x0001, 0x0025, 0x0037, 0x0016, - 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0002, 0x0008, - 0x0004, 0x0004, 0x0055, 0x0002, 0x0059, 0x0003, 0x002a, 0x005d, 0x001f, 0x000f, + 0x0002, 0x0033, 0x0058, 0x0028, 0x0045, 0x001e, 0x001d, 0x0004, 0x002b, 0x0019, + 0x0016, 0x0034, 0x002e, 0x0006, 0x001d, 0x0001, 0x002b, 0x0023, 0x0002, 0x0023, + 0x0008, 0x002a, 0x0002, 0x0003, 0x0005, 0x0001, 0x00bf, 0x00ff, 0x0015, 0x0005, + 0x0025, 0x0005, 0x0007, 0x001e, 0x0034, 0x0006, 0x0002, 0x0006, 0x0003, 0x0005, + 0x000c, 0x0002, 0x0006, 0x000c, 0x0009, 0x0005, 0x000f, 0x0003, 0x0004, 0x0028, + 0x002e, 0x002e, 0x0084, 0x0003, 0x0001, 0x0025, 0x0037, 0x0016, 
0x0006, 0x0006, + 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0002, 0x0008, 0x0004, 0x0004, + 0x0055, 0x0004, 0x0059, 0x0003, 0x002a, 0x005d, 0x001f, 0x000f, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, - 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00bf, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x00ff, 0x00ff, 0x00ff, 0x00bf, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, @@ -113,29 +107,25 @@ static const uint8_t lit_unicode_letter_interval_lengths[] JERRY_ATTR_CONST_DATA 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, - 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00fc, 0x00ff, 0x00ff, - 0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001, 0x002e, 0x001e, - 0x004f, 0x0008, 0x0066, 0x0034, 0x0008, 0x000c, 0x0002, 0x0003, 0x0016, 0x0033, - 0x0031, 0x0005, 0x0001, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009, 0x0004, - 0x0028, 0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a, 0x0002, - 0x0005, 0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x000d, 0x0072, 0x00ff, 0x00ff, + 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00fc, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001, 0x002e, 0x001e, 0x004f, 0x0008, + 0x0066, 0x0034, 0x0008, 0x000c, 0x0002, 0x0003, 0x0016, 0x0033, 0x0031, 0x0005, + 0x0001, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009, 0x0004, 0x0028, 0x0002, + 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a, 0x0002, 0x0005, 0x0005, + 0x0005, 
0x0006, 0x0006, 0x002a, 0x000d, 0x0072, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, - 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, - 0x00ff, 0x00a3, 0x0016, 0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, - 0x000c, 0x0004, 0x0001, 0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, - 0x0004, 0x0086, 0x0019, 0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002 + 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00a3, + 0x0016, 0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c, 0x0004, + 0x0001, 0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004, 0x0086, + 0x0019, 0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002 }; /** - * Those unicode letter characters that are not inside any of - * the intervals specified in lit_unicode_letter_interval_sps array. - * - * The characters are from the following Unicode categories: - * Lu, Ll, Lt, Lm, Lo, Nl + * Non-interval characters for ID_Start. 
*/ -static const uint16_t lit_unicode_letter_chars[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_id_start_chars[] JERRY_ATTR_CONST_DATA = { 0x00aa, 0x00b5, 0x00ba, 0x02ec, 0x02ee, 0x037f, 0x0386, 0x038c, 0x0559, 0x06d5, 0x06ff, 0x0710, 0x07b1, 0x07fa, 0x081a, 0x0824, 0x0828, 0x093d, 0x0950, 0x09b2, @@ -144,18 +134,13 @@ static const uint16_t lit_unicode_letter_chars[] JERRY_ATTR_CONST_DATA = 0x0ea5, 0x0ebd, 0x0ec6, 0x0f00, 0x103f, 0x1061, 0x108e, 0x10c7, 0x10cd, 0x1258, 0x12c0, 0x17d7, 0x17dc, 0x18aa, 0x1aa7, 0x1cfa, 0x1f59, 0x1f5b, 0x1f5d, 0x1fbe, 0x2071, 0x207f, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x214e, 0x2d27, - 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4e00, 0xa8fb, 0xa9cf, 0xaa7a, 0xaab1, 0xaac0, - 0xaac2, 0xac00, 0xfb1d, 0xfb3e + 0x2d2d, 0x2d6f, 0xa8fb, 0xa9cf, 0xaa7a, 0xaab1, 0xaac0, 0xaac2, 0xfb1d, 0xfb3e }; /** - * Character interval starting points for non-letter character - * that can be used as a non-first character of an identifier. - * - * The characters covered by these intervals are from - * the following Unicode categories: Nd, Mn, Mc, Pc + * Character interval starting points for ID_Continue. 
*/ -static const uint16_t lit_unicode_non_letter_ident_part_interval_sps[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_id_continue_interval_starts[] JERRY_ATTR_CONST_DATA = { 0x0300, 0x0483, 0x0591, 0x05c1, 0x05c4, 0x0610, 0x064b, 0x06d6, 0x06df, 0x06e7, 0x06ea, 0x06f0, 0x0730, 0x07a6, 0x07c0, 0x07eb, 0x0816, 0x081b, 0x0825, 0x0829, @@ -167,8 +152,8 @@ static const uint16_t lit_unicode_non_letter_ident_part_interval_sps[] JERRY_ATT 0x0ce6, 0x0d00, 0x0d3b, 0x0d3e, 0x0d46, 0x0d4a, 0x0d62, 0x0d66, 0x0d81, 0x0dcf, 0x0dd8, 0x0de6, 0x0df2, 0x0e34, 0x0e47, 0x0e50, 0x0eb4, 0x0ec8, 0x0ed0, 0x0f18, 0x0f20, 0x0f3e, 0x0f71, 0x0f86, 0x0f8d, 0x0f99, 0x102b, 0x1040, 0x1056, 0x105e, - 0x1062, 0x1067, 0x1071, 0x1082, 0x108f, 0x135d, 0x1712, 0x1732, 0x1752, 0x1772, - 0x17b4, 0x17e0, 0x180b, 0x1810, 0x1885, 0x1920, 0x1930, 0x1946, 0x19d0, 0x1a17, + 0x1062, 0x1067, 0x1071, 0x1082, 0x108f, 0x135d, 0x1369, 0x1712, 0x1732, 0x1752, + 0x1772, 0x17b4, 0x17e0, 0x180b, 0x1810, 0x1920, 0x1930, 0x1946, 0x19d0, 0x1a17, 0x1a55, 0x1a60, 0x1a7f, 0x1a90, 0x1ab0, 0x1abf, 0x1b00, 0x1b34, 0x1b50, 0x1b6b, 0x1b80, 0x1ba1, 0x1bb0, 0x1be6, 0x1c24, 0x1c40, 0x1c50, 0x1cd0, 0x1cd4, 0x1cf7, 0x1dc0, 0x1dfb, 0x200c, 0x203f, 0x20d0, 0x20e5, 0x2cef, 0x2de0, 0x302a, 0x3099, @@ -179,13 +164,9 @@ static const uint16_t lit_unicode_non_letter_ident_part_interval_sps[] JERRY_ATT }; /** - * Character interval lengths for non-letter character - * that can be used as a non-first character of an identifier. - * - * The characters covered by these intervals are from - * the following Unicode categories: Nd, Mn, Mc, Pc + * Character interval lengths for ID_Continue. 
*/ -static const uint8_t lit_unicode_non_letter_ident_part_interval_lengths[] JERRY_ATTR_CONST_DATA = +static const uint8_t lit_unicode_id_continue_interval_lengths[] JERRY_ATTR_CONST_DATA = { 0x006f, 0x0004, 0x002c, 0x0001, 0x0001, 0x000a, 0x001e, 0x0006, 0x0005, 0x0001, 0x0003, 0x0009, 0x001a, 0x000a, 0x0009, 0x0008, 0x0003, 0x0008, 0x0002, 0x0004, @@ -197,8 +178,8 @@ static const uint8_t lit_unicode_non_letter_ident_part_interval_lengths[] JERRY_ 0x0009, 0x0003, 0x0001, 0x0006, 0x0002, 0x0003, 0x0001, 0x0009, 0x0002, 0x0005, 0x0007, 0x0009, 0x0001, 0x0006, 0x0007, 0x0009, 0x0008, 0x0005, 0x0009, 0x0001, 0x0009, 0x0001, 0x0013, 0x0001, 0x000a, 0x0023, 0x0013, 0x0009, 0x0003, 0x0002, - 0x0002, 0x0006, 0x0003, 0x000b, 0x000e, 0x0002, 0x0002, 0x0002, 0x0001, 0x0001, - 0x001f, 0x0009, 0x0002, 0x0009, 0x0001, 0x000b, 0x000b, 0x0009, 0x0009, 0x0004, + 0x0002, 0x0006, 0x0003, 0x000b, 0x000e, 0x0002, 0x0008, 0x0002, 0x0002, 0x0001, + 0x0001, 0x001f, 0x0009, 0x0002, 0x0009, 0x000b, 0x000b, 0x0009, 0x000a, 0x0004, 0x0009, 0x001c, 0x000a, 0x0009, 0x000d, 0x0001, 0x0004, 0x0010, 0x0009, 0x0008, 0x0002, 0x000c, 0x0009, 0x000d, 0x0013, 0x0009, 0x0009, 0x0002, 0x0014, 0x0002, 0x0039, 0x0004, 0x0001, 0x0001, 0x000c, 0x000b, 0x0002, 0x001f, 0x0005, 0x0001, @@ -209,45 +190,65 @@ static const uint8_t lit_unicode_non_letter_ident_part_interval_lengths[] JERRY_ }; /** - * Those non-letter characters that can be used as a non-first - * character of an identifier and not included in any of the intervals - * specified in lit_unicode_non_letter_ident_part_interval_sps array. - * - * The characters are from the following Unicode categories: - * Nd, Mn, Mc, Pc + * Non-interval characters for ID_Continue. 
*/ -static const uint16_t lit_unicode_non_letter_ident_part_chars[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_id_continue_chars[] JERRY_ATTR_CONST_DATA = { - 0x05bf, 0x05c7, 0x0670, 0x0711, 0x07fd, 0x09bc, 0x09d7, 0x09fe, 0x0a3c, 0x0a51, - 0x0a75, 0x0abc, 0x0b3c, 0x0b82, 0x0bd7, 0x0cbc, 0x0d57, 0x0dca, 0x0dd6, 0x0e31, - 0x0eb1, 0x0f35, 0x0f37, 0x0f39, 0x0fc6, 0x17dd, 0x18a9, 0x1ced, 0x1cf4, 0x2054, - 0x20e1, 0x2d7f, 0xa66f, 0xa802, 0xa806, 0xa80b, 0xa82c, 0xa9e5, 0xaa43, 0xaab0, - 0xaac1, 0xfb1e, 0xff3f + 0x00b7, 0x0387, 0x05bf, 0x05c7, 0x0670, 0x0711, 0x07fd, 0x09bc, 0x09d7, 0x09fe, + 0x0a3c, 0x0a51, 0x0a75, 0x0abc, 0x0b3c, 0x0b82, 0x0bd7, 0x0cbc, 0x0d57, 0x0dca, + 0x0dd6, 0x0e31, 0x0eb1, 0x0f35, 0x0f37, 0x0f39, 0x0fc6, 0x17dd, 0x18a9, 0x1ced, + 0x1cf4, 0x2054, 0x20e1, 0x2d7f, 0xa66f, 0xa802, 0xa806, 0xa80b, 0xa82c, 0xa9e5, + 0xaa43, 0xaab0, 0xaac1, 0xfb1e, 0xff3f }; +#if ENABLED (JERRY_ESNEXT) /** - * Unicode separator character interval starting points from Unicode category: Zs + * Character interval starting points for White_Space. */ -static const uint16_t lit_unicode_separator_char_interval_sps[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_white_space_interval_starts[] JERRY_ATTR_CONST_DATA = { 0x2000 }; /** - * Unicode separator character interval lengths from Unicode category: Zs + * Character interval lengths for White_Space. */ -static const uint8_t lit_unicode_separator_char_interval_lengths[] JERRY_ATTR_CONST_DATA = +static const uint8_t lit_unicode_white_space_interval_lengths[] JERRY_ATTR_CONST_DATA = +{ + 0x000a +}; + +/** + * Non-interval characters for White_Space. + */ +static const uint16_t lit_unicode_white_space_chars[] JERRY_ATTR_CONST_DATA = +{ + 0x00a0, 0x1680, 0x202f, 0x205f, 0x3000 +}; + +#else /* !ENABLED (JERRY_ESNEXT) */ +/** + * Character interval starting points for White_Space. 
+ */ +static const uint16_t lit_unicode_white_space_interval_starts[] JERRY_ATTR_CONST_DATA = +{ + 0x2000 +}; + +/** + * Character interval lengths for White_Space. + */ +static const uint8_t lit_unicode_white_space_interval_lengths[] JERRY_ATTR_CONST_DATA = { 0x000b }; /** - * Unicode separator characters that are not in the - * lit_unicode_separator_char_intervals array. - * - * Unicode category: Zs + * Non-interval characters for White_Space. */ -static const uint16_t lit_unicode_separator_chars[] JERRY_ATTR_CONST_DATA = +static const uint16_t lit_unicode_white_space_chars[] JERRY_ATTR_CONST_DATA = { 0x1680, 0x180e, 0x202f, 0x205f, 0x3000 }; + +#endif /* ENABLED (JERRY_ESNEXT) */ diff --git a/tests/jerry/es.next/string-upper-lower-case-conversion.js b/tests/jerry/es.next/string-upper-lower-case-conversion.js new file mode 100644 index 000000000..0f764b1ea --- /dev/null +++ b/tests/jerry/es.next/string-upper-lower-case-conversion.js @@ -0,0 +1,66 @@ +// Copyright JS Foundation and other contributors, http://js.foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +let start = 0x10000 +let end = 0x10FFFF + +const lower_expected = [66560, 66561, 66562, 66563, 66564, 66565, 66566, 66567, 66568, 66569, 66570, 66571, 66572, + 66573, 66574, 66575, 66576, 66577, 66578, 66579, 66580, 66581, 66582, 66583, 66584, 66585, + 66586, 66587, 66588, 66589, 66590, 66591, 66592, 66593, 66594, 66595, 66596, 66597, 66598, + 66599, 66736, 66737, 66738, 66739, 66740, 66741, 66742, 66743, 66744, 66745, 66746, 66747, + 66748, 66749, 66750, 66751, 66752, 66753, 66754, 66755, 66756, 66757, 66758, 66759, 66760, + 66761, 66762, 66763, 66764, 66765, 66766, 66767, 66768, 66769, 66770, 66771, 68736, 68737, + 68738, 68739, 68740, 68741, 68742, 68743, 68744, 68745, 68746, 68747, 68748, 68749, 68750, + 68751, 68752, 68753, 68754, 68755, 68756, 68757, 68758, 68759, 68760, 68761, 68762, 68763, + 68764, 68765, 68766, 68767, 68768, 68769, 68770, 68771, 68772, 68773, 68774, 68775, 68776, + 68777, 68778, 68779, 68780, 68781, 68782, 68783, 68784, 68785, 68786, 71840, 71841, 71842, + 71843, 71844, 71845, 71846, 71847, 71848, 71849, 71850, 71851, 71852, 71853, 71854, 71855, + 71856, 71857, 71858, 71859, 71860, 71861, 71862, 71863, 71864, 71865, 71866, 71867, 71868, + 71869, 71870, 71871, 93760, 93761, 93762, 93763, 93764, 93765, 93766, 93767, 93768, 93769, + 93770, 93771, 93772, 93773, 93774, 93775, 93776, 93777, 93778, 93779, 93780, 93781, 93782, + 93783, 93784, 93785, 93786, 93787, 93788, 93789, 93790, 93791, 125184, 125185, 125186, 125187, + 125188, 125189, 125190, 125191, 125192, 125193, 125194, 125195, 125196, 125197, 125198, 125199, + 125200, 125201, 125202, 125203, 125204, 125205, 125206, 125207, 125208, 125209, 125210, 125211, + 125212, 125213, 125214, 125215, 125216, 125217]; + +const upper_expected = [66600, 66601, 66602, 66603, 66604, 66605, 66606, 66607, 66608, 66609, 66610, 66611, 66612, + 66613, 66614, 66615, 66616, 66617, 66618, 66619, 66620, 66621, 66622, 66623, 66624, 66625, + 66626, 66627, 66628, 66629, 66630, 66631, 66632, 66633, 66634, 66635, 
66636, 66637, 66638, + 66639, 66776, 66777, 66778, 66779, 66780, 66781, 66782, 66783, 66784, 66785, 66786, 66787, + 66788, 66789, 66790, 66791, 66792, 66793, 66794, 66795, 66796, 66797, 66798, 66799, 66800, + 66801, 66802, 66803, 66804, 66805, 66806, 66807, 66808, 66809, 66810, 66811, 68800, 68801, + 68802, 68803, 68804, 68805, 68806, 68807, 68808, 68809, 68810, 68811, 68812, 68813, 68814, + 68815, 68816, 68817, 68818, 68819, 68820, 68821, 68822, 68823, 68824, 68825, 68826, 68827, + 68828, 68829, 68830, 68831, 68832, 68833, 68834, 68835, 68836, 68837, 68838, 68839, 68840, + 68841, 68842, 68843, 68844, 68845, 68846, 68847, 68848, 68849, 68850, 71872, 71873, 71874, + 71875, 71876, 71877, 71878, 71879, 71880, 71881, 71882, 71883, 71884, 71885, 71886, 71887, + 71888, 71889, 71890, 71891, 71892, 71893, 71894, 71895, 71896, 71897, 71898, 71899, 71900, + 71901, 71902, 71903, 93792, 93793, 93794, 93795, 93796, 93797, 93798, 93799, 93800, 93801, + 93802, 93803, 93804, 93805, 93806, 93807, 93808, 93809, 93810, 93811, 93812, 93813, 93814, + 93815, 93816, 93817, 93818, 93819, 93820, 93821, 93822, 93823, 125218, 125219, 125220, 125221, + 125222, 125223, 125224, 125225, 125226, 125227, 125228, 125229, 125230, 125231, 125232, 125233, + 125234, 125235, 125236, 125237, 125238, 125239, 125240, 125241, 125242, 125243, 125244, 125245, + 125246, 125247, 125248, 125249, 125250, 125251]; + +for (let iter of lower_expected) { + let cp = String.fromCodePoint(iter); + assert(cp !== cp.toLowerCase()); +} + +for (let iter of upper_expected) { + let cp = String.fromCodePoint(iter); + assert(cp !== cp.toUpperCase()); +} + +assert("\ud801A".toLowerCase() === "\ud801a"); diff --git a/tests/jerry/es.next/unicode-escape-identifiers.js b/tests/jerry/es.next/unicode-escape-identifiers.js new file mode 100644 index 000000000..5bcdfe2c1 --- /dev/null +++ b/tests/jerry/es.next/unicode-escape-identifiers.js @@ -0,0 +1,32 @@ +// Copyright JS Foundation and other contributors, http://js.foundation +// +// 
Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +var \u{102C0} = 2; +assert(\u{102C0} === 2); + +var o1 = { \u{102C0} : 3 }; +assert(o1['\ud800\udec0'] === 3); + +var o2 = { '\ud800\udec0' : 4 }; +assert(o2.\u{102C0} === 4); + +try { + eval('var ⸯ'); + assert(false); +} catch(e) { + assert(e instanceof SyntaxError); +} + +var 𐋀 = 5; +assert(𐋀 === 5); diff --git a/tests/jerry/es5.1/string-upper-lower-case-conversion.js b/tests/jerry/es5.1/string-upper-lower-case-conversion.js new file mode 100644 index 000000000..c931abb28 --- /dev/null +++ b/tests/jerry/es5.1/string-upper-lower-case-conversion.js @@ -0,0 +1,20 @@ +// Copyright JS Foundation and other contributors, http://js.foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Although codepoint 0x10400 and 0x10428 are an upper-lowercase pair, +// we must not do their conversion in JavaScript. We must also ignore +// stray surrogates.
+ +assert ("\ud801\ud801\udc00\udc00".toLowerCase() == "\ud801\ud801\udc00\udc00"); +assert ("\ud801\ud801\udc28\udc28".toUpperCase() == "\ud801\ud801\udc28\udc28"); diff --git a/tests/jerry/string-upper-lower-case-conversion.js b/tests/jerry/string-upper-lower-case-conversion.js index 75a6dea92..1e592e86b 100644 --- a/tests/jerry/string-upper-lower-case-conversion.js +++ b/tests/jerry/string-upper-lower-case-conversion.js @@ -84,13 +84,6 @@ assert ("0123456789abcdefghijklmnopqrstuvwxzyABCDEFGHIJKLMNOPQRSTUVWXYZ".toLower assert ("0123456789abcdefghijklmnopqrstuvwxzyABCDEFGHIJKLMNOPQRSTUVWXYZ".toUpperCase() == "0123456789ABCDEFGHIJKLMNOPQRSTUVWXZYABCDEFGHIJKLMNOPQRSTUVWXYZ"); -// Although codepoint 0x10400 and 0x10428 are an upper-lowercase pair, -// we must not do their conversion in JavaScript. We must also ignore -// stray surrogates. - -assert ("\ud801\ud801\udc00\udc00".toLowerCase() == "\ud801\ud801\udc00\udc00"); -assert ("\ud801\ud801\udc28\udc28".toUpperCase() == "\ud801\ud801\udc28\udc28"); - // Conversion of non-string objects. 
assert (String.prototype.toUpperCase.call(true) == "TRUE"); diff --git a/tests/test262-es6-excludelist.xml b/tests/test262-es6-excludelist.xml index 9907772f0..87dfc14df 100644 --- a/tests/test262-es6-excludelist.xml +++ b/tests/test262-es6-excludelist.xml @@ -123,11 +123,7 @@ - - - - @@ -377,4 +373,16 @@ ES2019 change: catch without parameter is allowed + Unicode 13: 0x180E is no longer whitespace character + Unicode 13: 0x180E is no longer whitespace character + Unicode 13: 0x180E is no longer whitespace character + Unicode 13: 0x180E is no longer whitespace character + Unicode 13: 0x180E is no longer whitespace character + Unicode 13: 0x180E is no longer whitespace character + Unicode 13: 0x180E is no longer whitespace character + Unicode 13: 0x180E is no longer whitespace character + Unicode 13: 0x180E is no longer whitespace character + Unicode 13: 0x180E is no longer whitespace character + Unicode 13: 0x180E is no longer whitespace character + Unicode 13: 0x180E is no longer whitespace character diff --git a/tools/gen-unicode.py b/tools/gen-unicode.py index 47624df8e..804c0ff73 100755 --- a/tools/gen-unicode.py +++ b/tools/gen-unicode.py @@ -17,10 +17,10 @@ from __future__ import print_function import argparse -import bisect import csv import itertools import os +import re import warnings from gen_c_source import LICENSE, format_code @@ -28,268 +28,286 @@ from settings import PROJECT_DIR RANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h') +RANGES_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges-sup.inc.h') CONVERSIONS_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions.inc.h') +CONVERSIONS_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions-sup.inc.h') +UNICODE_PLANE_TYPE_BASIC = 0 +UNICODE_PLANE_TYPE_SUPPLEMENTARY = 1 + +# For ES5.1 profile we use a predefined subset of whitespace characters +ES5_1_WHITE_SPACE_UNITS = [0x1680, 0x180e] 
+ES5_1_WHITE_SPACE_UNITS.extend(range(0x2000, 0x200c)) +ES5_1_WHITE_SPACE_UNITS.extend([0x202f, 0x205f, 0x3000]) # common code generation +class UnicodeBasicSource(object): + # pylint: disable=too-many-instance-attributes + def __init__(self, filepath, character_type="uint16_t", length_type="uint8_t"): + self._filepath = filepath + self._header = [LICENSE, ""] + self._data = [] + self._table_name_suffix = "" + self.character_type = character_type + self.length_type = length_type -class UniCodeSource(object): - def __init__(self, filepath): - self.__filepath = filepath - self.__header = [LICENSE, ""] - self.__data = [] + self._range_table_types = [self.character_type, + self.length_type, + self.character_type] + self._range_table_names = ["interval_starts", + "interval_lengths", + "chars"] + self._range_table_descriptions = ["Character interval starting points for", + "Character interval lengths for", + "Non-interval characters for"] + + self._conversion_range_types = [self.character_type, + self.length_type] + self._conversion_range_names = ["ranges", + "range_lengths"] def complete_header(self, completion): - self.__header.append(completion) - self.__header.append("") # for an extra empty line + self._header.append(completion) + self._header.append("") # for an extra empty line - def add_table(self, table, table_name, table_type, table_descr): - self.__data.append(table_descr) - self.__data.append("static const %s lit_%s[] JERRY_ATTR_CONST_DATA =" % (table_type, table_name)) - self.__data.append("{") - self.__data.append(format_code(table, 1)) - self.__data.append("};") - self.__data.append("") # for an extra empty line + def add_whitepace_range(self, category, categorizer, units): + self._data.append("#if ENABLED (JERRY_ESNEXT)") + self.add_range(category, categorizer.create_tables(units)) + self._data.append("#else /* !ENABLED (JERRY_ESNEXT) */") + self.add_range(category, categorizer.create_tables(ES5_1_WHITE_SPACE_UNITS)) + self._data.append("#endif /* ENABLED 
(JERRY_ESNEXT) */\n") + + def add_range(self, category, tables): + idx = 0 + for table in tables: + self.add_table(table, + "/**\n * %s %s.\n */" % (self._range_table_descriptions[idx], category), + self._range_table_types[idx], + category, + self._range_table_names[idx]) + idx += 1 + + def add_conversion_range(self, category, tables, descriptions): + self.add_named_conversion_range(category, tables, self._conversion_range_names, descriptions) + + def add_named_conversion_range(self, category, tables, table_names, descriptions): + idx = 0 + for table in tables: + self.add_table(table, + descriptions[idx], + self._conversion_range_types[idx], + category, + table_names[idx]) + idx += 1 + + def add_table(self, table, description, table_type, category, table_name): + if table and sum(table) != 0: + self._data.append(description) + self._data.append("static const %s lit_unicode_%s%s%s[] JERRY_ATTR_CONST_DATA =" + % (table_type, + category.lower(), + "_" + table_name if table_name else "", + self._table_name_suffix)) + self._data.append("{") + self._data.append(format_code(table, 1, 6 if self._table_name_suffix else 4)) + self._data.append("};") + self._data.append("") # for an extra empty line def generate(self): - with open(self.__filepath, 'w') as generated_source: - generated_source.write("\n".join(self.__header)) - generated_source.write("\n".join(self.__data)) + with open(self._filepath, 'w') as generated_source: + generated_source.write("\n".join(self._header)) + generated_source.write("\n".join(self._data)) -class UnicodeCategorizer(object): + +class UnicodeSupplementarySource(UnicodeBasicSource): + def __init__(self, filepath): + UnicodeBasicSource.__init__(self, filepath, "uint32_t", "uint16_t") + self._table_name_suffix = "_sup" + + def add_whitepace_range(self, category, categorizer, units): + self.add_range(category, categorizer.create_tables(units)) + +class UnicodeBasicCategorizer(object): def __init__(self): - # unicode categories: Lu Ll Lt Mn Mc Me Nd Nl 
No Zs Zl Zp Cc Cf Cs - # Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So - # letter: Lu Ll Lt Lm Lo Nl - # non-letter-indent-part: - # digit: Nd - # punctuation mark: Mn Mc - # connector punctuation: Pc - # separators: Zs - self._unicode_categories = { - 'letters_category' : ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"], - 'non_letters_category' : ["Nd", "Mn", "Mc", "Pc"], - 'separators_category' : ["Zs"] - } + self._length_limit = 0xff + self.extra_id_continue_units = set([0x200C, 0x200D]) - self._categories = { - 'letters' : [], - 'non_letters' : [], - 'separators' : [] - } + #pylint: disable=no-self-use + def in_range(self, i): + return i >= 0x80 and i < 0x10000 - def _store_by_category(self, unicode_id, category): + def _group_ranges(self, units): """ - Store the given unicode_id by its category + Convert an increasing list of integers into a range list + :return: List of ranges. """ - for target_category in self._categories: - if category in self._unicode_categories[target_category + '_category']: - self._categories[target_category].append(unicode_id) + for _, group in itertools.groupby(enumerate(units), lambda q: (q[1] - q[0])): + group = list(group) + yield group[0][1], group[-1][1] - def read_categories(self, unicode_data_file): + def create_tables(self, units): """ - Read the corresponding unicode values and store them in category lists. - - :return: List of letters, non_letter and separators. + Split list of ranges into intervals and single char lists. 
+ :return: A tuple containing the following info: + - list of interval starting points + - list of interval lengths + - list of single chars """ - range_start_id = 0 + interval_sps = [] + interval_lengths = [] + chars = [] + for element in self._group_ranges(units): + interval_length = element[1] - element[0] + if interval_length == 0: + chars.append(element[0]) + elif interval_length > self._length_limit: + for i in range(element[0], element[1], self._length_limit + 1): + length = min(self._length_limit, element[1] - i) + interval_sps.append(i) + interval_lengths.append(length) + else: + interval_sps.append(element[0]) + interval_lengths.append(interval_length) + + return interval_sps, interval_lengths, chars + + def read_units(self, file_path, categories, subcategories=None): + """ + Read the Unicode Derived Core Properties file and extract the ranges + for the given categories. + + :param file_path: Path to the Unicode "DerivedCoreProperties.txt" file. + :param categories: A list of category strings to extract from the Unicode file. + :param subcategories: A list of subcategory strings to restrict categories. + :return: A dictionary each string from the :param categories: is a key and for each + key list of code points are stored. + """ + # Create a dictionary in the format: { category[0]: [ ], ..., category[N]: [ ] } + units = {} + for category in categories: + units[category] = [] + + # Formats to match: + # ; # + # .. 
; # + matcher = r"(?P<start>[\dA-F]+)(?:\.\.(?P<end>[\dA-F]+))?\s+; (?P<category>[\w]+) # (?P<subcategory>[\w&]{2})" + + with open(file_path, "r") as src_file: + for line in src_file: + match = re.match(matcher, line) + + if (match + and match.group("category") in categories + and (not subcategories or match.group("subcategory") in subcategories)): + start = int(match.group("start"), 16) + # if no "end" found use the "start" + end = int(match.group("end") or match.group("start"), 16) + + matching_code_points = [ + code_point for code_point in range(start, end + 1) if self.in_range(code_point) + ] + + units[match.group("category")].extend(matching_code_points) + + return units + + def read_case_mappings(self, unicode_data_file, special_casing_file): + """ + Read the corresponding unicode values of lower and upper case letters and store these in tables. + + :param unicode_data_file: Contains the default case mappings (one-to-one mappings). + :param special_casing_file: Contains additional informative case mappings that are either not one-to-one + or which are context-sensitive. + :return: Upper and lower case mappings.
+ """ + + lower_case_mapping = {} + upper_case_mapping = {} + + # Add one-to-one mappings with open(unicode_data_file) as unicode_data: - for line in csv.reader(unicode_data, delimiter=';'): - unicode_id = int(line[0], 16) + reader = csv.reader(unicode_data, delimiter=';') - # Skip supplementary planes and ascii chars - if unicode_id >= 0x10000 or unicode_id < 128: + for line in reader: + letter_id = int(line[0], 16) + + if not self.in_range(letter_id): continue - category = line[2] + capital_letter = line[12] + small_letter = line[13] - if range_start_id != 0: - while range_start_id <= unicode_id: - self._store_by_category(range_start_id, category) - range_start_id += 1 - range_start_id = 0 + if capital_letter: + upper_case_mapping[letter_id] = parse_unicode_sequence(capital_letter) + + if small_letter: + lower_case_mapping[letter_id] = parse_unicode_sequence(small_letter) + + # Update the conversion tables with the special cases + with open(special_casing_file) as special_casing: + reader = csv.reader(special_casing, delimiter=';') + + for line in reader: + # Skip comment sections and empty lines + if not line or line[0].startswith('#'): continue - if line[1].startswith('<'): - # Save the start position of the range - range_start_id = unicode_id + # Replace '#' character with empty string + for idx, fragment in enumerate(line): + if fragment.find('#') >= 0: + line[idx] = '' - self._store_by_category(unicode_id, category) + letter_id = int(line[0], 16) + condition_list = line[4] - # This separator char is handled separatly - separators = self._categories['separators'] - non_breaking_space = 0x00A0 - if non_breaking_space in separators: - separators.remove(int(non_breaking_space)) + if not self.in_range(letter_id) or condition_list: + continue - # These separator chars are not in the unicode data file or not in Zs category - mongolian_vowel_separator = 0x180E - medium_mathematical_space = 0x205F - zero_width_space = 0x200B + small_letter = 
parse_unicode_sequence(line[1]) + capital_letter = parse_unicode_sequence(line[3]) - if mongolian_vowel_separator not in separators: - bisect.insort(separators, int(mongolian_vowel_separator)) - if medium_mathematical_space not in separators: - bisect.insort(separators, int(medium_mathematical_space)) - if zero_width_space not in separators: - bisect.insort(separators, int(zero_width_space)) + lower_case_mapping[letter_id] = small_letter + upper_case_mapping[letter_id] = capital_letter - # https://www.ecma-international.org/ecma-262/5.1/#sec-7.1 format-control characters - non_letters = self._categories['non_letters'] - zero_width_non_joiner = 0x200C - zero_width_joiner = 0x200D + return lower_case_mapping, upper_case_mapping - bisect.insort(non_letters, int(zero_width_non_joiner)) - bisect.insort(non_letters, int(zero_width_joiner)) +class UnicodeSupplementaryCategorizer(UnicodeBasicCategorizer): + def __init__(self): + UnicodeBasicCategorizer.__init__(self) + self._length_limit = 0xffff + self.extra_id_continue_units = set() - return self._categories['letters'], self._categories['non_letters'], self._categories['separators'] + def in_range(self, i): + return i >= 0x10000 - -def group_ranges(i): - """ - Convert an increasing list of integers into a range list - - :return: List of ranges. - """ - for _, group in itertools.groupby(enumerate(i), lambda q: (q[1] - q[0])): - group = list(group) - yield group[0][1], group[-1][1] - - -def split_list(category_list): - """ - Split list of ranges into intervals and single char lists. 
- - :return: List of interval starting points, interval lengths and single chars - """ - - interval_sps = [] - interval_lengths = [] - chars = [] - - for element in category_list: - interval_length = element[1] - element[0] - if interval_length == 0: - chars.append(element[0]) - elif interval_length > 255: - for i in range(element[0], element[1], 256): - length = 255 if (element[1] - i > 255) else (element[1] - i) - interval_sps.append(i) - interval_lengths.append(length) - else: - interval_sps.append(element[0]) - interval_lengths.append(element[1] - element[0]) - - return interval_sps, interval_lengths, chars - - -def generate_ranges(script_args): - categorizer = UnicodeCategorizer() - letters, non_letters, separators = categorizer.read_categories(script_args.unicode_data) - - letter_tables = split_list(list(group_ranges(letters))) - non_letter_tables = split_list(list(group_ranges(non_letters))) - separator_tables = split_list(list(group_ranges(separators))) - - c_source = UniCodeSource(RANGES_C_SOURCE) +def generate_ranges(script_args, plane_type): + if plane_type == UNICODE_PLANE_TYPE_SUPPLEMENTARY: + c_source = UnicodeSupplementarySource(RANGES_SUP_C_SOURCE) + categorizer = UnicodeSupplementaryCategorizer() + else: + c_source = UnicodeBasicSource(RANGES_C_SOURCE) + categorizer = UnicodeBasicCategorizer() header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__), - " * from %s. Do not edit! */" % os.path.basename(script_args.unicode_data), + " * from %s. Do not edit! 
*/" % os.path.basename(script_args.derived_core_properties), ""] c_source.complete_header("\n".join(header_completion)) - c_source.add_table(letter_tables[0], - "unicode_letter_interval_sps", - "uint16_t", - ("/**\n" - " * Character interval starting points for the unicode letters.\n" - " *\n" - " * The characters covered by these intervals are from\n" - " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n" - " */")) + units = categorizer.read_units(script_args.derived_core_properties, ["ID_Start", "ID_Continue"]) - c_source.add_table(letter_tables[1], - "unicode_letter_interval_lengths", - "uint8_t", - ("/**\n" - " * Character lengths for the unicode letters.\n" - " *\n" - " * The characters covered by these intervals are from\n" - " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n" - " */")) + units["ID_Continue"] = sorted(set(units["ID_Continue"]).union(categorizer.extra_id_continue_units) + - set(units["ID_Start"])) - c_source.add_table(letter_tables[2], - "unicode_letter_chars", - "uint16_t", - ("/**\n" - " * Those unicode letter characters that are not inside any of\n" - " * the intervals specified in lit_unicode_letter_interval_sps array.\n" - " *\n" - " * The characters are from the following Unicode categories:\n" - " * Lu, Ll, Lt, Lm, Lo, Nl\n" - " */")) + for category, unit in units.items(): + c_source.add_range(category, categorizer.create_tables(unit)) - c_source.add_table(non_letter_tables[0], - "unicode_non_letter_ident_part_interval_sps", - "uint16_t", - ("/**\n" - " * Character interval starting points for non-letter character\n" - " * that can be used as a non-first character of an identifier.\n" - " *\n" - " * The characters covered by these intervals are from\n" - " * the following Unicode categories: Nd, Mn, Mc, Pc\n" - " */")) + white_space_units = categorizer.read_units(script_args.prop_list, ["White_Space"], ["Zs"])["White_Space"] - c_source.add_table(non_letter_tables[1], - 
"unicode_non_letter_ident_part_interval_lengths", - "uint8_t", - ("/**\n" - " * Character interval lengths for non-letter character\n" - " * that can be used as a non-first character of an identifier.\n" - " *\n" - " * The characters covered by these intervals are from\n" - " * the following Unicode categories: Nd, Mn, Mc, Pc\n" - " */")) - - c_source.add_table(non_letter_tables[2], - "unicode_non_letter_ident_part_chars", - "uint16_t", - ("/**\n" - " * Those non-letter characters that can be used as a non-first\n" - " * character of an identifier and not included in any of the intervals\n" - " * specified in lit_unicode_non_letter_ident_part_interval_sps array.\n" - " *\n" - " * The characters are from the following Unicode categories:\n" - " * Nd, Mn, Mc, Pc\n" - " */")) - - c_source.add_table(separator_tables[0], - "unicode_separator_char_interval_sps", - "uint16_t", - ("/**\n" - " * Unicode separator character interval starting points from Unicode category: Zs\n" - " */")) - - c_source.add_table(separator_tables[1], - "unicode_separator_char_interval_lengths", - "uint8_t", - ("/**\n" - " * Unicode separator character interval lengths from Unicode category: Zs\n" - " */")) - - c_source.add_table(separator_tables[2], - "unicode_separator_chars", - "uint16_t", - ("/**\n" - " * Unicode separator characters that are not in the\n" - " * lit_unicode_separator_char_intervals array.\n" - " *\n" - " * Unicode category: Zs\n" - " */")) + c_source.add_whitepace_range("White_Space", categorizer, white_space_units) c_source.generate() @@ -320,70 +338,6 @@ def parse_unicode_sequence(raw_data): return result - -def read_case_mappings(unicode_data_file, special_casing_file): - """ - Read the corresponding unicode values of lower and upper case letters and store these in tables. - - :param unicode_data_file: Contains the default case mappings (one-to-one mappings). 
- :param special_casing_file: Contains additional informative case mappings that are either not one-to-one - or which are context-sensitive. - :return: Upper and lower case mappings. - """ - - lower_case_mapping = {} - upper_case_mapping = {} - - # Add one-to-one mappings - with open(unicode_data_file) as unicode_data: - unicode_data_reader = csv.reader(unicode_data, delimiter=';') - - for line in unicode_data_reader: - letter_id = int(line[0], 16) - - # Skip supplementary planes and ascii chars - if letter_id >= 0x10000 or letter_id < 128: - continue - - capital_letter = line[12] - small_letter = line[13] - - if capital_letter: - upper_case_mapping[letter_id] = parse_unicode_sequence(capital_letter) - - if small_letter: - lower_case_mapping[letter_id] = parse_unicode_sequence(small_letter) - - # Update the conversion tables with the special cases - with open(special_casing_file) as special_casing: - special_casing_reader = csv.reader(special_casing, delimiter=';') - - for line in special_casing_reader: - # Skip comment sections and empty lines - if not line or line[0].startswith('#'): - continue - - # Replace '#' character with empty string - for idx, i in enumerate(line): - if i.find('#') >= 0: - line[idx] = '' - - letter_id = int(line[0], 16) - condition_list = line[4] - - # Skip supplementary planes, ascii chars, and condition_list - if letter_id >= 0x10000 or letter_id < 128 or condition_list: - continue - - small_letter = parse_unicode_sequence(line[1]) - capital_letter = parse_unicode_sequence(line[3]) - - lower_case_mapping[letter_id] = small_letter - upper_case_mapping[letter_id] = capital_letter - - return lower_case_mapping, upper_case_mapping - - def extract_ranges(letter_case, reverse_letter_case=None): """ Extract ranges from case mappings @@ -675,27 +629,13 @@ def calculate_conversion_distance(letter_case, letter_id): return ord(letter_case[letter_id]) - letter_id -def generate_conversions(script_args): - # Read the corresponding unicode values of 
lower and upper case letters and store these in tables - case_mappings = read_case_mappings(script_args.unicode_data, script_args.special_casing) - lower_case = case_mappings[0] - upper_case = case_mappings[1] - - character_case_ranges = extract_ranges(lower_case, upper_case) - character_pair_ranges = extract_character_pair_ranges(lower_case, upper_case) - character_pairs = extract_character_pairs(lower_case, upper_case) - upper_case_special_ranges = extract_special_ranges(upper_case) - lower_case_ranges = extract_ranges(lower_case) - lower_case_conversions = extract_conversions(lower_case) - upper_case_conversions = extract_conversions(upper_case) - - if lower_case: - warnings.warn('Not all elements extracted from the lowercase table!') - if upper_case: - warnings.warn('Not all elements extracted from the uppercase table!') - - # Generate conversions output - c_source = UniCodeSource(CONVERSIONS_C_SOURCE) +def generate_conversions(script_args, plane_type): + if plane_type == UNICODE_PLANE_TYPE_SUPPLEMENTARY: + c_source = UnicodeSupplementarySource(CONVERSIONS_SUP_C_SOURCE) + categorizer = UnicodeSupplementaryCategorizer() + else: + c_source = UnicodeBasicSource(CONVERSIONS_C_SOURCE) + categorizer = UnicodeBasicCategorizer() unicode_file = os.path.basename(script_args.unicode_data) spec_casing_file = os.path.basename(script_args.special_casing) @@ -706,75 +646,58 @@ def generate_conversions(script_args): c_source.complete_header("\n".join(header_completion)) - c_source.add_table(character_case_ranges[0], - "character_case_ranges", - "uint16_t", - ("/* Contains start points of character case ranges " - "(these are bidirectional conversions). 
*/")) + # Read the corresponding unicode values of lower and upper case letters and store these in tables + lower_case, upper_case = categorizer.read_case_mappings(script_args.unicode_data, script_args.special_casing) - c_source.add_table(character_case_ranges[1], - "character_case_range_lengths", - "uint8_t", - "/* Interval lengths of start points in `character_case_ranges` table. */") + c_source.add_conversion_range("character_case", + extract_ranges(lower_case, upper_case), + [("/* Contains start points of character case ranges " + "(these are bidirectional conversions). */"), + "/* Interval lengths of start points in `character_case_ranges` table. */"]) + c_source.add_conversion_range("character_pair", + extract_character_pair_ranges(lower_case, upper_case), + ["/* Contains the start points of bidirectional conversion ranges. */", + "/* Interval lengths of start points in `character_pair_ranges` table. */"]) - c_source.add_table(character_pair_ranges[0], - "character_pair_ranges", - "uint16_t", - "/* Contains the start points of bidirectional conversion ranges. */") - - c_source.add_table(character_pair_ranges[1], - "character_pair_range_lengths", - "uint8_t", - "/* Interval lengths of start points in `character_pair_ranges` table. */") - - c_source.add_table(character_pairs, + c_source.add_table(extract_character_pairs(lower_case, upper_case), + "/* Contains lower/upper case bidirectional conversion pairs. */", + c_source.character_type, "character_pairs", - "uint16_t", - "/* Contains lower/upper case bidirectional conversion pairs. 
*/") + "") - c_source.add_table(upper_case_special_ranges[0], - "upper_case_special_ranges", - "uint16_t", - ("/* Contains start points of one-to-two uppercase ranges where the second character\n" - " * is always the same.\n" - " */")) + c_source.add_conversion_range("upper_case_special", + extract_special_ranges(upper_case), + [("/* Contains start points of one-to-two uppercase ranges where the " + "second character\n" + " * is always the same.\n" + " */"), + "/* Interval lengths for start points in `upper_case_special_ranges` table. */"]) - c_source.add_table(upper_case_special_ranges[1], - "upper_case_special_range_lengths", - "uint8_t", - "/* Interval lengths for start points in `upper_case_special_ranges` table. */") + c_source.add_conversion_range("lower_case", + extract_ranges(lower_case), + ["/* Contains start points of lowercase ranges. */", + "/* Interval lengths for start points in `lower_case_ranges` table. */"]) - c_source.add_table(lower_case_ranges[0], - "lower_case_ranges", - "uint16_t", - "/* Contains start points of lowercase ranges. */") + c_source.add_named_conversion_range("lower_case", + extract_conversions(lower_case), + ["conversions", "conversion_counters"], + [("/* The remaining lowercase conversions. The lowercase variant can " + "be one-to-three character long. */"), + ("/* Number of one-to-one, one-to-two, and one-to-three lowercase " + "conversions. */")]) - c_source.add_table(lower_case_ranges[1], - "lower_case_range_lengths", - "uint8_t", - "/* Interval lengths for start points in `lower_case_ranges` table. */") + c_source.add_named_conversion_range("upper_case", + extract_conversions(upper_case), + ["conversions", "conversion_counters"], + [("/* The remaining uppercase conversions. The uppercase variant can " + "be one-to-three character long. */"), + ("/* Number of one-to-one, one-to-two, and one-to-three uppercase " + "conversions. 
*/")]) - c_source.add_table(lower_case_conversions[0], - "lower_case_conversions", - "uint16_t", - ("/* The remaining lowercase conversions. The lowercase variant can " - "be one-to-three character long. */")) - - c_source.add_table(lower_case_conversions[1], - "lower_case_conversion_counters", - "uint8_t", - "/* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */") - - c_source.add_table(upper_case_conversions[0], - "upper_case_conversions", - "uint16_t", - ("/* The remaining uppercase conversions. The uppercase variant can " - "be one-to-three character long. */")) - - c_source.add_table(upper_case_conversions[1], - "upper_case_conversion_counters", - "uint8_t", - "/* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. */") + if lower_case: + warnings.warn('Not all elements extracted from the lowercase table!') + if upper_case: + warnings.warn('Not all elements extracted from the uppercase table!') c_source.generate() @@ -783,29 +706,37 @@ def generate_conversions(script_args): def main(): - parser = argparse.ArgumentParser(description='lit-unicode-{conversions,ranges}.inc.h generator', + parser = argparse.ArgumentParser(description='lit-unicode-{conversions,ranges}-{sup}.inc.h generator', epilog=''' - The input files (UnicodeData.txt, SpecialCasing.txt) + The input files: + - UnicodeData.txt + - SpecialCasing.txt + - DerivedCoreProperties.txt + - PropList.txt must be retrieved from http://www.unicode.org/Public/<version>/ucd/. The last known good version is 13.0.0. ''') + def check_file(path): + if not os.path.isfile(path) or not os.access(path, os.R_OK): + raise argparse.ArgumentTypeError('The %s file is missing or not readable!'
% path) + return path parser.add_argument('--unicode-data', metavar='FILE', action='store', required=True, - help='specify the unicode data file') + type=check_file, help='specify the unicode data file') parser.add_argument('--special-casing', metavar='FILE', action='store', required=True, - help='specify the special casing file') + type=check_file, help='specify the special casing file') + parser.add_argument('--prop-list', metavar='FILE', action='store', required=True, + type=check_file, help='specify the prop list file') + parser.add_argument('--derived-core-properties', metavar='FILE', action='store', required=True, + type=check_file, help='specify the DerivedCoreProperties file') script_args = parser.parse_args() - if not os.path.isfile(script_args.unicode_data) or not os.access(script_args.unicode_data, os.R_OK): - parser.error('The %s file is missing or not readable!' % script_args.unicode_data) - - if not os.path.isfile(script_args.special_casing) or not os.access(script_args.special_casing, os.R_OK): - parser.error('The %s file is missing or not readable!' % script_args.special_casing) - - generate_ranges(script_args) - generate_conversions(script_args) + generate_ranges(script_args, UNICODE_PLANE_TYPE_BASIC) + generate_ranges(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY) + generate_conversions(script_args, UNICODE_PLANE_TYPE_BASIC) + generate_conversions(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY) if __name__ == "__main__":