Support Unicode supplementary planes (#3928)
JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik frobert@inf.u-szeged.hu
This commit is contained in:
+342
-260
@@ -14,12 +14,15 @@
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "ecma-helpers.h"
|
||||
#include "lit-char-helpers.h"
|
||||
#include "lit-unicode-ranges.inc.h"
|
||||
#include "lit-unicode-ranges-sup.inc.h"
|
||||
#include "lit-strings.h"
|
||||
|
||||
#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
|
||||
#include "lit-unicode-conversions.inc.h"
|
||||
#include "lit-unicode-conversions-sup.inc.h"
|
||||
#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
|
||||
|
||||
#define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof ((array)[0]))
|
||||
@@ -31,36 +34,43 @@
|
||||
* @return true - if the character is in the given array
|
||||
* false - otherwise
|
||||
*/
|
||||
static bool
|
||||
search_char_in_char_array (ecma_char_t c, /**< code unit */
|
||||
const ecma_char_t *array, /**< array */
|
||||
int size_of_array) /**< length of the array */
|
||||
{
|
||||
int bottom = 0;
|
||||
int top = size_of_array - 1;
|
||||
#define LIT_SEARCH_CHAR_IN_ARRAY_FN(function_name, char_type, array_type) \
|
||||
static bool \
|
||||
function_name (char_type c, /**< code unit */ \
|
||||
const array_type *array, /**< array */ \
|
||||
int size_of_array) /**< length of the array */\
|
||||
{ \
|
||||
int bottom = 0; \
|
||||
int top = size_of_array - 1; \
|
||||
\
|
||||
while (bottom <= top) \
|
||||
{ \
|
||||
int middle = (bottom + top) / 2; \
|
||||
char_type current = array[middle]; \
|
||||
\
|
||||
if (current == c) \
|
||||
{ \
|
||||
return true; \
|
||||
} \
|
||||
\
|
||||
if (c < current) \
|
||||
{ \
|
||||
top = middle - 1; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
bottom = middle + 1; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
return false; \
|
||||
} /* __function_name */
|
||||
|
||||
while (bottom <= top)
|
||||
{
|
||||
int middle = (bottom + top) / 2;
|
||||
ecma_char_t current = array[middle];
|
||||
LIT_SEARCH_CHAR_IN_ARRAY_FN (lit_search_char_in_array, ecma_char_t, uint16_t)
|
||||
|
||||
if (current == c)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
if (c < current)
|
||||
{
|
||||
top = middle - 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
bottom = middle + 1;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
} /* search_char_in_char_array */
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
LIT_SEARCH_CHAR_IN_ARRAY_FN (lit_search_codepoint_in_array, lit_code_point_t, uint32_t)
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
|
||||
/**
|
||||
* Binary search algorithm that searches a character in the given intervals.
|
||||
@@ -70,37 +80,44 @@ search_char_in_char_array (ecma_char_t c, /**< code unit */
|
||||
* @return true - if the the character is included (inclusively) in one of the intervals in the given array
|
||||
* false - otherwise
|
||||
*/
|
||||
static bool
|
||||
search_char_in_interval_array (ecma_char_t c, /**< code unit */
|
||||
const ecma_char_t *array_sp, /**< array of interval starting points */
|
||||
const uint8_t *lengths, /**< array of interval lengths */
|
||||
int size_of_array) /**< length of the array */
|
||||
{
|
||||
int bottom = 0;
|
||||
int top = size_of_array - 1;
|
||||
#define LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN(function_name, char_type, array_type, interval_type) \
|
||||
static bool \
|
||||
function_name (char_type c, /**< code unit */ \
|
||||
const array_type *array_sp, /**< array of interval starting points */ \
|
||||
const interval_type *lengths, /**< array of interval lengths */ \
|
||||
int size_of_array) /**< length of the array */ \
|
||||
{ \
|
||||
int bottom = 0; \
|
||||
int top = size_of_array - 1; \
|
||||
\
|
||||
while (bottom <= top) \
|
||||
{ \
|
||||
int middle = (bottom + top) / 2; \
|
||||
char_type current_sp = array_sp[middle]; \
|
||||
\
|
||||
if (current_sp <= c && c <= current_sp + lengths[middle]) \
|
||||
{ \
|
||||
return true; \
|
||||
} \
|
||||
\
|
||||
if (c > current_sp) \
|
||||
{ \
|
||||
bottom = middle + 1; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
top = middle - 1; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
return false; \
|
||||
} /* function_name */
|
||||
|
||||
while (bottom <= top)
|
||||
{
|
||||
int middle = (bottom + top) / 2;
|
||||
ecma_char_t current_sp = array_sp[middle];
|
||||
LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN (lit_search_char_in_interval_array, ecma_char_t, uint16_t, uint8_t)
|
||||
|
||||
if (current_sp <= c && c <= current_sp + lengths[middle])
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
if (c > current_sp)
|
||||
{
|
||||
bottom = middle + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
top = middle - 1;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
} /* search_char_in_interval_array */
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN (lit_search_codepoint_in_interval_array, lit_code_point_t, uint32_t, uint16_t)
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
|
||||
/**
|
||||
* Check if specified character is one of the Whitespace characters including those that fall into
|
||||
@@ -116,20 +133,18 @@ lit_char_is_white_space (lit_code_point_t c) /**< code point */
|
||||
{
|
||||
return (c == LIT_CHAR_SP || (c >= LIT_CHAR_TAB && c <= LIT_CHAR_CR));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (c == LIT_CHAR_NBSP || c == LIT_CHAR_BOM || c == LIT_CHAR_LS || c == LIT_CHAR_PS)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
return (c <= LIT_UTF16_CODE_UNIT_MAX
|
||||
&& ((c >= lit_unicode_separator_char_interval_sps[0]
|
||||
&& c < lit_unicode_separator_char_interval_sps[0] + lit_unicode_separator_char_interval_lengths[0])
|
||||
|| search_char_in_char_array ((ecma_char_t) c,
|
||||
lit_unicode_separator_chars,
|
||||
NUM_OF_ELEMENTS (lit_unicode_separator_chars))));
|
||||
if (c == LIT_CHAR_NBSP || c == LIT_CHAR_BOM || c == LIT_CHAR_LS || c == LIT_CHAR_PS)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
return (c <= LIT_UTF16_CODE_UNIT_MAX
|
||||
&& ((c >= lit_unicode_white_space_interval_starts[0]
|
||||
&& c < lit_unicode_white_space_interval_starts[0] + lit_unicode_white_space_interval_lengths[0])
|
||||
|| lit_search_char_in_array ((ecma_char_t) c,
|
||||
lit_unicode_white_space_chars,
|
||||
NUM_OF_ELEMENTS (lit_unicode_white_space_chars))));
|
||||
} /* lit_char_is_white_space */
|
||||
|
||||
/**
|
||||
@@ -148,58 +163,84 @@ lit_char_is_line_terminator (ecma_char_t c) /**< code unit */
|
||||
} /* lit_char_is_line_terminator */
|
||||
|
||||
/**
|
||||
* Check if specified character is a unicode letter
|
||||
*
|
||||
* Note:
|
||||
* Unicode letter is a character, included into one of the following categories:
|
||||
* - Uppercase letter (Lu);
|
||||
* - Lowercase letter (Ll);
|
||||
* - Titlecase letter (Lt);
|
||||
* - Modifier letter (Lm);
|
||||
* - Other letter (Lo);
|
||||
* - Letter number (Nl).
|
||||
* Check if specified character is a Unicode ID_Start
|
||||
*
|
||||
* See also:
|
||||
* ECMA-262 v5, 7.6
|
||||
* ECMA-262 v1, 11.6: UnicodeIDStart
|
||||
*
|
||||
* @return true - if specified character falls into one of the listed categories,
|
||||
* @return true - if the codepoint has Unicode property "ID_Start"
|
||||
* false - otherwise
|
||||
*/
|
||||
static bool
|
||||
lit_char_is_unicode_letter (ecma_char_t c) /**< code unit */
|
||||
lit_char_is_unicode_id_start (lit_code_point_t code_point) /**< code unit */
|
||||
{
|
||||
return (search_char_in_interval_array (c,
|
||||
lit_unicode_letter_interval_sps,
|
||||
lit_unicode_letter_interval_lengths,
|
||||
NUM_OF_ELEMENTS (lit_unicode_letter_interval_sps))
|
||||
|| search_char_in_char_array (c, lit_unicode_letter_chars, NUM_OF_ELEMENTS (lit_unicode_letter_chars)));
|
||||
} /* lit_char_is_unicode_letter */
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
if (JERRY_UNLIKELY (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN))
|
||||
{
|
||||
return (lit_search_codepoint_in_interval_array (code_point,
|
||||
lit_unicode_id_start_interval_starts_sup,
|
||||
lit_unicode_id_start_interval_lengths_sup,
|
||||
NUM_OF_ELEMENTS (lit_unicode_id_start_interval_starts_sup))
|
||||
|| lit_search_codepoint_in_array (code_point,
|
||||
lit_unicode_id_start_chars_sup,
|
||||
NUM_OF_ELEMENTS (lit_unicode_id_start_chars_sup)));
|
||||
}
|
||||
#else /* !ENABLED (JERRY_ESNEXT) */
|
||||
JERRY_ASSERT (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
|
||||
ecma_char_t c = (ecma_char_t) code_point;
|
||||
|
||||
return (lit_search_char_in_interval_array (c,
|
||||
lit_unicode_id_start_interval_starts,
|
||||
lit_unicode_id_start_interval_lengths,
|
||||
NUM_OF_ELEMENTS (lit_unicode_id_start_interval_starts))
|
||||
|| lit_search_char_in_array (c, lit_unicode_id_start_chars, NUM_OF_ELEMENTS (lit_unicode_id_start_chars)));
|
||||
} /* lit_char_is_unicode_id_start */
|
||||
|
||||
/**
|
||||
* Check if specified character is a non-letter character and can be used as a
|
||||
* non-first character of an identifier.
|
||||
* These characters coverd by the following unicode categories:
|
||||
* - digit (Nd)
|
||||
* - punctuation mark (Mn, Mc)
|
||||
* - connector punctuation (Pc)
|
||||
* Check if specified character is a Unicode ID_Continue
|
||||
*
|
||||
* See also:
|
||||
* ECMA-262 v5, 7.6
|
||||
* ECMA-262 v1, 11.6: UnicodeIDContinue
|
||||
*
|
||||
* @return true - if specified character falls into one of the listed categories,
|
||||
* @return true - if the codepoint has Unicode property "ID_Continue"
|
||||
* false - otherwise
|
||||
*/
|
||||
static bool
|
||||
lit_char_is_unicode_non_letter_ident_part (ecma_char_t c) /**< code unit */
|
||||
lit_char_is_unicode_id_continue (lit_code_point_t code_point) /**< code unit */
|
||||
{
|
||||
return (search_char_in_interval_array (c,
|
||||
lit_unicode_non_letter_ident_part_interval_sps,
|
||||
lit_unicode_non_letter_ident_part_interval_lengths,
|
||||
NUM_OF_ELEMENTS (lit_unicode_non_letter_ident_part_interval_sps))
|
||||
|| search_char_in_char_array (c,
|
||||
lit_unicode_non_letter_ident_part_chars,
|
||||
NUM_OF_ELEMENTS (lit_unicode_non_letter_ident_part_chars)));
|
||||
} /* lit_char_is_unicode_non_letter_ident_part */
|
||||
/* Each ID_Start codepoint is ID_Continue as well. */
|
||||
if (lit_char_is_unicode_id_start (code_point))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
if (JERRY_UNLIKELY (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN))
|
||||
{
|
||||
return (lit_search_codepoint_in_interval_array (code_point,
|
||||
lit_unicode_id_continue_interval_starts_sup,
|
||||
lit_unicode_id_continue_interval_lengths_sup,
|
||||
NUM_OF_ELEMENTS (lit_unicode_id_continue_interval_starts_sup))
|
||||
|| lit_search_codepoint_in_array (code_point,
|
||||
lit_unicode_id_continue_chars_sup,
|
||||
NUM_OF_ELEMENTS (lit_unicode_id_continue_chars_sup)));
|
||||
}
|
||||
#else /* !ENABLED (JERRY_ESNEXT) */
|
||||
JERRY_ASSERT (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
|
||||
ecma_char_t c = (ecma_char_t) code_point;
|
||||
|
||||
return (lit_search_char_in_interval_array (c,
|
||||
lit_unicode_id_continue_interval_starts,
|
||||
lit_unicode_id_continue_interval_lengths,
|
||||
NUM_OF_ELEMENTS (lit_unicode_id_continue_interval_starts))
|
||||
|| lit_search_char_in_array (c,
|
||||
lit_unicode_id_continue_chars,
|
||||
NUM_OF_ELEMENTS (lit_unicode_id_continue_chars)));
|
||||
} /* lit_char_is_unicode_id_continue */
|
||||
|
||||
/**
|
||||
* Checks whether the character is a valid identifier start.
|
||||
@@ -218,17 +259,7 @@ lit_code_point_is_identifier_start (lit_code_point_t code_point) /**< code point
|
||||
|| code_point == LIT_CHAR_UNDERSCORE);
|
||||
}
|
||||
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
if (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN)
|
||||
{
|
||||
/* TODO: detect these ranges correctly. */
|
||||
return (code_point >= 0x10C80 && code_point <= 0x10CF2);
|
||||
}
|
||||
#else /* !ENABLED (JERRY_ESNEXT) */
|
||||
JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MIN);
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
|
||||
return lit_char_is_unicode_letter ((ecma_char_t) code_point);
|
||||
return lit_char_is_unicode_id_start (code_point);
|
||||
} /* lit_code_point_is_identifier_start */
|
||||
|
||||
/**
|
||||
@@ -249,18 +280,7 @@ lit_code_point_is_identifier_part (lit_code_point_t code_point) /**< code point
|
||||
|| code_point == LIT_CHAR_UNDERSCORE);
|
||||
}
|
||||
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
if (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN)
|
||||
{
|
||||
/* TODO: detect these ranges correctly. */
|
||||
return (code_point >= 0x10C80 && code_point <= 0x10CF2);
|
||||
}
|
||||
#else /* !ENABLED (JERRY_ESNEXT) */
|
||||
JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MIN);
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
|
||||
return (lit_char_is_unicode_letter ((ecma_char_t) code_point)
|
||||
|| lit_char_is_unicode_non_letter_ident_part ((ecma_char_t) code_point));
|
||||
return lit_char_is_unicode_id_continue (code_point);
|
||||
} /* lit_code_point_is_identifier_part */
|
||||
|
||||
/**
|
||||
@@ -519,16 +539,27 @@ lit_char_is_word_char (lit_code_point_t c) /**< code point */
|
||||
/**
|
||||
* Check if the specified character is in one of those tables which contain bidirectional conversions.
|
||||
*
|
||||
* @return the mapped character sequence of an ecma character, if it's in the table.
|
||||
* 0 - otherwise.
|
||||
* @return codepoint of the converted character if it is found the the tables
|
||||
* LIT_INVALID_CP - otherwise.
|
||||
*/
|
||||
static ecma_length_t
|
||||
search_in_bidirectional_conversion_tables (ecma_char_t character, /**< code unit */
|
||||
ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */
|
||||
bool is_lowercase) /**< is lowercase conversion */
|
||||
static lit_code_point_t
|
||||
lit_search_in_bidirectional_conversion_tables (lit_code_point_t cp, /**< code point */
|
||||
bool is_lowercase) /**< is lowercase conversion */
|
||||
{
|
||||
/* 1, Check if the specified character is part of the lit_character_case_ranges table. */
|
||||
int number_of_case_ranges = NUM_OF_ELEMENTS (lit_character_case_ranges);
|
||||
/* 1, Check if the specified character is part of the lit_unicode_character_case_ranges_{sup} table. */
|
||||
int number_of_case_ranges;
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
bool is_supplementary = cp > LIT_UTF16_CODE_UNIT_MAX;
|
||||
if (is_supplementary)
|
||||
{
|
||||
number_of_case_ranges = NUM_OF_ELEMENTS (lit_unicode_character_case_ranges_sup);
|
||||
}
|
||||
else
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
{
|
||||
number_of_case_ranges = NUM_OF_ELEMENTS (lit_unicode_character_case_ranges);
|
||||
}
|
||||
|
||||
int conv_counter = 0;
|
||||
|
||||
for (int i = 0; i < number_of_case_ranges; i++)
|
||||
@@ -538,54 +569,92 @@ search_in_bidirectional_conversion_tables (ecma_char_t character, /**< co
|
||||
conv_counter++;
|
||||
}
|
||||
|
||||
int range_length = lit_character_case_range_lengths[conv_counter];
|
||||
ecma_char_t start_point = lit_character_case_ranges[i];
|
||||
size_t range_length;
|
||||
lit_code_point_t start_point;
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
if (is_supplementary)
|
||||
{
|
||||
range_length = lit_unicode_character_case_range_lengths_sup[conv_counter];
|
||||
start_point = lit_unicode_character_case_ranges_sup[i];
|
||||
}
|
||||
else
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
{
|
||||
range_length = lit_unicode_character_case_range_lengths[conv_counter];
|
||||
start_point = lit_unicode_character_case_ranges[i];
|
||||
}
|
||||
|
||||
if (start_point > character || character >= start_point + range_length)
|
||||
if (start_point > cp || cp >= start_point + range_length)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
int char_dist = character - start_point;
|
||||
|
||||
uint32_t char_dist = (uint32_t) cp - start_point;
|
||||
int offset;
|
||||
if (i % 2 == 0)
|
||||
{
|
||||
output_buffer_p[0] = is_lowercase ? (ecma_char_t) (lit_character_case_ranges[i + 1] + char_dist) : character;
|
||||
if (!is_lowercase)
|
||||
{
|
||||
return cp;
|
||||
}
|
||||
|
||||
offset = i + 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
output_buffer_p[0] = is_lowercase ? character : (ecma_char_t) (lit_character_case_ranges[i - 1] + char_dist);
|
||||
if (is_lowercase)
|
||||
{
|
||||
return cp;
|
||||
}
|
||||
|
||||
offset = i - 1;
|
||||
}
|
||||
|
||||
return 1;
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
if (is_supplementary)
|
||||
{
|
||||
start_point = lit_unicode_character_case_ranges_sup[offset];
|
||||
}
|
||||
else
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
{
|
||||
start_point = lit_unicode_character_case_ranges[offset];
|
||||
}
|
||||
|
||||
return (lit_code_point_t) (start_point + char_dist);
|
||||
}
|
||||
|
||||
/* Note: After this point based on the latest unicode standard(13.0.0.6) no conversion characters are
|
||||
defined for supplementary planes */
|
||||
#if ENABLED (JERRY_ESNEXT)
|
||||
if (is_supplementary)
|
||||
{
|
||||
return cp;
|
||||
}
|
||||
#endif /* ENABLED (JERRY_ESNEXT) */
|
||||
|
||||
/* 2, Check if the specified character is part of the character_pair_ranges table. */
|
||||
int bottom = 0;
|
||||
int top = NUM_OF_ELEMENTS (lit_character_pair_ranges) - 1;
|
||||
int top = NUM_OF_ELEMENTS (lit_unicode_character_pair_ranges) - 1;
|
||||
|
||||
while (bottom <= top)
|
||||
{
|
||||
int middle = (bottom + top) / 2;
|
||||
ecma_char_t current_sp = lit_character_pair_ranges[middle];
|
||||
lit_code_point_t current_sp = lit_unicode_character_pair_ranges[middle];
|
||||
|
||||
if (current_sp <= character && character < current_sp + lit_character_pair_range_lengths[middle])
|
||||
if (current_sp <= cp && cp < current_sp + lit_unicode_character_pair_range_lengths[middle])
|
||||
{
|
||||
int char_dist = character - current_sp;
|
||||
uint32_t char_dist = (uint32_t) (cp - current_sp);
|
||||
|
||||
if ((character - current_sp) % 2 == 0)
|
||||
if ((cp - current_sp) % 2 == 0)
|
||||
{
|
||||
output_buffer_p[0] = is_lowercase ? (ecma_char_t) (current_sp + char_dist + 1) : character;
|
||||
}
|
||||
else
|
||||
{
|
||||
output_buffer_p[0] = is_lowercase ? character : (ecma_char_t) (current_sp + char_dist - 1);
|
||||
return is_lowercase ? (lit_code_point_t) (current_sp + char_dist + 1) : cp;
|
||||
}
|
||||
|
||||
return 1;
|
||||
return is_lowercase ? cp : (lit_code_point_t) (current_sp + char_dist - 1);
|
||||
}
|
||||
|
||||
if (character > current_sp)
|
||||
if (cp > current_sp)
|
||||
{
|
||||
bottom = middle + 1;
|
||||
}
|
||||
@@ -596,41 +665,37 @@ search_in_bidirectional_conversion_tables (ecma_char_t character, /**< co
|
||||
}
|
||||
|
||||
/* 3, Check if the specified character is part of the character_pairs table. */
|
||||
int number_of_character_pairs = NUM_OF_ELEMENTS (lit_character_pairs);
|
||||
int number_of_character_pairs = NUM_OF_ELEMENTS (lit_unicode_character_pairs);
|
||||
|
||||
for (int i = 0; i < number_of_character_pairs; i++)
|
||||
{
|
||||
if (character != lit_character_pairs[i])
|
||||
if (cp != lit_unicode_character_pairs[i])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (i % 2 == 0)
|
||||
{
|
||||
output_buffer_p[0] = is_lowercase ? lit_character_pairs[i + 1] : character;
|
||||
}
|
||||
else
|
||||
{
|
||||
output_buffer_p[0] = is_lowercase ? character : lit_character_pairs[i - 1];
|
||||
return is_lowercase ? lit_unicode_character_pairs[i + 1] : cp;
|
||||
}
|
||||
|
||||
return 1;
|
||||
return is_lowercase ? cp : lit_unicode_character_pairs[i - 1];
|
||||
}
|
||||
|
||||
return 0;
|
||||
} /* search_in_bidirectional_conversion_tables */
|
||||
return LIT_INVALID_CP;
|
||||
} /* lit_search_in_bidirectional_conversion_tables */
|
||||
|
||||
/**
|
||||
* Check if the specified character is in the given conversion table.
|
||||
*
|
||||
* @return the mapped character sequence of an ecma character, if it's in the table.
|
||||
* 0 - otherwise.
|
||||
* @return LIT_MULTIPLE_CU if the converted character consist more than a single code unit
|
||||
* converted code point - otherwise
|
||||
*/
|
||||
static ecma_length_t
|
||||
search_in_conversion_table (ecma_char_t character, /**< code unit */
|
||||
ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */
|
||||
const ecma_char_t *array, /**< array */
|
||||
const uint8_t *counters) /**< case_values counter */
|
||||
static lit_code_point_t
|
||||
lit_search_in_conversion_table (ecma_char_t character, /**< code unit */
|
||||
ecma_stringbuilder_t *builder_p, /**< string builder */
|
||||
const ecma_char_t *array, /**< array */
|
||||
const uint8_t *counters) /**< case_values counter */
|
||||
{
|
||||
int end_point = 0;
|
||||
|
||||
@@ -653,28 +718,21 @@ search_in_conversion_table (ecma_char_t character, /**< code unit */
|
||||
|
||||
if (current == character)
|
||||
{
|
||||
ecma_length_t char_sequence = 1;
|
||||
|
||||
switch (size_of_case_value)
|
||||
if (builder_p != NULL)
|
||||
{
|
||||
case 3:
|
||||
ecma_stringbuilder_append_char (builder_p, array[middle + 1]);
|
||||
|
||||
if (size_of_case_value > 1)
|
||||
{
|
||||
output_buffer_p[2] = array[middle + 3];
|
||||
char_sequence++;
|
||||
/* FALLTHRU */
|
||||
ecma_stringbuilder_append_char (builder_p, array[middle + 2]);
|
||||
}
|
||||
case 2:
|
||||
if (size_of_case_value > 2)
|
||||
{
|
||||
output_buffer_p[1] = array[middle + 2];
|
||||
char_sequence++;
|
||||
/* FALLTHRU */
|
||||
}
|
||||
default:
|
||||
{
|
||||
output_buffer_p[0] = array[middle + 1];
|
||||
return char_sequence;
|
||||
ecma_stringbuilder_append_char (builder_p, array[middle + 3]);
|
||||
}
|
||||
}
|
||||
|
||||
return size_of_case_value == 1 ? array[middle + 1]: LIT_MULTIPLE_CU;
|
||||
}
|
||||
|
||||
if (character < current)
|
||||
@@ -688,127 +746,151 @@ search_in_conversion_table (ecma_char_t character, /**< code unit */
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
} /* search_in_conversion_table */
|
||||
if (builder_p != NULL)
|
||||
{
|
||||
ecma_stringbuilder_append_char (builder_p, character);
|
||||
}
|
||||
|
||||
return (lit_code_point_t) character;
|
||||
} /* lit_search_in_conversion_table */
|
||||
#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
|
||||
|
||||
/**
|
||||
* Returns the lowercase character sequence of an ecma character.
|
||||
* Append the converted lowercase codeunit sequence of an a given codepoint into the stringbuilder if it is present.
|
||||
*
|
||||
* Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters.
|
||||
*
|
||||
* @return the length of the lowercase character sequence
|
||||
* which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH.
|
||||
* @return LIT_MULTIPLE_CU if the converted codepoint consist more than a single code unit
|
||||
* converted code point - otherwise
|
||||
*/
|
||||
ecma_length_t
|
||||
lit_char_to_lower_case (ecma_char_t character, /**< input character value */
|
||||
ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */
|
||||
ecma_length_t buffer_size) /**< buffer size */
|
||||
lit_code_point_t
|
||||
lit_char_to_lower_case (lit_code_point_t cp, /**< code point */
|
||||
ecma_stringbuilder_t *builder_p) /**< string builder */
|
||||
{
|
||||
JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH);
|
||||
|
||||
if (character >= LIT_CHAR_UPPERCASE_A && character <= LIT_CHAR_UPPERCASE_Z)
|
||||
if (cp >= LIT_CHAR_UPPERCASE_A && cp <= LIT_CHAR_UPPERCASE_Z)
|
||||
{
|
||||
output_buffer_p[0] = (ecma_char_t) (character + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
|
||||
return 1;
|
||||
lit_utf8_byte_t lowercase_char = (lit_utf8_byte_t) (cp + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
|
||||
|
||||
if (builder_p != NULL)
|
||||
{
|
||||
ecma_stringbuilder_append_byte (builder_p, lowercase_char);
|
||||
}
|
||||
|
||||
return lowercase_char;
|
||||
}
|
||||
|
||||
#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
|
||||
lit_code_point_t lowercase_cp = lit_search_in_bidirectional_conversion_tables (cp, true);
|
||||
|
||||
ecma_length_t lowercase_sequence = search_in_bidirectional_conversion_tables (character, output_buffer_p, true);
|
||||
|
||||
if (lowercase_sequence != 0)
|
||||
if (lowercase_cp != LIT_INVALID_CP)
|
||||
{
|
||||
return lowercase_sequence;
|
||||
if (builder_p != NULL)
|
||||
{
|
||||
ecma_stringbuilder_append_codepoint (builder_p, lowercase_cp);
|
||||
}
|
||||
|
||||
return lowercase_cp;
|
||||
}
|
||||
|
||||
int num_of_lowercase_ranges = NUM_OF_ELEMENTS (lit_lower_case_ranges);
|
||||
JERRY_ASSERT (cp < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
|
||||
|
||||
int num_of_lowercase_ranges = NUM_OF_ELEMENTS (lit_unicode_lower_case_ranges);
|
||||
|
||||
for (int i = 0, j = 0; i < num_of_lowercase_ranges; i += 2, j++)
|
||||
{
|
||||
int range_length = lit_lower_case_range_lengths[j] - 1;
|
||||
ecma_char_t start_point = lit_lower_case_ranges[i];
|
||||
JERRY_ASSERT (lit_unicode_lower_case_range_lengths[j] > 0);
|
||||
uint32_t range_length = (uint32_t) (lit_unicode_lower_case_range_lengths[j] - 1);
|
||||
lit_code_point_t start_point = lit_unicode_lower_case_ranges[i];
|
||||
|
||||
if (start_point <= character && character <= start_point + range_length)
|
||||
if (start_point <= cp && cp <= start_point + range_length)
|
||||
{
|
||||
output_buffer_p[0] = (ecma_char_t) (lit_lower_case_ranges[i + 1] + (character - start_point));
|
||||
return 1;
|
||||
lowercase_cp = lit_unicode_lower_case_ranges[i + 1] + (cp - start_point);
|
||||
if (builder_p != NULL)
|
||||
{
|
||||
ecma_stringbuilder_append_codepoint (builder_p, lowercase_cp);
|
||||
}
|
||||
|
||||
return lowercase_cp;
|
||||
}
|
||||
}
|
||||
|
||||
lowercase_sequence = search_in_conversion_table (character,
|
||||
output_buffer_p,
|
||||
lit_lower_case_conversions,
|
||||
lit_lower_case_conversion_counters);
|
||||
|
||||
if (lowercase_sequence != 0)
|
||||
return lit_search_in_conversion_table ((ecma_char_t) cp,
|
||||
builder_p,
|
||||
lit_unicode_lower_case_conversions,
|
||||
lit_unicode_lower_case_conversion_counters);
|
||||
#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
|
||||
if (builder_p != NULL)
|
||||
{
|
||||
return lowercase_sequence;
|
||||
ecma_stringbuilder_append_codepoint (builder_p, cp);
|
||||
}
|
||||
|
||||
return cp;
|
||||
#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
|
||||
|
||||
output_buffer_p[0] = character;
|
||||
return 1;
|
||||
} /* lit_char_to_lower_case */
|
||||
|
||||
/**
|
||||
* Returns the uppercase character sequence of an ecma character.
|
||||
* Append the converted uppercase codeunit sequence of an a given codepoint into the stringbuilder if it is present.
|
||||
*
|
||||
* Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters.
|
||||
*
|
||||
* @return the length of the uppercase character sequence
|
||||
* which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH.
|
||||
* @return LIT_MULTIPLE_CU if the converted codepoint consist more than a single code unit
|
||||
* converted code point - otherwise
|
||||
*/
|
||||
ecma_length_t
|
||||
lit_char_to_upper_case (ecma_char_t character, /**< input character value */
|
||||
ecma_char_t *output_buffer_p, /**< buffer for the result characters */
|
||||
ecma_length_t buffer_size) /**< buffer size */
|
||||
lit_code_point_t
|
||||
lit_char_to_upper_case (lit_code_point_t cp, /**< code point */
|
||||
ecma_stringbuilder_t *builder_p) /**< string builder */
|
||||
{
|
||||
JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH);
|
||||
|
||||
if (character >= LIT_CHAR_LOWERCASE_A && character <= LIT_CHAR_LOWERCASE_Z)
|
||||
if (cp >= LIT_CHAR_LOWERCASE_A && cp <= LIT_CHAR_LOWERCASE_Z)
|
||||
{
|
||||
output_buffer_p[0] = (ecma_char_t) (character - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
|
||||
return 1;
|
||||
lit_utf8_byte_t uppercase_char = (lit_utf8_byte_t) (cp - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
|
||||
|
||||
if (builder_p != NULL)
|
||||
{
|
||||
ecma_stringbuilder_append_byte (builder_p, uppercase_char);
|
||||
}
|
||||
|
||||
return uppercase_char;
|
||||
}
|
||||
|
||||
#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
|
||||
lit_code_point_t uppercase_cp = lit_search_in_bidirectional_conversion_tables (cp, false);
|
||||
|
||||
ecma_length_t uppercase_sequence = search_in_bidirectional_conversion_tables (character, output_buffer_p, false);
|
||||
|
||||
if (uppercase_sequence != 0)
|
||||
if (uppercase_cp != LIT_INVALID_CP)
|
||||
{
|
||||
return uppercase_sequence;
|
||||
if (builder_p != NULL)
|
||||
{
|
||||
ecma_stringbuilder_append_codepoint (builder_p, uppercase_cp);
|
||||
}
|
||||
|
||||
return uppercase_cp;
|
||||
}
|
||||
|
||||
int num_of_upper_case_special_ranges = NUM_OF_ELEMENTS (lit_upper_case_special_ranges);
|
||||
int num_of_upper_case_special_ranges = NUM_OF_ELEMENTS (lit_unicode_upper_case_special_ranges);
|
||||
|
||||
for (int i = 0, j = 0; i < num_of_upper_case_special_ranges; i += 3, j++)
|
||||
{
|
||||
int range_length = lit_upper_case_special_range_lengths[j];
|
||||
ecma_char_t start_point = lit_upper_case_special_ranges[i];
|
||||
uint32_t range_length = lit_unicode_upper_case_special_range_lengths[j];
|
||||
ecma_char_t start_point = lit_unicode_upper_case_special_ranges[i];
|
||||
|
||||
if (start_point <= character && character <= start_point + range_length)
|
||||
if (start_point <= cp && cp <= start_point + range_length)
|
||||
{
|
||||
output_buffer_p[0] = (ecma_char_t) (lit_upper_case_special_ranges[i + 1] + (character - start_point));
|
||||
output_buffer_p[1] = (ecma_char_t) (lit_upper_case_special_ranges[i + 2]);
|
||||
return 2;
|
||||
if (builder_p != NULL)
|
||||
{
|
||||
uppercase_cp = lit_unicode_upper_case_special_ranges[i + 1] + (cp - start_point);
|
||||
ecma_stringbuilder_append_codepoint (builder_p, uppercase_cp);
|
||||
ecma_stringbuilder_append_codepoint (builder_p, lit_unicode_upper_case_special_ranges[i + 2]);
|
||||
}
|
||||
|
||||
return LIT_MULTIPLE_CU;
|
||||
}
|
||||
}
|
||||
|
||||
uppercase_sequence = search_in_conversion_table (character,
|
||||
output_buffer_p,
|
||||
lit_upper_case_conversions,
|
||||
lit_upper_case_conversion_counters);
|
||||
|
||||
if (uppercase_sequence != 0)
|
||||
return lit_search_in_conversion_table ((ecma_char_t) cp,
|
||||
builder_p,
|
||||
lit_unicode_upper_case_conversions,
|
||||
lit_unicode_upper_case_conversion_counters);
|
||||
#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
|
||||
if (builder_p != NULL)
|
||||
{
|
||||
return uppercase_sequence;
|
||||
ecma_stringbuilder_append_codepoint (builder_p, cp);
|
||||
}
|
||||
|
||||
return cp;
|
||||
#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
|
||||
|
||||
output_buffer_p[0] = character;
|
||||
return 1;
|
||||
} /* lit_char_to_upper_case */
|
||||
|
||||
Reference in New Issue
Block a user