Change internal encoding of strings to CESU-8
JerryScript-DCO-1.0-Signed-off-by: Zsolt Borbély zsborbely.u-szeged@partner.samsung.com JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai.u-szeged@partner.samsung.com
This commit is contained in:
+21
-37
@@ -30,18 +30,17 @@
|
||||
|
||||
typedef enum
|
||||
{
|
||||
UTF8_ANY_SIZE,
|
||||
UTF8_ONE_BYTE,
|
||||
UTF8_TWO_BYTES,
|
||||
UTF8_THREE_BYTES,
|
||||
UTF8_FOUR_BYTES
|
||||
CESU8_ANY_SIZE,
|
||||
CESU8_ONE_BYTE,
|
||||
CESU8_TWO_BYTES,
|
||||
CESU8_THREE_BYTES,
|
||||
} utf8_char_size;
|
||||
|
||||
static lit_utf8_size_t
|
||||
generate_utf8_char (utf8_char_size char_size,
|
||||
lit_utf8_byte_t *buf)
|
||||
generate_cesu8_char (utf8_char_size char_size,
|
||||
lit_utf8_byte_t *buf)
|
||||
{
|
||||
JERRY_ASSERT (char_size >= 0 && char_size <= LIT_UTF8_MAX_BYTES_IN_CODE_POINT);
|
||||
JERRY_ASSERT (char_size >= 0 && char_size <= LIT_CESU8_MAX_BYTES_IN_CODE_UNIT);
|
||||
lit_code_point_t code_point = (lit_code_point_t) rand ();
|
||||
|
||||
if (char_size == 1)
|
||||
@@ -58,14 +57,9 @@ generate_utf8_char (utf8_char_size char_size,
|
||||
code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN + code_point % (LIT_UTF8_3_BYTE_CODE_POINT_MAX -
|
||||
LIT_UTF8_3_BYTE_CODE_POINT_MIN);
|
||||
}
|
||||
else if (char_size == 4)
|
||||
{
|
||||
code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN + code_point % (LIT_UTF8_4_BYTE_CODE_POINT_MAX -
|
||||
LIT_UTF8_4_BYTE_CODE_POINT_MIN);
|
||||
}
|
||||
else
|
||||
{
|
||||
code_point %= LIT_UTF8_4_BYTE_CODE_POINT_MAX;
|
||||
code_point %= LIT_UTF8_3_BYTE_CODE_POINT_MAX;
|
||||
}
|
||||
|
||||
if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN
|
||||
@@ -74,29 +68,29 @@ generate_utf8_char (utf8_char_size char_size,
|
||||
code_point = LIT_UTF16_HIGH_SURROGATE_MIN - 1;
|
||||
}
|
||||
|
||||
return lit_code_point_to_utf8 (code_point, buf);
|
||||
return lit_code_unit_to_utf8 ((ecma_char_t) code_point, buf);
|
||||
}
|
||||
|
||||
static ecma_length_t
|
||||
generate_utf8_string (lit_utf8_byte_t *buf_p,
|
||||
lit_utf8_size_t buf_size)
|
||||
generate_cesu8_string (lit_utf8_byte_t *buf_p,
|
||||
lit_utf8_size_t buf_size)
|
||||
{
|
||||
ecma_length_t length = 0;
|
||||
|
||||
lit_utf8_size_t size = 0;
|
||||
while (size < buf_size)
|
||||
{
|
||||
const utf8_char_size char_size = (((buf_size - size) > LIT_UTF8_MAX_BYTES_IN_CODE_POINT)
|
||||
? UTF8_ANY_SIZE
|
||||
const utf8_char_size char_size = (((buf_size - size) > LIT_CESU8_MAX_BYTES_IN_CODE_UNIT)
|
||||
? CESU8_ANY_SIZE
|
||||
: (utf8_char_size) (buf_size - size));
|
||||
|
||||
lit_utf8_size_t bytes_generated = generate_utf8_char (char_size, buf_p);
|
||||
lit_utf8_size_t bytes_generated = generate_cesu8_char (char_size, buf_p);
|
||||
|
||||
JERRY_ASSERT (lit_is_utf8_string_valid (buf_p, bytes_generated));
|
||||
JERRY_ASSERT (lit_is_cesu8_string_valid (buf_p, bytes_generated));
|
||||
|
||||
size += bytes_generated;
|
||||
buf_p += bytes_generated;
|
||||
length += (bytes_generated == LIT_UTF8_MAX_BYTES_IN_CODE_POINT) ? 2 : 1;
|
||||
length++;
|
||||
}
|
||||
|
||||
JERRY_ASSERT (size == buf_size);
|
||||
@@ -120,7 +114,7 @@ main (int __attr_unused___ argc,
|
||||
for (int i = 0; i < test_iters; i++)
|
||||
{
|
||||
lit_utf8_size_t utf8_string_size = (i == 0) ? 0 : (lit_utf8_size_t) (rand () % max_bytes_in_string);
|
||||
ecma_length_t length = generate_utf8_string (utf8_string, utf8_string_size);
|
||||
ecma_length_t length = generate_cesu8_string (utf8_string, utf8_string_size);
|
||||
|
||||
JERRY_ASSERT (lit_utf8_string_length (utf8_string, utf8_string_size) == length);
|
||||
|
||||
@@ -183,29 +177,19 @@ main (int __attr_unused___ argc,
|
||||
|
||||
/* Overlong-encoded code point */
|
||||
lit_utf8_byte_t invalid_utf8_string_1[] = {0xC0, 0x82};
|
||||
JERRY_ASSERT (!lit_is_utf8_string_valid (invalid_utf8_string_1, sizeof (invalid_utf8_string_1)));
|
||||
JERRY_ASSERT (!lit_is_cesu8_string_valid (invalid_utf8_string_1, sizeof (invalid_utf8_string_1)));
|
||||
|
||||
/* Overlong-encoded code point */
|
||||
lit_utf8_byte_t invalid_utf8_string_2[] = {0xE0, 0x80, 0x81};
|
||||
JERRY_ASSERT (!lit_is_utf8_string_valid (invalid_utf8_string_2, sizeof (invalid_utf8_string_2)));
|
||||
JERRY_ASSERT (!lit_is_cesu8_string_valid (invalid_utf8_string_2, sizeof (invalid_utf8_string_2)));
|
||||
|
||||
/* Pair of surrogates: 0xD901 0xDFF0 which encode Unicode character 0x507F0 */
|
||||
lit_utf8_byte_t invalid_utf8_string_3[] = {0xED, 0xA4, 0x81, 0xED, 0xBF, 0xB0};
|
||||
JERRY_ASSERT (!lit_is_utf8_string_valid (invalid_utf8_string_3, sizeof (invalid_utf8_string_3)));
|
||||
JERRY_ASSERT (lit_is_cesu8_string_valid (invalid_utf8_string_3, sizeof (invalid_utf8_string_3)));
|
||||
|
||||
/* Isolated high surrogate 0xD901 */
|
||||
lit_utf8_byte_t valid_utf8_string_1[] = {0xED, 0xA4, 0x81};
|
||||
JERRY_ASSERT (lit_is_utf8_string_valid (valid_utf8_string_1, sizeof (valid_utf8_string_1)));
|
||||
|
||||
/* 4-byte long utf-8 character - Unicode character 0x507F0 */
|
||||
lit_utf8_byte_t valid_utf8_string_2[] = {0xF1, 0x90, 0x9F, 0xB0};
|
||||
JERRY_ASSERT (lit_is_utf8_string_valid (valid_utf8_string_2, sizeof (valid_utf8_string_2)));
|
||||
|
||||
lit_utf8_byte_t buf[] = {0xF0, 0x90, 0x8D, 0x88};
|
||||
lit_code_point_t code_point;
|
||||
lit_utf8_size_t bytes_count = lit_read_code_point_from_utf8 (buf, sizeof (buf), &code_point);
|
||||
JERRY_ASSERT (bytes_count == 4);
|
||||
JERRY_ASSERT (code_point == 0x10348);
|
||||
JERRY_ASSERT (lit_is_cesu8_string_valid (valid_utf8_string_1, sizeof (valid_utf8_string_1)));
|
||||
|
||||
lit_utf8_byte_t res_buf[3];
|
||||
lit_utf8_size_t res_size;
|
||||
|
||||
Reference in New Issue
Block a user