diff --git a/docs/02.API-REFERENCE.md b/docs/02.API-REFERENCE.md
index 2e0494d22..c24d44908 100644
--- a/docs/02.API-REFERENCE.md
+++ b/docs/02.API-REFERENCE.md
@@ -1352,6 +1352,49 @@ jerry_string_to_char_buffer (const jerry_value_t value,
 - [jerry_create_string](#jerry_create_string)
 - [jerry_get_string_size](#jerry_get_string_size)
 
+## jerry_string_to_utf8_char_buffer
+
+**Summary**
+
+Copy the characters of a string, encoded as utf-8, into a specified buffer.
+The '\0' character could occur in the character buffer. Returns 0,
+if the value parameter is not a string, the buffer pointer is NULL,
+or the buffer isn't large enough for the whole string.
+
+**Prototype**
+
+```c
+jerry_size_t
+jerry_string_to_utf8_char_buffer (const jerry_value_t value,
+                                  jerry_char_t *buffer_p,
+                                  jerry_size_t buffer_size);
+```
+
+- `value` - input string value
+- `buffer_p` - pointer to output buffer
+- `buffer_size` - size of the buffer
+- return value - number of bytes, actually copied to the buffer
+
+**Example**
+
+```c
+{
+  jerry_value_t value;
+  ... // create or acquire value
+
+  jerry_size_t req_sz = jerry_get_utf8_string_size (value);
+  jerry_char_t str_buf_p[req_sz];
+
+  jerry_string_to_utf8_char_buffer (value, str_buf_p, req_sz);
+
+  jerry_release_value (value);
+}
+```
+
+**See also**
+
+- [jerry_create_string_from_utf8](#jerry_create_string_from_utf8)
+- [jerry_get_utf8_string_size](#jerry_get_utf8_string_size)
 
 # Functions for array object values
 
diff --git a/jerry-core/ecma/base/ecma-helpers-string.c b/jerry-core/ecma/base/ecma-helpers-string.c
index 4c2d6de43..7c222a88c 100644
--- a/jerry-core/ecma/base/ecma-helpers-string.c
+++ b/jerry-core/ecma/base/ecma-helpers-string.c
@@ -858,10 +858,10 @@ ecma_string_get_array_index (const ecma_string_t *str_p) /**< ecma-string */
  * @return number of bytes, actually copied to the buffer.
  */
 lit_utf8_size_t __attr_return_value_should_be_checked___
-ecma_string_copy_to_utf8_buffer (const ecma_string_t *string_desc_p, /**< ecma-string descriptor */
-                                 lit_utf8_byte_t *buffer_p, /**< destination buffer pointer
-                                                             * (can be NULL if buffer_size == 0) */
-                                 lit_utf8_size_t buffer_size) /**< size of buffer */
+ecma_string_copy_to_cesu8_buffer (const ecma_string_t *string_desc_p, /**< ecma-string descriptor */
+                                  lit_utf8_byte_t *buffer_p, /**< destination buffer pointer
+                                                              * (can be NULL if buffer_size == 0) */
+                                  lit_utf8_size_t buffer_size) /**< size of buffer */
 {
   JERRY_ASSERT (string_desc_p != NULL);
   JERRY_ASSERT (string_desc_p->refs_and_container >= ECMA_STRING_REF_ONE);
@@ -908,6 +908,73 @@ ecma_string_copy_to_utf8_buffer (const ecma_string_t *string_desc_p, /**< ecma-s
     }
   }
 
+  JERRY_ASSERT (size <= buffer_size);
+  return size;
+} /* ecma_string_copy_to_cesu8_buffer */
+
+/**
+ * Convert ecma-string's contents to a utf-8 string and put it into the buffer.
+ * It is the caller's responsibility to make sure that the string fits in the buffer.
+ *
+ * @return number of bytes, actually copied to the buffer.
+ */
+lit_utf8_size_t __attr_return_value_should_be_checked___
+ecma_string_copy_to_utf8_buffer (const ecma_string_t *string_desc_p, /**< ecma-string descriptor */
+                                 lit_utf8_byte_t *buffer_p, /**< destination buffer pointer
+                                                             * (can be NULL if buffer_size == 0) */
+                                 lit_utf8_size_t buffer_size) /**< size of buffer */
+{
+  JERRY_ASSERT (string_desc_p != NULL);
+  JERRY_ASSERT (string_desc_p->refs_and_container >= ECMA_STRING_REF_ONE);
+  JERRY_ASSERT (buffer_p != NULL || buffer_size == 0);
+  JERRY_ASSERT (ecma_string_get_utf8_size (string_desc_p) <= buffer_size);
+
+  lit_utf8_size_t size;
+
+  switch (ECMA_STRING_GET_CONTAINER (string_desc_p))
+  {
+    case ECMA_STRING_CONTAINER_HEAP_UTF8_STRING:
+    {
+      size = lit_convert_cesu8_string_to_utf8_string ((lit_utf8_byte_t *) (string_desc_p + 1),
+                                                      string_desc_p->u.utf8_string.size,
+                                                      buffer_p,
+                                                      buffer_size);
+      break;
+    }
+    case ECMA_STRING_CONTAINER_HEAP_LONG_UTF8_STRING:
+    {
+      size = lit_convert_cesu8_string_to_utf8_string ((lit_utf8_byte_t *) (((ecma_long_string_t *) string_desc_p) + 1),
+                                                      string_desc_p->u.long_utf8_string_size,
+                                                      buffer_p,
+                                                      buffer_size);
+      break;
+    }
+    case ECMA_STRING_CONTAINER_UINT32_IN_DESC:
+    {
+      const uint32_t uint32_number = string_desc_p->u.uint32_number;
+      size = ecma_uint32_to_utf8_string (uint32_number, buffer_p, buffer_size);
+      break;
+    }
+    case ECMA_STRING_CONTAINER_MAGIC_STRING:
+    {
+      const lit_magic_string_id_t id = string_desc_p->u.magic_string_id;
+      size = lit_get_magic_string_size (id);
+      memcpy (buffer_p, lit_get_magic_string_utf8 (id), size);
+      break;
+    }
+    default:
+    {
+      JERRY_ASSERT (ECMA_STRING_GET_CONTAINER (string_desc_p) == ECMA_STRING_CONTAINER_MAGIC_STRING_EX);
+
+      const lit_magic_string_ex_id_t id = string_desc_p->u.magic_string_ex_id;
+      size = lit_convert_cesu8_string_to_utf8_string (lit_get_magic_string_ex_utf8 (id),
+                                                      lit_get_magic_string_ex_size (id),
+                                                      buffer_p,
+                                                      buffer_size);
+      break;
+    }
+  }
+
   JERRY_ASSERT (size <= buffer_size);
   return size;
 } /* ecma_string_copy_to_utf8_buffer */
@@ -923,7 +990,7 @@ ecma_string_to_utf8_bytes (const ecma_string_t *string_desc_p, /**< ecma-string
                                                        * (can be NULL if buffer_size == 0) */
                            lit_utf8_size_t buffer_size) /**< size of buffer */
 {
-  const lit_utf8_size_t size = ecma_string_copy_to_utf8_buffer (string_desc_p, buffer_p, buffer_size);
+  const lit_utf8_size_t size = ecma_string_copy_to_cesu8_buffer (string_desc_p, buffer_p, buffer_size);
   JERRY_ASSERT (size == buffer_size);
 } /* ecma_string_to_utf8_bytes */
 
diff --git a/jerry-core/ecma/base/ecma-helpers.h b/jerry-core/ecma/base/ecma-helpers.h
index 0e434d791..1d70e1c21 100644
--- a/jerry-core/ecma/base/ecma-helpers.h
+++ b/jerry-core/ecma/base/ecma-helpers.h
@@ -178,7 +178,12 @@ ecma_number_t ecma_string_to_number (const ecma_string_t *str_p);
 uint32_t ecma_string_get_array_index (const ecma_string_t *str_p);
 
 lit_utf8_size_t __attr_return_value_should_be_checked___
-ecma_string_copy_to_utf8_buffer (const ecma_string_t *string_desc_p, lit_utf8_byte_t *buffer_p,
+ecma_string_copy_to_cesu8_buffer (const ecma_string_t *string_desc_p,
+                                  lit_utf8_byte_t *buffer_p,
+                                  lit_utf8_size_t buffer_size);
+lit_utf8_size_t __attr_return_value_should_be_checked___
+ecma_string_copy_to_utf8_buffer (const ecma_string_t *string_desc_p,
+                                 lit_utf8_byte_t *buffer_p,
                                  lit_utf8_size_t buffer_size);
 void ecma_string_to_utf8_bytes (const ecma_string_t *string_desc_p, lit_utf8_byte_t *buffer_p,
                                 lit_utf8_size_t buffer_size);
diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-error-prototype.c b/jerry-core/ecma/builtin-objects/ecma-builtin-error-prototype.c
index 26a4df65f..c037bbff6 100644
--- a/jerry-core/ecma/builtin-objects/ecma-builtin-error-prototype.c
+++ b/jerry-core/ecma/builtin-objects/ecma-builtin-error-prototype.c
@@ -141,7 +141,7 @@ ecma_builtin_error_prototype_object_to_string (ecma_value_t this_arg) /**< this
           JMEM_DEFINE_LOCAL_ARRAY (ret_str_buffer, size, lit_utf8_byte_t);
           lit_utf8_byte_t *ret_str_buffer_p = ret_str_buffer;
 
-          lit_utf8_size_t bytes = ecma_string_copy_to_utf8_buffer (name_string_p, ret_str_buffer_p, name_size);
+          lit_utf8_size_t bytes = ecma_string_copy_to_cesu8_buffer (name_string_p, ret_str_buffer_p, name_size);
           JERRY_ASSERT (bytes == name_size);
           ret_str_buffer_p = ret_str_buffer_p + bytes;
           JERRY_ASSERT (ret_str_buffer_p <= ret_str_buffer + size);
@@ -156,7 +156,7 @@ ecma_builtin_error_prototype_object_to_string (ecma_value_t this_arg) /**< this
                                                               space_size);
           JERRY_ASSERT (ret_str_buffer_p <= ret_str_buffer + size);
 
-          bytes = ecma_string_copy_to_utf8_buffer (msg_string_p, ret_str_buffer_p, msg_size);
+          bytes = ecma_string_copy_to_cesu8_buffer (msg_string_p, ret_str_buffer_p, msg_size);
           JERRY_ASSERT (bytes == msg_size);
           ret_str_buffer_p = ret_str_buffer_p + bytes;
           JERRY_ASSERT (ret_str_buffer_p == ret_str_buffer + size);
diff --git a/jerry-core/jerry-api.h b/jerry-core/jerry-api.h
index c1db337d2..2091f9d52 100644
--- a/jerry-core/jerry-api.h
+++ b/jerry-core/jerry-api.h
@@ -230,6 +230,9 @@ jerry_size_t jerry_get_utf8_string_size (const jerry_value_t value);
 jerry_length_t jerry_get_string_length (const jerry_value_t value);
 jerry_length_t jerry_get_utf8_string_length (const jerry_value_t value);
 jerry_size_t jerry_string_to_char_buffer (const jerry_value_t value, jerry_char_t *buffer_p, jerry_size_t buffer_size);
+jerry_size_t jerry_string_to_utf8_char_buffer (const jerry_value_t value,
+                                               jerry_char_t *buffer_p,
+                                               jerry_size_t buffer_size);
 
 /**
  * Functions for array object values
diff --git a/jerry-core/jerry.c b/jerry-core/jerry.c
index 7ef905b73..078435e57 100644
--- a/jerry-core/jerry.c
+++ b/jerry-core/jerry.c
@@ -1160,10 +1160,44 @@ jerry_string_to_char_buffer (const jerry_value_t value, /**< input string value
     return 0;
   }
 
+  return ecma_string_copy_to_cesu8_buffer (str_p,
+                                           (lit_utf8_byte_t *) buffer_p,
+                                           buffer_size);
+} /* jerry_string_to_char_buffer */
+
+/**
+ * Copy the characters of a string, encoded as utf-8, into a specified buffer.
+ *
+ * Note:
+ *      The '\0' character could occur anywhere in the returned string.
+ *      Returns 0, if the value parameter is not a string, buffer_p is NULL,
+ *      or the buffer is not large enough for the whole string.
+ *
+ * @return number of bytes copied to the buffer.
+ */
+jerry_size_t
+jerry_string_to_utf8_char_buffer (const jerry_value_t value, /**< input string value */
+                                  jerry_char_t *buffer_p, /**< [out] output characters buffer */
+                                  jerry_size_t buffer_size) /**< size of output buffer */
+{
+  jerry_assert_api_available ();
+
+  if (!ecma_is_value_string (value) || buffer_p == NULL)
+  {
+    return 0;
+  }
+
+  ecma_string_t *str_p = ecma_get_string_from_value (value);
+
+  if (ecma_string_get_utf8_size (str_p) > buffer_size)
+  {
+    return 0;
+  }
+
   return ecma_string_copy_to_utf8_buffer (str_p,
                                           (lit_utf8_byte_t *) buffer_p,
                                           buffer_size);
-} /* jerry_string_to_char_buffer */
+} /* jerry_string_to_utf8_char_buffer */
 
 /**
  * Checks whether the object or it's prototype objects have the given property.
diff --git a/jerry-core/lit/lit-strings.c b/jerry-core/lit/lit-strings.c
index 2ac2ba035..277286b7c 100644
--- a/jerry-core/lit/lit-strings.c
+++ b/jerry-core/lit/lit-strings.c
@@ -778,6 +778,61 @@ lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
   }
 } /* lit_code_point_to_utf8 */
 
+/**
+ * Convert a cesu-8 string to a utf-8 string (surrogate pairs are merged) and put it into the buffer.
+ * It is the caller's responsibility to make sure that the string fits in the buffer.
+ *
+ * @return number of bytes copied to the buffer.
+ */
+lit_utf8_size_t
+lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string, /**< cesu-8 string */
+                                         lit_utf8_size_t cesu8_size, /**< size of cesu-8 string */
+                                         lit_utf8_byte_t *utf8_string, /**< destination utf-8 buffer pointer
+                                                                        * (can be NULL if utf8_size == 0) */
+                                         lit_utf8_size_t utf8_size) /**< size of utf-8 buffer */
+{
+  const lit_utf8_byte_t *cesu8_pos = cesu8_string;
+  const lit_utf8_byte_t *cesu8_end_pos = cesu8_string + cesu8_size;
+
+  lit_utf8_byte_t *utf8_pos = utf8_string;
+  lit_utf8_byte_t *utf8_end_pos = utf8_string + utf8_size;
+
+  lit_utf8_size_t size = 0;
+
+  ecma_char_t prev_ch = 0;
+  lit_utf8_size_t prev_ch_size = 0;
+
+  while (cesu8_pos < cesu8_end_pos)
+  {
+    ecma_char_t ch;
+    lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch);
+
+    if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
+    {
+      JERRY_ASSERT (code_unit_size == prev_ch_size);
+      utf8_pos -= prev_ch_size; /* overwrite the high surrogate copied in the previous iteration */
+      lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (prev_ch, ch);
+      lit_code_point_to_utf8 (code_point, utf8_pos);
+      size++; /* the merged code point is one byte longer than the high surrogate it replaces */
+    }
+    else
+    {
+      memcpy (utf8_pos, cesu8_pos, code_unit_size);
+      size += code_unit_size;
+    }
+
+    utf8_pos = utf8_string + size;
+    cesu8_pos += code_unit_size;
+    prev_ch = ch;
+    prev_ch_size = code_unit_size;
+  }
+
+  JERRY_ASSERT (cesu8_pos == cesu8_end_pos);
+  JERRY_ASSERT (utf8_pos <= utf8_end_pos);
+
+  return size;
+} /* lit_convert_cesu8_string_to_utf8_string */
+
 /**
  * Convert surrogate pair to code point
  *
diff --git a/jerry-core/lit/lit-strings.h b/jerry-core/lit/lit-strings.h
index dcb2ba008..90f82de42 100644
--- a/jerry-core/lit/lit-strings.h
+++ b/jerry-core/lit/lit-strings.h
@@ -114,6 +114,10 @@ lit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (lit_utf8_byte_t fi
 lit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t code_unit, lit_utf8_byte_t *buf_p);
 lit_utf8_size_t lit_code_point_to_utf8 (lit_code_point_t code_point, lit_utf8_byte_t *buf);
 lit_utf8_size_t lit_code_point_to_cesu8 (lit_code_point_t code_point, lit_utf8_byte_t *buf);
+lit_utf8_size_t lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string,
+                                                         lit_utf8_size_t cesu8_size,
+                                                         lit_utf8_byte_t *utf8_string,
+                                                         lit_utf8_size_t utf8_size);
 lit_code_point_t lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, ecma_char_t low_surrogate);
 
 bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, lit_utf8_size_t string1_size,
diff --git a/tests/unit/test-api.c b/tests/unit/test-api.c
index 7f0bc98de..961e3269f 100644
--- a/tests/unit/test-api.c
+++ b/tests/unit/test-api.c
@@ -345,7 +345,7 @@ main (void)
   args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a \xf0\x90\x90\x80");
   args[1] = jerry_create_string ((jerry_char_t *) "\x73\x74\x72\x3a \xed\xa0\x81\xed\xb0\x80");
 
-  /* these size must be equal */
+  /* These sizes must be equal */
   utf8_sz = jerry_get_string_size (args[0]);
   cesu8_sz = jerry_get_string_size (args[1]);
 
@@ -360,6 +360,26 @@ main (void)
   jerry_release_value (args[0]);
   jerry_release_value (args[1]);
 
+  /* Test jerry_string_to_utf8_char_buffer, test string: 'str: {DESERET CAPITAL LETTER LONG I}' */
+  args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a \xf0\x90\x90\x80");
+  args[1] = jerry_create_string ((jerry_char_t *) "\x73\x74\x72\x3a \xed\xa0\x81\xed\xb0\x80");
+
+  /* These sizes must be equal */
+  utf8_sz = jerry_get_utf8_string_size (args[0]);
+  cesu8_sz = jerry_get_utf8_string_size (args[1]);
+
+  TEST_ASSERT (utf8_sz == cesu8_sz);
+
+  char string_from_utf8_string[utf8_sz];
+  char string_from_cesu8_string[cesu8_sz];
+
+  jerry_string_to_utf8_char_buffer (args[0], (jerry_char_t *) string_from_utf8_string, utf8_sz);
+  jerry_string_to_utf8_char_buffer (args[1], (jerry_char_t *) string_from_cesu8_string, cesu8_sz);
+
+  TEST_ASSERT (!strncmp (string_from_utf8_string, string_from_cesu8_string, utf8_sz));
+  jerry_release_value (args[0]);
+  jerry_release_value (args[1]);
+
   /* Test string: 'str: {MATHEMATICAL FRAKTUR SMALL F}{MATHEMATICAL FRAKTUR SMALL G}' */
   args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 \xf0\x9d\x94\xa4");
 
@@ -372,6 +392,12 @@ main (void)
   TEST_ASSERT (cesu8_length == 10 && utf8_length == 8);
   TEST_ASSERT (cesu8_sz != utf8_sz);
   TEST_ASSERT (utf8_sz == 14 && cesu8_sz == 18);
+
+  char test_string[utf8_sz];
+
+  TEST_ASSERT (jerry_string_to_utf8_char_buffer (args[0], (jerry_char_t *) test_string, utf8_sz) == 14);
+  TEST_ASSERT (!strncmp (test_string, "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 \xf0\x9d\x94\xa4", utf8_sz));
+
   jerry_release_value (args[0]);
 
   /* Test string: 'str: {DESERET CAPITAL LETTER LONG I}' */