diff --git a/docs/02.API-REFERENCE.md b/docs/02.API-REFERENCE.md index 768cf4b49..7620349b2 100644 --- a/docs/02.API-REFERENCE.md +++ b/docs/02.API-REFERENCE.md @@ -1406,7 +1406,7 @@ jerry_string_to_utf8_char_buffer (const jerry_value_t value, Copy the characters of a cesu-8 encoded substring into a specified buffer. The '\0' character could occur in character buffer. Returns 0, if the value -parameter is not a string. It will extract the substring beetween the +parameter is not a string. It will extract the substring between the specified start position and the end position (or the end of the string, whichever comes first). @@ -1452,6 +1452,57 @@ jerry_substring_to_char_buffer (const jerry_value_t value, - [jerry_get_string_size](#jerry_get_string_size) - [jerry_get_string_length](#jerry_get_string_length) +## jerry_substring_to_utf8_char_buffer + +**Summary** + +Copy the characters of a utf-8 encoded substring into a specified buffer. +The '\0' character could occur in character buffer. Returns 0, if the value +parameter is not a string. It will extract the substring between the specified +start position and the end position (or the end of the string, whichever +comes first). + +**Prototype** + +```c +jerry_size_t +jerry_substring_to_utf8_char_buffer (const jerry_value_t value, + jerry_length_t start_pos, + jerry_length_t end_pos, + jerry_char_t *buffer_p, + jerry_size_t buffer_size); +``` + +- `value` - input string value +- `start_pos` - position of the first character +- `end_pos` - position after the last character (exclusive) +- `buffer_p` - pointer to output buffer +- `buffer_size` - size of the buffer +- return value - number of bytes, actually copied to the buffer + +**Example** + +```c +{ + jerry_value_t value; + ... 
// create or acquire value + + jerry_size_t req_sz = jerry_get_utf8_string_size (value); + jerry_char_t str_buf_p[req_sz]; + jerry_length_t start_pos = 0; + jerry_length_t end_pos = jerry_get_utf8_string_length (value); + + jerry_substring_to_utf8_char_buffer (value, start_pos, end_pos, str_buf_p, req_sz); + + jerry_release_value (value); +} +``` + +**See also** + +- [jerry_create_string_from_utf8](#jerry_create_string_from_utf8) +- [jerry_get_utf8_string_size](#jerry_get_utf8_string_size) +- [jerry_get_utf8_string_length](#jerry_get_utf8_string_length) # Functions for array object values ## jerry_get_array_length diff --git a/jerry-core/ecma/base/ecma-helpers-string.c b/jerry-core/ecma/base/ecma-helpers-string.c index 6bae6d6b7..09992faf9 100644 --- a/jerry-core/ecma/base/ecma-helpers-string.c +++ b/jerry-core/ecma/base/ecma-helpers-string.c @@ -980,7 +980,7 @@ ecma_string_copy_to_utf8_buffer (const ecma_string_t *string_desc_p, /**< ecma-s } /* ecma_string_copy_to_utf8_buffer */ /** - * Convert ecma-string's contents to a cesu-8 string, extract the parts of the converted string beetween the specified + * Convert ecma-string's contents to a cesu-8 string, extract the parts of the converted string between the specified * start position and the end position (or the end of the string, whichever comes first), and copy these characters * into the buffer. * @@ -1059,6 +1059,137 @@ ecma_substring_copy_to_cesu8_buffer (const ecma_string_t *string_desc_p, /**< ec return size; } /* ecma_substring_copy_to_cesu8_buffer */ +/** + * Convert ecma-string's contents to a utf-8 string, extract the parts of the converted string between the specified + * start position and the end position (or the end of the string, whichever comes first), and copy these characters + * into the buffer. + * + * @return number of bytes, actually copied to the buffer. 
+ */
+lit_utf8_size_t
+ecma_substring_copy_to_utf8_buffer (const ecma_string_t *string_desc_p, /**< ecma-string descriptor */
+                                    ecma_length_t start_pos, /**< position of the first character */
+                                    ecma_length_t end_pos, /**< position after the last character (exclusive) */
+                                    lit_utf8_byte_t *buffer_p, /**< destination buffer pointer
+                                                                * (can be NULL if buffer_size == 0) */
+                                    lit_utf8_size_t buffer_size) /**< size of buffer */
+{
+  JERRY_ASSERT (string_desc_p != NULL);
+  JERRY_ASSERT (string_desc_p->refs_and_container >= ECMA_STRING_REF_ONE);
+  JERRY_ASSERT (buffer_p != NULL || buffer_size == 0);
+
+  lit_utf8_size_t size = 0;
+
+  ecma_length_t utf8_str_length = ecma_string_get_utf8_length (string_desc_p);
+
+  if (start_pos >= utf8_str_length || start_pos >= end_pos)
+  {
+    return 0;
+  }
+
+  if (end_pos > utf8_str_length)
+  {
+    end_pos = utf8_str_length;
+  }
+
+  ECMA_STRING_TO_UTF8_STRING (string_desc_p, cesu8_str_p, cesu8_str_size);
+  ecma_length_t cesu8_str_length = ecma_string_get_length (string_desc_p);
+
+  if (cesu8_str_length == cesu8_str_size)
+  {
+    /* Index from cesu8_str_p instead of advancing it: ECMA_FINALIZE_UTF8_STRING presumably needs the original. */
+    size = end_pos - start_pos;
+
+    if (size > buffer_size)
+    {
+      size = buffer_size;
+    }
+
+    memcpy (buffer_p, cesu8_str_p + start_pos, size);
+  }
+  else
+  {
+    const lit_utf8_byte_t *cesu8_end_pos = cesu8_str_p + cesu8_str_size;
+    const lit_utf8_byte_t *cesu8_pos = cesu8_str_p; /* read cursor; cesu8_str_p itself is never advanced */
+    end_pos -= start_pos; /* from here on end_pos is the number of characters left to copy */
+
+    while (start_pos--)
+    {
+      ecma_char_t ch;
+      lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch);
+
+      cesu8_pos += code_unit_size;
+      if ((cesu8_pos != cesu8_end_pos) && lit_is_code_point_utf16_high_surrogate (ch))
+      {
+        ecma_char_t next_ch;
+        lit_utf8_size_t next_ch_size = lit_read_code_unit_from_utf8 (cesu8_pos, &next_ch);
+        if (lit_is_code_point_utf16_low_surrogate (next_ch))
+        {
+          JERRY_ASSERT (code_unit_size == next_ch_size);
+          cesu8_pos += code_unit_size; /* a surrogate pair is skipped as a single character */
+        }
+      }
+    }
+
+    /* cesu8_pos now points at the first requested character; convert up to end_pos characters. */
+    lit_utf8_byte_t *utf8_pos = buffer_p;
+    lit_utf8_byte_t *utf8_end_pos = buffer_p + buffer_size;
+
+    while (end_pos--)
+    {
+      ecma_char_t ch;
+      lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch);
+
+      if ((size + code_unit_size) > buffer_size)
+      {
+        break;
+      }
+
+      if (((cesu8_pos + code_unit_size) != cesu8_end_pos) && lit_is_code_point_utf16_high_surrogate (ch))
+      {
+        ecma_char_t next_ch;
+        lit_utf8_size_t next_ch_size = lit_read_code_unit_from_utf8 (cesu8_pos + code_unit_size, &next_ch);
+
+        if (lit_is_code_point_utf16_low_surrogate (next_ch))
+        {
+          JERRY_ASSERT (code_unit_size == next_ch_size);
+
+          if ((size + code_unit_size + 1) > buffer_size)
+          {
+            break;
+          }
+
+          cesu8_pos += next_ch_size;
+
+          lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (ch, next_ch);
+          lit_code_point_to_utf8 (code_point, utf8_pos);
+          size += (code_unit_size + 1); /* the merged pair takes one byte more than a single code unit */
+        }
+        else
+        {
+          memcpy (utf8_pos, cesu8_pos, code_unit_size);
+          size += code_unit_size;
+        }
+      }
+      else
+      {
+        memcpy (utf8_pos, cesu8_pos, code_unit_size);
+        size += code_unit_size;
+      }
+
+      utf8_pos = buffer_p + size;
+      cesu8_pos += code_unit_size;
+    }
+
+    JERRY_ASSERT (utf8_pos <= utf8_end_pos);
+  }
+
+  ECMA_FINALIZE_UTF8_STRING (cesu8_str_p, cesu8_str_size);
+  JERRY_ASSERT (size <= buffer_size);
+
+  return size;
+} /* ecma_substring_copy_to_utf8_buffer */
+
 /**
  * Convert ecma-string's contents to a cesu-8 string and put it to the buffer.
  * It is the caller's responsibility to make sure that the string fits in the buffer.
diff --git a/jerry-core/ecma/base/ecma-helpers.h b/jerry-core/ecma/base/ecma-helpers.h index e86222548..093a6b865 100644 --- a/jerry-core/ecma/base/ecma-helpers.h +++ b/jerry-core/ecma/base/ecma-helpers.h @@ -191,6 +191,12 @@ ecma_substring_copy_to_cesu8_buffer (const ecma_string_t *string_desc_p, ecma_length_t end_pos, lit_utf8_byte_t *buffer_p, lit_utf8_size_t buffer_size); +lit_utf8_size_t +ecma_substring_copy_to_utf8_buffer (const ecma_string_t *string_desc_p, + ecma_length_t start_pos, + ecma_length_t end_pos, + lit_utf8_byte_t *buffer_p, + lit_utf8_size_t buffer_size); void ecma_string_to_utf8_bytes (const ecma_string_t *string_desc_p, lit_utf8_byte_t *buffer_p, lit_utf8_size_t buffer_size); const lit_utf8_byte_t *ecma_string_raw_chars (const ecma_string_t *string_p, lit_utf8_size_t *size_p, bool *is_ascii_p); diff --git a/jerry-core/jerry-api.h b/jerry-core/jerry-api.h index 4490a3db8..31436e249 100644 --- a/jerry-core/jerry-api.h +++ b/jerry-core/jerry-api.h @@ -237,6 +237,11 @@ jerry_size_t jerry_substring_to_char_buffer (const jerry_value_t value, jerry_length_t end_pos, jerry_char_t *buffer_p, jerry_size_t buffer_size); +jerry_size_t jerry_substring_to_utf8_char_buffer (const jerry_value_t value, + jerry_length_t start_pos, + jerry_length_t end_pos, + jerry_char_t *buffer_p, + jerry_size_t buffer_size); /** * Functions for array object values diff --git a/jerry-core/jerry.c b/jerry-core/jerry.c index d2e14693b..661e34624 100644 --- a/jerry-core/jerry.c +++ b/jerry-core/jerry.c @@ -1233,6 +1233,40 @@ jerry_substring_to_char_buffer (const jerry_value_t value, /**< input string val buffer_size); } /* jerry_substring_to_char_buffer */ +/** + * Copy the characters of a utf-8 encoded substring into a specified buffer. + * + * Note: + * The '\0' character could occur anywhere in the returned string + * Returns 0, if the value parameter is not a string or buffer_p is NULL. 
+ * It will extract the substring between the specified start position + * and the end position (or the end of the string, whichever comes first). + * + * @return number of bytes copied to the buffer. + */ +jerry_size_t +jerry_substring_to_utf8_char_buffer (const jerry_value_t value, /**< input string value */ + jerry_length_t start_pos, /**< position of the first character */ + jerry_length_t end_pos, /**< position after the last character (exclusive) */ + jerry_char_t *buffer_p, /**< [out] output characters buffer */ + jerry_size_t buffer_size) /**< size of output buffer */ +{ + jerry_assert_api_available (); + + if (!ecma_is_value_string (value) || buffer_p == NULL) + { + return 0; + } + + ecma_string_t *str_p = ecma_get_string_from_value (value); + + return ecma_substring_copy_to_utf8_buffer (str_p, + start_pos, + end_pos, + (lit_utf8_byte_t *) buffer_p, + buffer_size); +} /* jerry_substring_to_utf8_char_buffer */ + /** * Checks whether the object or it's prototype objects have the given property. 
* diff --git a/tests/unit/test-api.c b/tests/unit/test-api.c index 4192b2771..81f7fd157 100644 --- a/tests/unit/test-api.c +++ b/tests/unit/test-api.c @@ -398,6 +398,41 @@ main (void) TEST_ASSERT (jerry_string_to_utf8_char_buffer (args[0], (jerry_char_t *) test_string, utf8_sz) == 14); TEST_ASSERT (!strncmp (test_string, "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 \xf0\x9d\x94\xa4", utf8_sz)); + sz = jerry_substring_to_utf8_char_buffer (args[0], 0, utf8_length, (jerry_char_t *) test_string, utf8_sz); + TEST_ASSERT (sz == 14); + TEST_ASSERT (!strncmp (test_string, "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 \xf0\x9d\x94\xa4", sz)); + + sz = jerry_substring_to_utf8_char_buffer (args[0], 0, utf8_length + 1, (jerry_char_t *) test_string, utf8_sz); + TEST_ASSERT (sz == 14); + TEST_ASSERT (!strncmp (test_string, "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 \xf0\x9d\x94\xa4", sz)); + + sz = jerry_substring_to_utf8_char_buffer (args[0], utf8_length, 0, (jerry_char_t *) test_string, utf8_sz); + TEST_ASSERT (sz == 0); + + sz = jerry_substring_to_utf8_char_buffer (args[0], 0, utf8_length, (jerry_char_t *) test_string, utf8_sz - 1); + TEST_ASSERT (sz == 10); + TEST_ASSERT (!strncmp (test_string, "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 ", sz)); + + sz = jerry_substring_to_utf8_char_buffer (args[0], 0, utf8_length - 1, (jerry_char_t *) test_string, utf8_sz); + TEST_ASSERT (sz == 10); + TEST_ASSERT (!strncmp (test_string, "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 ", sz)); + + sz = jerry_substring_to_utf8_char_buffer (args[0], + utf8_length - 2, + utf8_length - 1, + (jerry_char_t *) test_string, + utf8_sz); + TEST_ASSERT (sz == 1); + TEST_ASSERT (!strncmp (test_string, " ", sz)); + + sz = jerry_substring_to_utf8_char_buffer (args[0], + utf8_length - 3, + utf8_length - 2, + (jerry_char_t *) test_string, + utf8_sz); + TEST_ASSERT (sz == 4); + TEST_ASSERT (!strncmp (test_string, "\xf0\x9d\x94\xa3", sz)); + jerry_release_value (args[0]); /* Test string: 'str: {DESERET CAPITAL LETTER LONG I}' */