diff --git a/docs/02.API-REFERENCE.md b/docs/02.API-REFERENCE.md index 96eefa7bc..708bde012 100644 --- a/docs/02.API-REFERENCE.md +++ b/docs/02.API-REFERENCE.md @@ -1186,6 +1186,7 @@ jerry_get_utf8_string_size (const jerry_value_t value); **See also** - [jerry_create_string_from_utf8](#jerry_create_string_from_utf8) +- [jerry_get_utf8_string_length](#jerry_get_utf8_string_length) ## jerry_get_string_length @@ -1223,6 +1224,44 @@ jerry_get_string_length (const jerry_value_t value); - [jerry_create_string](#jerry_create_string) - [jerry_get_string_size](#jerry_get_string_size) +## jerry_get_utf8_string_length + +**Summary** + +Get the length of a UTF-8 encoded string. Returns zero if the value parameter is not a string. + +*Note*: The difference from [jerry_get_string_length](#jerry_get_string_length) is that it +returns the UTF-8 string length instead of the CESU-8 string length. + +**Prototype** + +```c +jerry_length_t +jerry_get_utf8_string_length (const jerry_value_t value); +``` + +- `value` - input string value +- return value - number of characters in the string + +**Example** + +```c +{ + const jerry_char_t char_array[] = "a string"; + jerry_value_t string = jerry_create_string_from_utf8 (char_array); + + jerry_length_t string_length = jerry_get_utf8_string_length (string); + + ... 
// usage of string_length + + jerry_release_value (string); +} +``` + +**See also** + +- [jerry_create_string_from_utf8](#jerry_create_string_from_utf8) +- [jerry_get_utf8_string_size](#jerry_get_utf8_string_size) ## jerry_string_to_char_buffer diff --git a/jerry-core/ecma/base/ecma-helpers-string.c b/jerry-core/ecma/base/ecma-helpers-string.c index 6f4b44129..2d859c0db 100644 --- a/jerry-core/ecma/base/ecma-helpers-string.c +++ b/jerry-core/ecma/base/ecma-helpers-string.c @@ -1479,6 +1479,57 @@ ecma_string_get_length (const ecma_string_t *string_p) /**< ecma-string */ } } /* ecma_string_get_length */ +/** + * Get the length of the UTF-8 encoded form of an ecma-string (a surrogate pair counts as one character) + * + * @return number of characters in the UTF-8 encoded string + */ +ecma_length_t +ecma_string_get_utf8_length (const ecma_string_t *string_p) /**< ecma-string */ +{ + switch (ECMA_STRING_GET_CONTAINER (string_p)) + { + case ECMA_STRING_CONTAINER_HEAP_UTF8_STRING: + { + if (string_p->u.utf8_string.size == (lit_utf8_size_t) string_p->u.utf8_string.length) + { + return (ecma_length_t) (string_p->u.utf8_string.length); + } + + return lit_get_utf8_length_of_cesu8_string ((const lit_utf8_byte_t *) (string_p + 1), + (lit_utf8_size_t) string_p->u.utf8_string.size); + } + case ECMA_STRING_CONTAINER_HEAP_LONG_UTF8_STRING: + { + const ecma_long_string_t *long_string_p = (const ecma_long_string_t *) string_p; + if (string_p->u.long_utf8_string_size == (lit_utf8_size_t) long_string_p->long_utf8_string_length) + { + return (ecma_length_t) (long_string_p->long_utf8_string_length); + } + + return lit_get_utf8_length_of_cesu8_string ((const lit_utf8_byte_t *) (long_string_p + 1), + (lit_utf8_size_t) string_p->u.long_utf8_string_size); + } + case ECMA_STRING_CONTAINER_UINT32_IN_DESC: + { + return ecma_string_get_number_in_desc_size (string_p->u.uint32_number); + } + case ECMA_STRING_CONTAINER_MAGIC_STRING: + { + JERRY_ASSERT (ECMA_STRING_IS_ASCII (lit_get_magic_string_utf8 (string_p->u.magic_string_id), + lit_get_magic_string_size 
(string_p->u.magic_string_id))); + return lit_get_magic_string_size (string_p->u.magic_string_id); + } + default: + { + JERRY_ASSERT (ECMA_STRING_GET_CONTAINER (string_p) == ECMA_STRING_CONTAINER_MAGIC_STRING_EX); + + return lit_get_utf8_length_of_cesu8_string (lit_get_magic_string_ex_utf8 (string_p->u.magic_string_ex_id), + lit_get_magic_string_ex_size (string_p->u.magic_string_ex_id)); + } + } +} /* ecma_string_get_utf8_length */ + /** * Get size of ecma-string * diff --git a/jerry-core/ecma/base/ecma-helpers.h b/jerry-core/ecma/base/ecma-helpers.h index bdfbaac6f..a4997132d 100644 --- a/jerry-core/ecma/base/ecma-helpers.h +++ b/jerry-core/ecma/base/ecma-helpers.h @@ -195,6 +195,7 @@ extern bool ecma_string_compare_to_property_name (ecma_property_t, jmem_cpointer extern bool ecma_compare_ecma_strings (const ecma_string_t *, const ecma_string_t *); extern bool ecma_compare_ecma_strings_relational (const ecma_string_t *, const ecma_string_t *); extern ecma_length_t ecma_string_get_length (const ecma_string_t *); +extern ecma_length_t ecma_string_get_utf8_length (const ecma_string_t *); extern lit_utf8_size_t ecma_string_get_size (const ecma_string_t *); extern lit_utf8_size_t ecma_string_get_utf8_size (const ecma_string_t *); extern ecma_char_t ecma_string_get_char_at_pos (const ecma_string_t *, ecma_length_t); diff --git a/jerry-core/jerry-api.h b/jerry-core/jerry-api.h index c94175845..fa9dabaf4 100644 --- a/jerry-core/jerry-api.h +++ b/jerry-core/jerry-api.h @@ -210,6 +210,7 @@ double jerry_get_number_value (const jerry_value_t); jerry_size_t jerry_get_string_size (const jerry_value_t); jerry_size_t jerry_get_utf8_string_size (const jerry_value_t); jerry_length_t jerry_get_string_length (const jerry_value_t); +jerry_length_t jerry_get_utf8_string_length (const jerry_value_t); jerry_size_t jerry_string_to_char_buffer (const jerry_value_t, jerry_char_t *, jerry_size_t); /** diff --git a/jerry-core/jerry.c b/jerry-core/jerry.c index 096b44224..0f29a9e89 100644 --- 
a/jerry-core/jerry.c +++ b/jerry-core/jerry.c @@ -1086,6 +1086,27 @@ jerry_get_string_length (const jerry_value_t value) /**< input string */ return ecma_string_get_length (ecma_get_string_from_value (value)); } /* jerry_get_string_length */ +/** + * Get the length, in characters, of the UTF-8 encoded form of a Jerry string + * + * Note: + * Returns 0 if the value parameter is not a string. + * + * @return number of characters in the string + */ +jerry_length_t +jerry_get_utf8_string_length (const jerry_value_t value) /**< input string */ +{ + jerry_assert_api_available (); + + if (!ecma_is_value_string (value)) + { + return 0; + } + + return ecma_string_get_utf8_length (ecma_get_string_from_value (value)); +} /* jerry_get_utf8_string_length */ + /** * Copy the characters of a string into a specified buffer. * diff --git a/jerry-core/lit/lit-strings.c b/jerry-core/lit/lit-strings.c index e65e124d7..5083a9093 100644 --- a/jerry-core/lit/lit-strings.c +++ b/jerry-core/lit/lit-strings.c @@ -292,22 +292,19 @@ lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu { lit_utf8_size_t offset = 0; lit_utf8_size_t utf8_buf_size = cesu8_buf_size; + ecma_char_t prev_ch = 0; while (offset < cesu8_buf_size) { ecma_char_t ch; offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch); - if (lit_is_code_point_utf16_high_surrogate (ch) && (offset < cesu8_buf_size)) + if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch)) { - ecma_char_t next_ch; - offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &next_ch); - - if (lit_is_code_point_utf16_low_surrogate (next_ch)) - { - utf8_buf_size -= 2; - } + utf8_buf_size -= 2; } + + prev_ch = ch; } JERRY_ASSERT (offset == cesu8_buf_size); @@ -315,6 +312,37 @@ lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu return utf8_buf_size; } /* lit_get_utf8_size_of_cesu8_string */ +/** + * Calculate the length of the utf-8 form of a cesu-8 encoded string (a surrogate pair counts as one character) + * + 
* @return number of characters in the utf-8 encoded string + */ +ecma_length_t +lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */ + lit_utf8_size_t cesu8_buf_size) /**< string size */ +{ + lit_utf8_size_t offset = 0; + ecma_length_t utf8_length = 0; + ecma_char_t prev_ch = 0; + + while (offset < cesu8_buf_size) + { + ecma_char_t ch; + offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch); + + if (!lit_is_code_point_utf16_low_surrogate (ch) || !lit_is_code_point_utf16_high_surrogate (prev_ch)) + { + utf8_length++; + } + + prev_ch = ch; + } + + JERRY_ASSERT (offset == cesu8_buf_size); + + return utf8_length; +} /* lit_get_utf8_length_of_cesu8_string */ + /** * Decodes a unicode code point from non-empty utf-8-encoded buffer * diff --git a/jerry-core/lit/lit-strings.h b/jerry-core/lit/lit-strings.h index cf952e8a2..e89670203 100644 --- a/jerry-core/lit/lit-strings.h +++ b/jerry-core/lit/lit-strings.h @@ -99,6 +99,7 @@ lit_utf8_size_t lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *, lit_ /* length */ ecma_length_t lit_utf8_string_length (const lit_utf8_byte_t *, lit_utf8_size_t); +ecma_length_t lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *, lit_utf8_size_t); /* hash */ lit_string_hash_t lit_utf8_string_calc_hash (const lit_utf8_byte_t *, lit_utf8_size_t); diff --git a/tests/unit/test-api.c b/tests/unit/test-api.c index a894e5bf5..6e74be0ab 100644 --- a/tests/unit/test-api.c +++ b/tests/unit/test-api.c @@ -305,7 +305,7 @@ main (void) bool is_ok; jerry_size_t sz, utf8_sz, cesu8_sz; - jerry_length_t cesu8_length; + jerry_length_t cesu8_length, utf8_length; jerry_value_t val_t, val_foo, val_bar, val_A, val_A_prototype, val_a, val_a_foo, val_value_field, val_p, val_np; jerry_value_t val_call_external; jerry_value_t global_obj_val, obj_val; @@ -365,10 +365,12 @@ main (void) args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 \xf0\x9d\x94\xa4"); cesu8_length = 
jerry_get_string_length (args[0]); + utf8_length = jerry_get_utf8_string_length (args[0]); + cesu8_sz = jerry_get_string_size (args[0]); utf8_sz = jerry_get_utf8_string_size (args[0]); - TEST_ASSERT (cesu8_length == 10); + TEST_ASSERT (cesu8_length == 10 && utf8_length == 8); TEST_ASSERT (cesu8_sz != utf8_sz); TEST_ASSERT (utf8_sz == 14 && cesu8_sz == 18); jerry_release_value (args[0]); @@ -377,10 +379,12 @@ main (void) args[0] = jerry_create_string ((jerry_char_t *) "\x73\x74\x72\x3a \xed\xa0\x81\xed\xb0\x80"); cesu8_length = jerry_get_string_length (args[0]); + utf8_length = jerry_get_utf8_string_length (args[0]); + cesu8_sz = jerry_get_string_size (args[0]); utf8_sz = jerry_get_utf8_string_size (args[0]); - TEST_ASSERT (cesu8_length == 7); + TEST_ASSERT (cesu8_length == 7 && utf8_length == 6); TEST_ASSERT (cesu8_sz != utf8_sz); TEST_ASSERT (utf8_sz == 9 && cesu8_sz == 11); @@ -390,9 +394,12 @@ main (void) args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x70\x72\x69\x63\x65\x3a \x31\x30\xe2\x82\xac"); cesu8_length = jerry_get_string_length (args[0]); + utf8_length = jerry_get_utf8_string_length (args[0]); + cesu8_sz = jerry_get_string_size (args[0]); utf8_sz = jerry_get_utf8_string_size (args[0]); + TEST_ASSERT (cesu8_length == utf8_length); TEST_ASSERT (cesu8_length == 10); TEST_ASSERT (cesu8_sz == utf8_sz); TEST_ASSERT (utf8_sz == 12);