From 0467239d03b21f191f568eecff8dfd6b2fa9d9dc Mon Sep 17 00:00:00 2001 From: Robert Sipka Date: Tue, 29 Nov 2016 12:25:18 +0100 Subject: [PATCH] Add an API function to calculate the UTF-8 encoded string size from Jerry string. (#1450) JerryScript-DCO-1.0-Signed-off-by: Robert Sipka rsipka.uszeged@partner.samsung.com --- docs/02.API-REFERENCE.md | 37 ++++++++++++++ jerry-core/ecma/base/ecma-helpers-string.c | 49 +++++++++++++++++++ jerry-core/ecma/base/ecma-helpers.h | 1 + jerry-core/jerry-api.h | 1 + jerry-core/jerry.c | 21 ++++++++ jerry-core/lit/lit-strings.c | 34 +++++++++++++ jerry-core/lit/lit-strings.h | 1 + tests/unit/test-api.c | 56 +++++++++++++++++----- 8 files changed, 189 insertions(+), 11 deletions(-) diff --git a/docs/02.API-REFERENCE.md b/docs/02.API-REFERENCE.md index 3b9a7354d..96eefa7bc 100644 --- a/docs/02.API-REFERENCE.md +++ b/docs/02.API-REFERENCE.md @@ -1150,6 +1150,43 @@ jerry_get_string_size (const jerry_value_t value); - [jerry_get_string_length](#jerry_get_string_length) +## jerry_get_utf8_string_size + +**Summary** + +Get the size of a UTF-8 encoded string. Returns zero, if the value parameter is not a string. + +*Note*: Unlike [jerry_get_string_size](#jerry_get_string_size), this function returns the UTF-8 string size +instead of the CESU-8 string size. + +**Prototype** + +```c +jerry_size_t +jerry_get_utf8_string_size (const jerry_value_t value); +``` +- `value` - api value +- return value - number of bytes in the buffer needed to represent the UTF-8 encoded string. + +**Example** + +```c +{ + const jerry_char_t char_array[] = "a string"; + jerry_value_t string = jerry_create_string (char_array); + + jerry_size_t string_size = jerry_get_utf8_string_size (string); + + ... 
// usage of string_size + + jerry_release_value (string); +} +``` + +**See also** + +- [jerry_create_string_from_utf8](#jerry_create_string_from_utf8) + ## jerry_get_string_length **Summary** diff --git a/jerry-core/ecma/base/ecma-helpers-string.c b/jerry-core/ecma/base/ecma-helpers-string.c index a89354f5e..6f4b44129 100644 --- a/jerry-core/ecma/base/ecma-helpers-string.c +++ b/jerry-core/ecma/base/ecma-helpers-string.c @@ -1513,6 +1513,55 @@ ecma_string_get_size (const ecma_string_t *string_p) /**< ecma-string */ } } /* ecma_string_get_size */ +/** + * Get the UTF-8 encoded string size from ecma-string + * + * @return number of bytes in the buffer needed to represent a UTF-8 encoded string + */ +lit_utf8_size_t +ecma_string_get_utf8_size (const ecma_string_t *string_p) /**< ecma-string */ +{ + switch (ECMA_STRING_GET_CONTAINER (string_p)) + { + case ECMA_STRING_CONTAINER_HEAP_UTF8_STRING: + { + if (string_p->u.utf8_string.size == (lit_utf8_size_t) string_p->u.utf8_string.length) + { /* size == length: only one-byte characters, nothing to convert */ + return (lit_utf8_size_t) string_p->u.utf8_string.size; + } + + return lit_get_utf8_size_of_cesu8_string ((const lit_utf8_byte_t *) (string_p + 1), + (lit_utf8_size_t) string_p->u.utf8_string.size); + } + case ECMA_STRING_CONTAINER_HEAP_LONG_UTF8_STRING: + { /* character data starts after the whole ecma_long_string_t header */ + const ecma_long_string_t *long_string_p = (const ecma_long_string_t *) string_p; + if (string_p->u.long_utf8_string_size == (lit_utf8_size_t) long_string_p->long_utf8_string_length) + { /* size == length: only one-byte characters, nothing to convert */ + return (lit_utf8_size_t) string_p->u.long_utf8_string_size; + } + + return lit_get_utf8_size_of_cesu8_string ((const lit_utf8_byte_t *) (long_string_p + 1), + (lit_utf8_size_t) string_p->u.long_utf8_string_size); + } + case ECMA_STRING_CONTAINER_UINT32_IN_DESC: + { + return (lit_utf8_size_t) ecma_string_get_number_in_desc_size (string_p->u.uint32_number); + } + case ECMA_STRING_CONTAINER_MAGIC_STRING: + { + return lit_get_magic_string_size (string_p->u.magic_string_id); + } + default: + { + JERRY_ASSERT (ECMA_STRING_GET_CONTAINER (string_p) == 
ECMA_STRING_CONTAINER_MAGIC_STRING_EX); + + return lit_get_utf8_size_of_cesu8_string (lit_get_magic_string_ex_utf8 (string_p->u.magic_string_ex_id), + lit_get_magic_string_ex_size (string_p->u.magic_string_ex_id)); + } + } +} /* ecma_string_get_utf8_size */ + /** * Get character from specified position in the ecma-string. * diff --git a/jerry-core/ecma/base/ecma-helpers.h b/jerry-core/ecma/base/ecma-helpers.h index 81bee4ce8..bdfbaac6f 100644 --- a/jerry-core/ecma/base/ecma-helpers.h +++ b/jerry-core/ecma/base/ecma-helpers.h @@ -196,6 +196,7 @@ extern bool ecma_compare_ecma_strings (const ecma_string_t *, const ecma_string_ extern bool ecma_compare_ecma_strings_relational (const ecma_string_t *, const ecma_string_t *); extern ecma_length_t ecma_string_get_length (const ecma_string_t *); extern lit_utf8_size_t ecma_string_get_size (const ecma_string_t *); +extern lit_utf8_size_t ecma_string_get_utf8_size (const ecma_string_t *); extern ecma_char_t ecma_string_get_char_at_pos (const ecma_string_t *, ecma_length_t); extern ecma_string_t *ecma_get_magic_string (lit_magic_string_id_t); diff --git a/jerry-core/jerry-api.h b/jerry-core/jerry-api.h index 8037fab7e..c94175845 100644 --- a/jerry-core/jerry-api.h +++ b/jerry-core/jerry-api.h @@ -208,6 +208,7 @@ double jerry_get_number_value (const jerry_value_t); * Functions for string values */ jerry_size_t jerry_get_string_size (const jerry_value_t); +jerry_size_t jerry_get_utf8_string_size (const jerry_value_t); jerry_length_t jerry_get_string_length (const jerry_value_t); jerry_size_t jerry_string_to_char_buffer (const jerry_value_t, jerry_char_t *, jerry_size_t); diff --git a/jerry-core/jerry.c b/jerry-core/jerry.c index cc491644c..096b44224 100644 --- a/jerry-core/jerry.c +++ b/jerry-core/jerry.c @@ -1044,6 +1044,27 @@ jerry_get_string_size (const jerry_value_t value) /**< input string */ return ecma_string_get_size (ecma_get_string_from_value (value)); } /* jerry_get_string_size */ +/** + * Get UTF-8 encoded string 
size from Jerry string + * + * Note: + * Returns 0, if the value parameter is not a string. + * + * @return number of bytes in the buffer needed to represent the UTF-8 encoded string + */ +jerry_size_t +jerry_get_utf8_string_size (const jerry_value_t value) /**< input string */ +{ + jerry_assert_api_available (); + + if (!ecma_is_value_string (value)) + { + return 0; + } + + return ecma_string_get_utf8_size (ecma_get_string_from_value (value)); +} /* jerry_get_utf8_string_size */ + /** * Get length of Jerry string * diff --git a/jerry-core/lit/lit-strings.c b/jerry-core/lit/lit-strings.c index 26c2073d6..993fe91cd 100644 --- a/jerry-core/lit/lit-strings.c +++ b/jerry-core/lit/lit-strings.c @@ -281,6 +281,40 @@ lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */ return length; } /* lit_utf8_string_length */ +/** + * Calculate the size of the utf-8 encoded form of a cesu-8 encoded string + * + * Note: + * A high surrogate directly followed by a low surrogate shrinks by 2 bytes when the + * pair is merged into one utf-8 sequence; every other code unit keeps its size. + * + * @return size of the utf-8 encoded string in bytes + */ +lit_utf8_size_t +lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */ + lit_utf8_size_t cesu8_buf_size) /**< string size */ +{ + lit_utf8_size_t offset = 0; + lit_utf8_size_t utf8_buf_size = cesu8_buf_size; + ecma_char_t prev_ch = 0; /* 0 is not a surrogate, so the first code unit never pairs */ + + while (offset < cesu8_buf_size) + { + ecma_char_t ch; + offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch); + + if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch)) + { + utf8_buf_size -= 2; + } + prev_ch = ch; /* remember every unit so a lone high surrogate cannot hide the pair after it */ + } + + JERRY_ASSERT (offset == cesu8_buf_size); + + return utf8_buf_size; +} /* lit_get_utf8_size_of_cesu8_string */ + /** * Decodes a unicode code point from non-empty utf-8-encoded buffer * diff --git a/jerry-core/lit/lit-strings.h b/jerry-core/lit/lit-strings.h index 3bd49b4b7..cf952e8a2 100644 --- a/jerry-core/lit/lit-strings.h +++ b/jerry-core/lit/lit-strings.h @@ -95,6 
+95,7 @@ bool lit_is_code_point_utf16_high_surrogate (lit_code_point_t); /* size */ lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *); +lit_utf8_size_t lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *, lit_utf8_size_t); /* length */ ecma_length_t lit_utf8_string_length (const lit_utf8_byte_t *, lit_utf8_size_t); diff --git a/tests/unit/test-api.c b/tests/unit/test-api.c index f515d5514..4aed9bfbe 100644 --- a/tests/unit/test-api.c +++ b/tests/unit/test-api.c @@ -304,7 +304,8 @@ main (void) TEST_INIT (); bool is_ok; - jerry_size_t sz; + jerry_size_t sz, utf8_sz, cesu8_sz; + jerry_length_t cesu8_length; jerry_value_t val_t, val_foo, val_bar, val_A, val_A_prototype, val_a, val_a_foo, val_value_field, val_p, val_np; jerry_value_t val_call_external; jerry_value_t global_obj_val, obj_val; @@ -339,17 +340,20 @@ main (void) TEST_ASSERT (sz == 0); jerry_release_value (args[0]); - // Test create_jerry_string_from_utf8 with 4-byte long unicode sequences - args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a\xf0\x90\x90\x80"); - args[1] = jerry_create_string ((jerry_char_t *) "\x73\x74\x72\x3a\xed\xa0\x81\xed\xb0\x80"); + /* Test jerry_create_string_from_utf8 with 4-byte long unicode sequences, + * test string: 'str: {DESERET CAPITAL LETTER LONG I}' + */ + args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a \xf0\x90\x90\x80"); + args[1] = jerry_create_string ((jerry_char_t *) "\x73\x74\x72\x3a \xed\xa0\x81\xed\xb0\x80"); - jerry_size_t utf8_sz = jerry_get_string_size (args[0]); - jerry_size_t cesu8_sz = jerry_get_string_size (args[1]); + /* these sizes must be equal */ + utf8_sz = jerry_get_string_size (args[0]); + cesu8_sz = jerry_get_string_size (args[1]); char string_from_utf8[utf8_sz]; char string_from_cesu8[cesu8_sz]; - jerry_string_to_char_buffer (args[1], (jerry_char_t *) string_from_utf8, utf8_sz); + jerry_string_to_char_buffer (args[0], (jerry_char_t *) string_from_utf8, utf8_sz); 
jerry_string_to_char_buffer (args[1], (jerry_char_t *) string_from_cesu8, cesu8_sz); TEST_ASSERT (utf8_sz == cesu8_sz); @@ -357,11 +361,41 @@ main (void) jerry_release_value (args[0]); jerry_release_value (args[1]); - // Test create_jerry_string_from_utf8 with 4-byte long unicode sequences - args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a\xf0\x9d\x94\xa3\xf0\x9d\x94\xa4"); - jerry_length_t cesu8_length = jerry_get_string_length (args[0]); + /* Test string: 'str: {MATHEMATICAL FRAKTUR SMALL F} {MATHEMATICAL FRAKTUR SMALL G}' */ + args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 \xf0\x9d\x94\xa4"); - TEST_ASSERT (cesu8_length == 8); + cesu8_length = jerry_get_string_length (args[0]); + cesu8_sz = jerry_get_string_size (args[0]); + utf8_sz = jerry_get_utf8_string_size (args[0]); + + TEST_ASSERT (cesu8_length == 10); + TEST_ASSERT (cesu8_sz != utf8_sz); + TEST_ASSERT (utf8_sz == 14 && cesu8_sz == 18); + jerry_release_value (args[0]); + + /* Test string: 'str: {DESERET CAPITAL LETTER LONG I}' */ + args[0] = jerry_create_string ((jerry_char_t *) "\x73\x74\x72\x3a \xed\xa0\x81\xed\xb0\x80"); + + cesu8_length = jerry_get_string_length (args[0]); + cesu8_sz = jerry_get_string_size (args[0]); + utf8_sz = jerry_get_utf8_string_size (args[0]); + + TEST_ASSERT (cesu8_length == 7); + TEST_ASSERT (cesu8_sz != utf8_sz); + TEST_ASSERT (utf8_sz == 9 && cesu8_sz == 11); + + jerry_release_value (args[0]); + + /* Test string: 'price: 10{EURO SIGN}' */ + args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x70\x72\x69\x63\x65\x3a \x31\x30\xe2\x82\xac"); + + cesu8_length = jerry_get_string_length (args[0]); + cesu8_sz = jerry_get_string_size (args[0]); + utf8_sz = jerry_get_utf8_string_size (args[0]); + + TEST_ASSERT (cesu8_length == 10); + TEST_ASSERT (cesu8_sz == utf8_sz); + TEST_ASSERT (utf8_sz == 12); jerry_release_value (args[0]); // Get global.boo (non-existing field)