Add an API function to calculate the UTF-8 encoded string size from Jerry string. (#1450)

JerryScript-DCO-1.0-Signed-off-by: Robert Sipka rsipka.uszeged@partner.samsung.com
This commit is contained in:
Robert Sipka
2016-11-29 12:25:18 +01:00
committed by GitHub
parent 6a2f54456f
commit 0467239d03
8 changed files with 189 additions and 11 deletions
+37
View File
@@ -1150,6 +1150,43 @@ jerry_get_string_size (const jerry_value_t value);
- [jerry_get_string_length](#jerry_get_string_length)
## jerry_get_utf8_string_size
**Summary**
Get the size of a UTF-8 encoded string. Returns zero if the value parameter is not a string.
*Note*: Unlike [jerry_get_string_size](#jerry_get_string_size), this function returns the size of the
UTF-8 encoded string instead of the size of the CESU-8 encoded string.
**Prototype**
```c
jerry_size_t
jerry_get_utf8_string_size (const jerry_value_t value);
```
- `value` - api value
- return value - number of bytes in the buffer needed to represent the utf8-encoded string.
**Example**
```c
{
const jerry_char_t char_array[] = "a string";
jerry_value_t string = jerry_create_string (char_array);
jerry_size_t string_size = jerry_get_utf8_string_size (string);
... // usage of string_size
jerry_release_value (string);
}
```
**See also**
- [jerry_create_string_from_utf8](#jerry_create_string_from_utf8)
## jerry_get_string_length
**Summary**
@@ -1513,6 +1513,55 @@ ecma_string_get_size (const ecma_string_t *string_p) /**< ecma-string */
}
} /* ecma_string_get_size */
/**
 * Get the UTF-8 encoded string size from ecma-string
 *
 * Note:
 *      For heap strings whose stored byte size equals their length the stored
 *      size is returned directly; otherwise the stored buffer is scanned with
 *      lit_get_utf8_size_of_cesu8_string to compute the UTF-8 size.
 *
 * @return number of bytes in the buffer needed to represent an UTF-8 encoded string
 */
lit_utf8_size_t
ecma_string_get_utf8_size (const ecma_string_t *string_p) /**< ecma-string */
{
  switch (ECMA_STRING_GET_CONTAINER (string_p))
  {
    case ECMA_STRING_CONTAINER_HEAP_UTF8_STRING:
    {
      /* size == length: presumably every code unit is a single byte, so nothing needs converting. */
      if (string_p->u.utf8_string.size == (lit_utf8_size_t) string_p->u.utf8_string.length)
      {
        return (lit_utf8_size_t) string_p->u.utf8_string.size;
      }

      /* The character data is read from directly behind the string header. */
      return lit_get_utf8_size_of_cesu8_string ((const lit_utf8_byte_t *) (string_p + 1),
                                                (lit_utf8_size_t) string_p->u.utf8_string.size);
    }
    case ECMA_STRING_CONTAINER_HEAP_LONG_UTF8_STRING:
    {
      const ecma_long_string_t *long_string_p = (const ecma_long_string_t *) string_p;

      if (string_p->u.long_utf8_string_size == (lit_utf8_size_t) long_string_p->long_utf8_string_length)
      {
        return (lit_utf8_size_t) string_p->u.long_utf8_string_size;
      }

      /* Long strings carry an extended descriptor (it holds long_utf8_string_length),
       * so the character data must be read from behind the long descriptor rather
       * than behind the short header.
       * NOTE(review): assumes the data follows ecma_long_string_t - confirm against its definition. */
      return lit_get_utf8_size_of_cesu8_string ((const lit_utf8_byte_t *) (long_string_p + 1),
                                                (lit_utf8_size_t) string_p->u.long_utf8_string_size);
    }
    case ECMA_STRING_CONTAINER_UINT32_IN_DESC:
    {
      return (lit_utf8_size_t) ecma_string_get_number_in_desc_size (string_p->u.uint32_number);
    }
    case ECMA_STRING_CONTAINER_MAGIC_STRING:
    {
      /* The magic string size is used unchanged (no conversion is applied here). */
      return lit_get_magic_string_size (string_p->u.magic_string_id);
    }
    default:
    {
      JERRY_ASSERT (ECMA_STRING_GET_CONTAINER (string_p) == ECMA_STRING_CONTAINER_MAGIC_STRING_EX);

      /* External magic strings are scanned, unlike the built-in ones above. */
      return lit_get_utf8_size_of_cesu8_string (lit_get_magic_string_ex_utf8 (string_p->u.magic_string_ex_id),
                                                lit_get_magic_string_ex_size (string_p->u.magic_string_ex_id));
    }
  }
} /* ecma_string_get_utf8_size */
/**
* Get character from specified position in the ecma-string.
*
+1
View File
@@ -196,6 +196,7 @@ extern bool ecma_compare_ecma_strings (const ecma_string_t *, const ecma_string_
extern bool ecma_compare_ecma_strings_relational (const ecma_string_t *, const ecma_string_t *);
extern ecma_length_t ecma_string_get_length (const ecma_string_t *);
/* Size in bytes of the string as stored (the API documentation refers to this as the cesu-8 size). */
extern lit_utf8_size_t ecma_string_get_size (const ecma_string_t *);
/* Number of bytes needed to represent the string when encoded as UTF-8. */
extern lit_utf8_size_t ecma_string_get_utf8_size (const ecma_string_t *);
extern ecma_char_t ecma_string_get_char_at_pos (const ecma_string_t *, ecma_length_t);
extern ecma_string_t *ecma_get_magic_string (lit_magic_string_id_t);
+1
View File
@@ -208,6 +208,7 @@ double jerry_get_number_value (const jerry_value_t);
* Functions for string values
*/
/* Size in bytes of the string value (cesu-8 size according to the API documentation). */
jerry_size_t jerry_get_string_size (const jerry_value_t);
/* Size in bytes of the string value when encoded as UTF-8; 0 if the value is not a string. */
jerry_size_t jerry_get_utf8_string_size (const jerry_value_t);
jerry_length_t jerry_get_string_length (const jerry_value_t);
jerry_size_t jerry_string_to_char_buffer (const jerry_value_t, jerry_char_t *, jerry_size_t);
+21
View File
@@ -1044,6 +1044,27 @@ jerry_get_string_size (const jerry_value_t value) /**< input string */
return ecma_string_get_size (ecma_get_string_from_value (value));
} /* jerry_get_string_size */
/**
 * Get the size of a Jerry string as it would be when encoded in UTF-8
 *
 * Note:
 *      Returns 0, if the value parameter is not a string.
 *
 * @return number of bytes in the buffer needed to represent the UTF-8 encoded string
 */
jerry_size_t
jerry_get_utf8_string_size (const jerry_value_t value) /**< input string */
{
  jerry_assert_api_available ();

  /* Non-string values report a size of zero. */
  jerry_size_t utf8_size = 0;

  if (ecma_is_value_string (value))
  {
    utf8_size = ecma_string_get_utf8_size (ecma_get_string_from_value (value));
  }

  return utf8_size;
} /* jerry_get_utf8_string_size */
/**
* Get length of Jerry string
*
+34
View File
@@ -281,6 +281,40 @@ lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
return length;
} /* lit_utf8_string_length */
/**
 * Calculate the required size of an utf-8 encoded string from cesu-8 encoded string
 *
 * Note:
 *      The result starts from the cesu-8 size and is reduced by 2 for every
 *      high surrogate that is immediately followed by a low surrogate
 *      (such a pair is stored as two code units, but as one 4-byte sequence in utf-8).
 *      Unpaired surrogates do not change the size.
 *
 * @return size of an utf-8 encoded string
 */
lit_utf8_size_t
lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
                                   lit_utf8_size_t cesu8_buf_size) /**< string size */
{
  lit_utf8_size_t offset = 0;
  lit_utf8_size_t utf8_buf_size = cesu8_buf_size;
  ecma_char_t prev_ch = 0; /* 0 is not a surrogate, so the first code unit never completes a pair */

  while (offset < cesu8_buf_size)
  {
    ecma_char_t ch;
    offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch);

    /* Compare against the previous code unit instead of reading ahead:
     * a code unit following a high surrogate must stay available as the
     * potential start of the next pair (e.g. high, high, low). */
    if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
    {
      utf8_buf_size -= 2;
    }

    prev_ch = ch;
  }

  JERRY_ASSERT (offset == cesu8_buf_size);

  return utf8_buf_size;
} /* lit_get_utf8_size_of_cesu8_string */
/**
* Decodes a unicode code point from non-empty utf-8-encoded buffer
*
+1
View File
@@ -95,6 +95,7 @@ bool lit_is_code_point_utf16_high_surrogate (lit_code_point_t);
/* size */
lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *);
/* Size of the utf-8 encoded form of a cesu-8 encoded buffer (buffer pointer, buffer size in bytes). */
lit_utf8_size_t lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *, lit_utf8_size_t);
/* length */
ecma_length_t lit_utf8_string_length (const lit_utf8_byte_t *, lit_utf8_size_t);
+45 -11
View File
@@ -304,7 +304,8 @@ main (void)
TEST_INIT ();
bool is_ok;
jerry_size_t sz;
jerry_size_t sz, utf8_sz, cesu8_sz;
jerry_length_t cesu8_length;
jerry_value_t val_t, val_foo, val_bar, val_A, val_A_prototype, val_a, val_a_foo, val_value_field, val_p, val_np;
jerry_value_t val_call_external;
jerry_value_t global_obj_val, obj_val;
@@ -339,17 +340,20 @@ main (void)
TEST_ASSERT (sz == 0);
jerry_release_value (args[0]);
// Test create_jerry_string_from_utf8 with 4-byte long unicode sequences
args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a\xf0\x90\x90\x80");
args[1] = jerry_create_string ((jerry_char_t *) "\x73\x74\x72\x3a\xed\xa0\x81\xed\xb0\x80");
/* Test create_jerry_string_from_utf8 with 4-byte long unicode sequences,
* test string: 'str: {DESERET CAPITAL LETTER LONG I}'
*/
args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a \xf0\x90\x90\x80");
args[1] = jerry_create_string ((jerry_char_t *) "\x73\x74\x72\x3a \xed\xa0\x81\xed\xb0\x80");
jerry_size_t utf8_sz = jerry_get_string_size (args[0]);
jerry_size_t cesu8_sz = jerry_get_string_size (args[1]);
/* these size must be equal */
utf8_sz = jerry_get_string_size (args[0]);
cesu8_sz = jerry_get_string_size (args[1]);
char string_from_utf8[utf8_sz];
char string_from_cesu8[cesu8_sz];
jerry_string_to_char_buffer (args[1], (jerry_char_t *) string_from_utf8, utf8_sz);
jerry_string_to_char_buffer (args[0], (jerry_char_t *) string_from_utf8, utf8_sz);
jerry_string_to_char_buffer (args[1], (jerry_char_t *) string_from_cesu8, cesu8_sz);
TEST_ASSERT (utf8_sz == cesu8_sz);
@@ -357,11 +361,41 @@ main (void)
jerry_release_value (args[0]);
jerry_release_value (args[1]);
// Test create_jerry_string_from_utf8 with 4-byte long unicode sequences
args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a\xf0\x9d\x94\xa3\xf0\x9d\x94\xa4");
jerry_length_t cesu8_length = jerry_get_string_length (args[0]);
/* Test string: 'str: {MATHEMATICAL FRAKTUR SMALL F}{MATHEMATICAL FRAKTUR SMALL G}' */
args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 \xf0\x9d\x94\xa4");
TEST_ASSERT (cesu8_length == 8);
cesu8_length = jerry_get_string_length (args[0]);
cesu8_sz = jerry_get_string_size (args[0]);
utf8_sz = jerry_get_utf8_string_size (args[0]);
TEST_ASSERT (cesu8_length == 10);
TEST_ASSERT (cesu8_sz != utf8_sz);
TEST_ASSERT (utf8_sz == 14 && cesu8_sz == 18);
jerry_release_value (args[0]);
/* Test string: 'str: {DESERET CAPITAL LETTER LONG I}' */
args[0] = jerry_create_string ((jerry_char_t *) "\x73\x74\x72\x3a \xed\xa0\x81\xed\xb0\x80");
cesu8_length = jerry_get_string_length (args[0]);
cesu8_sz = jerry_get_string_size (args[0]);
utf8_sz = jerry_get_utf8_string_size (args[0]);
TEST_ASSERT (cesu8_length == 7);
TEST_ASSERT (cesu8_sz != utf8_sz);
TEST_ASSERT (utf8_sz == 9 && cesu8_sz == 11);
jerry_release_value (args[0]);
/* Test string: 'price: 10{EURO SIGN}' */
args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x70\x72\x69\x63\x65\x3a \x31\x30\xe2\x82\xac");
cesu8_length = jerry_get_string_length (args[0]);
cesu8_sz = jerry_get_string_size (args[0]);
utf8_sz = jerry_get_utf8_string_size (args[0]);
TEST_ASSERT (cesu8_length == 10);
TEST_ASSERT (cesu8_sz == utf8_sz);
TEST_ASSERT (utf8_sz == 12);
jerry_release_value (args[0]);
// Get global.boo (non-existing field)