Copy the characters of an UTF-8 encoded substring into a specified buffer (#1524)

JerryScript-DCO-1.0-Signed-off-by: Robert Sipka rsipka.uszeged@partner.samsung.com
2017-01-24 15:04:48 +01:00
parent 976c8aee80
commit 124582793f
6 changed files with 264 additions and 2 deletions
@@ -1406,7 +1406,7 @@ jerry_string_to_utf8_char_buffer (const jerry_value_t value,
 Copy the characters of a cesu-8 encoded substring into a specified buffer.
 The '\0' character could occur in character buffer. Returns 0, if the value
-parameter is not a string. It will extract the substring beetween the
+parameter is not a string. It will extract the substring between the
 specified start position and the end position (or the end of the string,
 whichever comes first).
@@ -1452,6 +1452,57 @@ jerry_substring_to_char_buffer (const jerry_value_t value,
 - [jerry_get_string_size](#jerry_get_string_size)
 - [jerry_get_string_length](#jerry_get_string_length)
 ## jerry_substring_to_utf8_char_buffer
 **Summary**
 Copy the characters of an utf-8 encoded substring into a specified buffer.
 The '\0' character could occur in character buffer. Returns 0, if the value
 parameter is not a string. It will extract the substring between the specified
 start position and the end position (or the end of the string, whichever
 comes first).
 **Prototype**
 ```c
 jerry_size_t
 jerry_substring_to_utf8_char_buffer (const jerry_value_t value,
                                     jerry_length_t start_pos,
                                     jerry_length_t end_pos,
                                     jerry_char_t *buffer_p,
                                     jerry_size_t buffer_size);
 ```
 - `value` - input string value
 - `start_pos` - position of the first character
 - `end_pos` - position just after the last character to copy (the character at `end_pos` itself is not copied)
 - `buffer_p` - pointer to output buffer
 - `buffer_size` - size of the buffer
 - return value - number of bytes, actually copied to the buffer
 **Example**
 ```c
 {
  jerry_value_t value;
  ... // create or acquire value
  jerry_size_t req_sz = jerry_get_utf8_string_size (value);
  jerry_char_t str_buf_p[req_sz];
  jerry_length_t start_pos = 0;
  jerry_length_t end_pos = jerry_get_utf8_string_length (value);
  jerry_substring_to_utf8_char_buffer (value, start_pos, end_pos, str_buf_p, req_sz);
  jerry_release_value (value);
 }
 ```
 **See also**
 - [jerry_create_string_from_utf8](#jerry_create_string_from_utf8)
 - [jerry_get_utf8_string_size](#jerry_get_utf8_string_size)
 - [jerry_get_utf8_string_length](#jerry_get_utf8_string_length)
 # Functions for array object values
 ## jerry_get_array_length
@@ -980,7 +980,7 @@ ecma_string_copy_to_utf8_buffer (const ecma_string_t *string_desc_p, /**< ecma-s
 } /* ecma_string_copy_to_utf8_buffer */
 /**
- * Convert ecma-string's contents to a cesu-8 string, extract the parts of the converted string beetween the specified
+ * Convert ecma-string's contents to a cesu-8 string, extract the parts of the converted string between the specified
 * start position and the end position (or the end of the string, whichever comes first), and copy these characters
 * into the buffer.
 *
@@ -1059,6 +1059,137 @@ ecma_substring_copy_to_cesu8_buffer (const ecma_string_t *string_desc_p, /**< ec
  return size;
 } /* ecma_substring_copy_to_cesu8_buffer */
 /**
 * Convert ecma-string's contents to an utf-8 string, extract the part of the converted string between the specified
 * start position and the end position (or the end of the string, whichever comes first), and copy these characters
 * into the buffer.
 *
 * Note:
 *      Positions are counted in utf-8 characters, so a surrogate pair of the cesu-8 source counts as one character.
 *      The character at end_pos itself is not copied.
 *      Copying stops before the first character that would not fit into the buffer entirely.
 *
 * @return number of bytes, actually copied to the buffer.
 */
 lit_utf8_size_t
 ecma_substring_copy_to_utf8_buffer (const ecma_string_t *string_desc_p, /**< ecma-string descriptor */
                                    ecma_length_t start_pos, /**< position of the first character */
                                    ecma_length_t end_pos, /**< position just after the last character */
                                    lit_utf8_byte_t *buffer_p, /**< destination buffer pointer
                                                                * (can be NULL if buffer_size == 0) */
                                    lit_utf8_size_t buffer_size) /**< size of buffer */
 {
  JERRY_ASSERT (string_desc_p != NULL);
  JERRY_ASSERT (string_desc_p->refs_and_container >= ECMA_STRING_REF_ONE);
  JERRY_ASSERT (buffer_p != NULL || buffer_size == 0);

  /* Nothing can be stored in an empty buffer. Returning here also keeps a NULL buffer_p
   * away from memcpy and from pointer arithmetic. */
  if (buffer_size == 0)
  {
    return 0;
  }

  lit_utf8_size_t size = 0;
  ecma_length_t utf8_str_length = ecma_string_get_utf8_length (string_desc_p);

  if (start_pos >= utf8_str_length || start_pos >= end_pos)
  {
    return 0;
  }

  if (end_pos > utf8_str_length)
  {
    end_pos = utf8_str_length;
  }

  /* cesu8_str_p is handed back to ECMA_FINALIZE_UTF8_STRING at the end, so it has to keep the value
   * it received here (NOTE(review): presumably the finalizer may release a temporary buffer through
   * it - confirm against the macro). All walking is done through separate cursors. */
  ECMA_STRING_TO_UTF8_STRING (string_desc_p, cesu8_str_p, cesu8_str_size);
  ecma_length_t cesu8_str_length = ecma_string_get_length (string_desc_p);

  if (cesu8_str_length == cesu8_str_size)
  {
    /* Every code unit occupies one byte, so character positions are byte offsets. */
    size = end_pos - start_pos;

    if (size > buffer_size)
    {
      size = buffer_size;
    }

    memcpy (buffer_p, cesu8_str_p + start_pos, size);
  }
  else
  {
    const lit_utf8_byte_t *cesu8_end_pos = cesu8_str_p + cesu8_str_size;
    const lit_utf8_byte_t *cesu8_pos = cesu8_str_p;

    /* From now on end_pos holds the number of characters to copy. */
    end_pos -= start_pos;

    /* Skip the first start_pos characters; a high + low surrogate pair is skipped as one character. */
    while (start_pos--)
    {
      ecma_char_t ch;
      lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch);

      cesu8_pos += code_unit_size;

      if ((cesu8_pos != cesu8_end_pos) && lit_is_code_point_utf16_high_surrogate (ch))
      {
        ecma_char_t next_ch;
        lit_utf8_size_t next_ch_size = lit_read_code_unit_from_utf8 (cesu8_pos, &next_ch);

        if (lit_is_code_point_utf16_low_surrogate (next_ch))
        {
          JERRY_ASSERT (code_unit_size == next_ch_size);
          cesu8_pos += code_unit_size;
        }
      }
    }

    lit_utf8_byte_t *utf8_pos = buffer_p;
    lit_utf8_byte_t *utf8_end_pos = buffer_p + buffer_size;

    while (end_pos--)
    {
      ecma_char_t ch;
      lit_utf8_size_t code_unit_size = lit_read_code_unit_from_utf8 (cesu8_pos, &ch);

      if ((size + code_unit_size) > buffer_size)
      {
        break;
      }

      if (((cesu8_pos + code_unit_size) != cesu8_end_pos) && lit_is_code_point_utf16_high_surrogate (ch))
      {
        ecma_char_t next_ch;
        lit_utf8_size_t next_ch_size = lit_read_code_unit_from_utf8 (cesu8_pos + code_unit_size, &next_ch);

        if (lit_is_code_point_utf16_low_surrogate (next_ch))
        {
          JERRY_ASSERT (code_unit_size == next_ch_size);

          /* The pair is written as a single utf-8 sequence that is one byte longer than one code unit. */
          if ((size + code_unit_size + 1) > buffer_size)
          {
            break;
          }

          cesu8_pos += next_ch_size;

          lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (ch, next_ch);
          lit_code_point_to_utf8 (code_point, utf8_pos);
          size += (code_unit_size + 1);
        }
        else
        {
          /* High surrogate without a low one after it: copied unchanged. */
          memcpy (utf8_pos, cesu8_pos, code_unit_size);
          size += code_unit_size;
        }
      }
      else
      {
        memcpy (utf8_pos, cesu8_pos, code_unit_size);
        size += code_unit_size;
      }

      utf8_pos = buffer_p + size;
      cesu8_pos += code_unit_size;
    }

    JERRY_ASSERT (utf8_pos <= utf8_end_pos);
  }

  ECMA_FINALIZE_UTF8_STRING (cesu8_str_p, cesu8_str_size);

  JERRY_ASSERT (size <= buffer_size);

  return size;
 } /* ecma_substring_copy_to_utf8_buffer */
 /**
 * Convert ecma-string's contents to a cesu-8 string and put it to the buffer.
 * It is the caller's responsibility to make sure that the string fits in the buffer.
@@ -191,6 +191,12 @@ ecma_substring_copy_to_cesu8_buffer (const ecma_string_t *string_desc_p,
                                     ecma_length_t end_pos,
                                     lit_utf8_byte_t *buffer_p,
                                     lit_utf8_size_t buffer_size);
 /* Copy the characters from start_pos up to (not including) end_pos, converted to utf-8, into buffer_p;
  * returns the number of bytes copied. */
 lit_utf8_size_t
 ecma_substring_copy_to_utf8_buffer (const ecma_string_t *string_desc_p,
                                    ecma_length_t start_pos,
                                    ecma_length_t end_pos,
                                    lit_utf8_byte_t *buffer_p,
                                    lit_utf8_size_t buffer_size);
 void ecma_string_to_utf8_bytes (const ecma_string_t *string_desc_p, lit_utf8_byte_t *buffer_p,
                                lit_utf8_size_t buffer_size);
 const lit_utf8_byte_t *ecma_string_raw_chars (const ecma_string_t *string_p, lit_utf8_size_t *size_p, bool *is_ascii_p);
@@ -237,6 +237,11 @@ jerry_size_t jerry_substring_to_char_buffer (const jerry_value_t value,
                                             jerry_length_t end_pos,
                                             jerry_char_t *buffer_p,
                                             jerry_size_t buffer_size);
 /* Copy an utf-8 encoded substring of a string value into buffer_p; returns the number of bytes copied
  * (0 if value is not a string or buffer_p is NULL). */
 jerry_size_t jerry_substring_to_utf8_char_buffer (const jerry_value_t value,
                                                  jerry_length_t start_pos,
                                                  jerry_length_t end_pos,
                                                  jerry_char_t *buffer_p,
                                                  jerry_size_t buffer_size);
 /**
 * Functions for array object values
@@ -1233,6 +1233,40 @@ jerry_substring_to_char_buffer (const jerry_value_t value, /**< input string val
                                              buffer_size);
 } /* jerry_substring_to_char_buffer */
 /**
 * Extract a substring of a string value and store it, utf-8 encoded, in the specified buffer.
 *
 * Note:
 *      The '\0' character could occur anywhere in the returned string.
 *      Returns 0, if the value parameter is not a string or the buffer pointer is NULL.
 *      The substring is taken between the specified start position and the end
 *      position (or the end of the string, whichever comes first).
 *
 * @return number of bytes copied to the buffer.
 */
 jerry_size_t
 jerry_substring_to_utf8_char_buffer (const jerry_value_t value, /**< input string value */
                                     jerry_length_t start_pos, /**< position of the first character */
                                     jerry_length_t end_pos, /**< position just after the last character */
                                     jerry_char_t *buffer_p, /**< [out] output characters buffer */
                                     jerry_size_t buffer_size) /**< size of output buffer */
 {
  jerry_assert_api_available ();

  if (ecma_is_value_string (value) && buffer_p != NULL)
  {
    /* Only string values with a usable destination reach the ecma-level copy routine. */
    lit_utf8_byte_t *utf8_buffer_p = (lit_utf8_byte_t *) buffer_p;
    ecma_string_t *string_p = ecma_get_string_from_value (value);

    return ecma_substring_copy_to_utf8_buffer (string_p, start_pos, end_pos, utf8_buffer_p, buffer_size);
  }

  return 0;
 } /* jerry_substring_to_utf8_char_buffer */
 /**
 * Checks whether the object or its prototype objects have the given property.
 *
@@ -398,6 +398,41 @@ main (void)
  TEST_ASSERT (jerry_string_to_utf8_char_buffer (args[0], (jerry_char_t *) test_string, utf8_sz) == 14);
  TEST_ASSERT (!strncmp (test_string, "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 \xf0\x9d\x94\xa4", utf8_sz));
  sz = jerry_substring_to_utf8_char_buffer (args[0], 0, utf8_length, (jerry_char_t *) test_string, utf8_sz);
  TEST_ASSERT (sz == 14);
  TEST_ASSERT (!strncmp (test_string, "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 \xf0\x9d\x94\xa4", sz));
  sz = jerry_substring_to_utf8_char_buffer (args[0], 0, utf8_length + 1, (jerry_char_t *) test_string, utf8_sz);
  TEST_ASSERT (sz == 14);
  TEST_ASSERT (!strncmp (test_string, "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 \xf0\x9d\x94\xa4", sz));
  sz = jerry_substring_to_utf8_char_buffer (args[0], utf8_length, 0, (jerry_char_t *) test_string, utf8_sz);
  TEST_ASSERT (sz == 0);
  sz = jerry_substring_to_utf8_char_buffer (args[0], 0, utf8_length, (jerry_char_t *) test_string, utf8_sz - 1);
  TEST_ASSERT (sz == 10);
  TEST_ASSERT (!strncmp (test_string, "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 ", sz));
  sz = jerry_substring_to_utf8_char_buffer (args[0], 0, utf8_length - 1, (jerry_char_t *) test_string, utf8_sz);
  TEST_ASSERT (sz == 10);
  TEST_ASSERT (!strncmp (test_string, "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 ", sz));
  sz = jerry_substring_to_utf8_char_buffer (args[0],
                                            utf8_length - 2,
                                            utf8_length - 1,
                                            (jerry_char_t *) test_string,
                                            utf8_sz);
  TEST_ASSERT (sz == 1);
  TEST_ASSERT (!strncmp (test_string, " ", sz));
  sz = jerry_substring_to_utf8_char_buffer (args[0],
                                            utf8_length - 3,
                                            utf8_length - 2,
                                            (jerry_char_t *) test_string,
                                            utf8_sz);
  TEST_ASSERT (sz == 4);
  TEST_ASSERT (!strncmp (test_string, "\xf0\x9d\x94\xa3", sz));
  jerry_release_value (args[0]);
  /* Test string: 'str: {DESERET CAPITAL LETTER LONG I}' */