Change internal encoding of strings to CESU-8

JerryScript-DCO-1.0-Signed-off-by: Zsolt Borbély zsborbely.u-szeged@partner.samsung.com JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai.u-szeged@partner.samsung.com
2015-09-03 13:32:17 +02:00
parent 08c618e8c5
commit dcd610b305
14 changed files with 631 additions and 421 deletions
@@ -414,7 +414,7 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
                                lit_utf8_size_t string_size) /**< string size */
 {
  JERRY_ASSERT (string_p != NULL || string_size == 0);
-  JERRY_ASSERT (lit_is_utf8_string_valid (string_p, string_size));
+  JERRY_ASSERT (lit_is_cesu8_string_valid (string_p, string_size));

  lit_magic_string_id_t magic_string_id;
  if (lit_is_utf8_string_magic (string_p, string_size, &magic_string_id))
@@ -444,7 +444,7 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
 } /* ecma_new_ecma_string_from_utf8 */

 /**
- * Allocate new ecma-string and fill it with utf-8 character which represents specified code unit
+ * Allocate new ecma-string and fill it with cesu-8 character which represents specified code unit
 *
 * @return pointer to ecma-string descriptor
 */
@@ -627,14 +627,7 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */
    jerry_fatal (ERR_OUT_OF_MEMORY);
  }

-  ecma_char_t str1_last_code_unit = ecma_string_get_char_at_pos (string1_p, ecma_string_get_length (string1_p) - 1);
-  ecma_char_t str2_first_code_unit = ecma_string_get_char_at_pos (string2_p, 0);
-
-  bool is_surrogate_pair_sliced = (lit_is_code_unit_high_surrogate (str1_last_code_unit)
-                                   && lit_is_code_unit_low_surrogate (str2_first_code_unit));
-
-  lit_utf8_size_t buffer_size = str1_size + str2_size - (lit_utf8_size_t) (is_surrogate_pair_sliced ?
-                                                                           LIT_UTF8_CESU8_SURROGATE_SIZE_DIF : 0);
+  lit_utf8_size_t buffer_size = str1_size + str2_size;

  lit_utf8_byte_t *str_p = (lit_utf8_byte_t *) mem_heap_alloc_block (buffer_size, MEM_HEAP_ALLOC_SHORT_TERM);

@@ -643,23 +636,9 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */
  bytes_copied1 = ecma_string_to_utf8_string (string1_p, str_p, (ssize_t) str1_size);
  JERRY_ASSERT (bytes_copied1 > 0);

-  if (!is_surrogate_pair_sliced)
-  {
-    bytes_copied2 = ecma_string_to_utf8_string (string2_p, str_p + str1_size, (ssize_t) str2_size);
-    JERRY_ASSERT (bytes_copied2 > 0);
-  }
-  else
-  {
-    bytes_copied2 = ecma_string_to_utf8_string (string2_p,
-                                                str_p + str1_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT + 1,
-                                                (ssize_t) buffer_size - bytes_copied1
-                                                + LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
-    JERRY_ASSERT (bytes_copied2 > 0);
+  bytes_copied2 = ecma_string_to_utf8_string (string2_p, str_p + str1_size, (ssize_t) str2_size);
+  JERRY_ASSERT (bytes_copied2 > 0);

-    lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (str1_last_code_unit,
-                                                                                      str2_first_code_unit);
-    lit_code_point_to_utf8 (surrogate_code_point, str_p + str1_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
-  }
  ecma_string_t *str_concat_p = ecma_new_ecma_string_from_utf8 (str_p, buffer_size);

  mem_heap_free_block ((void*) str_p);
@@ -955,7 +934,7 @@ ecma_string_get_array_index (const ecma_string_t *str_p, /**< ecma-string */
 } /* ecma_string_is_array_index */

 /**
- * Convert ecma-string's contents to a utf-8 string and put it to the buffer.
+ * Convert ecma-string's contents to a cesu-8 string and put it to the buffer.
 *
 * @return number of bytes, actually copied to the buffer - if string's content was copied successfully;
 *         otherwise (in case size of buffer is insufficient) - negative number, which is calculated
@@ -1018,7 +997,6 @@ ecma_string_to_utf8_string (const ecma_string_t *string_desc_p, /**< ecma-string

      break;
    }
-
    case ECMA_STRING_CONTAINER_MAGIC_STRING:
    {
      const lit_magic_string_id_t id = string_desc_p->u.magic_string_id;
@@ -1491,7 +1469,7 @@ ecma_string_get_char_at_pos (const ecma_string_t *string_p, /**< ecma-string */
  ssize_t sz = ecma_string_to_utf8_string (string_p, utf8_str_p, (ssize_t) buffer_size);
  JERRY_ASSERT (sz > 0);

-  ch = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, index);;
+  ch = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, index);

  MEM_FINALIZE_LOCAL_ARRAY (utf8_str_p);

@@ -1682,10 +1660,7 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
  JERRY_ASSERT (end_pos <= string_length);
 #endif

-  const ecma_length_t span = (start_pos > end_pos) ? 0 : end_pos - start_pos;
-  const lit_utf8_size_t utf8_str_size = LIT_UTF8_MAX_BYTES_IN_CODE_UNIT * span;
-
-  if (utf8_str_size)
+  if (start_pos < end_pos)
  {
    /**
     * I. Dump original string to plain buffer
@@ -1701,20 +1676,22 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
    /**
     * II. Extract substring
     */
-    MEM_DEFINE_LOCAL_ARRAY (utf8_substr_buffer, utf8_str_size, lit_utf8_byte_t);
+    lit_utf8_byte_t *start_p = utf8_str_p;
+    end_pos -= start_pos;

-    lit_utf8_size_t utf8_substr_buffer_offset = 0;
-    for (ecma_length_t idx = 0; idx < span; idx++)
+    while (start_pos--)
    {
-      ecma_char_t code_unit = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, start_pos + idx);
-
-      JERRY_ASSERT (utf8_str_size >= utf8_substr_buffer_offset + LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
-      utf8_substr_buffer_offset += lit_code_unit_to_utf8 (code_unit, utf8_substr_buffer + utf8_substr_buffer_offset);
+      start_p += lit_get_unicode_char_size_by_utf8_first_byte (*start_p);
    }

-    ecma_string_p = ecma_new_ecma_string_from_utf8 (utf8_substr_buffer, utf8_substr_buffer_offset);
+    lit_utf8_byte_t *end_p = start_p;
+    while (end_pos--)
+    {
+      end_p += lit_get_unicode_char_size_by_utf8_first_byte (*end_p);
+    }
+
+    ecma_string_p = ecma_new_ecma_string_from_utf8 (start_p, (lit_utf8_size_t) (end_p - start_p));

-    MEM_FINALIZE_LOCAL_ARRAY (utf8_substr_buffer);
    MEM_FINALIZE_LOCAL_ARRAY (utf8_str_p);

    return ecma_string_p;
@@ -1746,47 +1723,47 @@ ecma_string_trim (const ecma_string_t *string_p) /**< pointer to an ecma string
    ssize_t sz = ecma_string_to_utf8_string (string_p, utf8_str_p, (ssize_t) buffer_size);
    JERRY_ASSERT (sz >= 0);

-    lit_utf8_iterator_t front = lit_utf8_iterator_create (utf8_str_p, buffer_size);
-
-    lit_utf8_iterator_t back = lit_utf8_iterator_create (utf8_str_p, buffer_size);
-    lit_utf8_iterator_seek_eos (&back);
-
-    lit_utf8_iterator_pos_t start = lit_utf8_iterator_get_pos (&back);
-    lit_utf8_iterator_pos_t end = lit_utf8_iterator_get_pos (&front);
-
-    ecma_char_t current;
+    ecma_char_t ch;
+    lit_utf8_size_t read_size;
+    lit_utf8_byte_t *nonws_start_p = utf8_str_p + buffer_size;
+    lit_utf8_byte_t *current_p = utf8_str_p;

    /* Trim front. */
-    while (!lit_utf8_iterator_is_eos (&front))
+    while (current_p < nonws_start_p)
    {
-      current = lit_utf8_iterator_read_next (&front);
-      if (!lit_char_is_white_space (current)
-          && !lit_char_is_line_terminator (current))
+      read_size = lit_read_code_unit_from_utf8 (current_p, &ch);
+
+      if (!lit_char_is_white_space (ch)
+          && !lit_char_is_line_terminator (ch))
      {
-        lit_utf8_iterator_decr (&front);
-        start = lit_utf8_iterator_get_pos (&front);
+        nonws_start_p = current_p;
        break;
      }
+
+      current_p += read_size;
    }

+    current_p = utf8_str_p + buffer_size;
+
    /* Trim back. */
-    while (!lit_utf8_iterator_is_bos (&back))
+    while (current_p > utf8_str_p)
    {
-      current = lit_utf8_iterator_read_prev (&back);
-      if (!lit_char_is_white_space (current)
-          && !lit_char_is_line_terminator (current))
+      read_size = lit_read_prev_code_unit_from_utf8 (current_p, &ch);
+
+      if (!lit_char_is_white_space (ch)
+          && !lit_char_is_line_terminator (ch))
      {
-        lit_utf8_iterator_incr (&back);
-        end = lit_utf8_iterator_get_pos (&back);
        break;
      }
+
+      current_p -= read_size;
    }

    /* Construct new string. */
-    if (end.offset > start.offset)
+    if (current_p > nonws_start_p)
    {
-      ret_string_p = ecma_new_ecma_string_from_utf8 (utf8_str_p + start.offset,
-                                                     (lit_utf8_size_t) (end.offset - start.offset));
+      ret_string_p = ecma_new_ecma_string_from_utf8 (nonws_start_p,
+                                                     (lit_utf8_size_t) (current_p - nonws_start_p));
    }
    else
    {