Do not copy the source string in the JSON parser. (#1481)

The JSON parser used to require a zero-terminated, writable copy of the original string. This requirement has been eliminated to reduce peak memory consumption.

JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg.u-szeged@partner.samsung.com
2016-12-13 08:36:01 +01:00
parent fb2818c137
commit 6904b9bd65
2 changed files with 191 additions and 92 deletions
@@ -29,6 +29,7 @@
 #include "jrt.h"
 #include "jrt-libc-includes.h"
 #include "lit-char-helpers.h"
+#include "lit-globals.h"

 #ifndef CONFIG_DISABLE_JSON_BUILTIN

@@ -74,15 +75,15 @@ typedef enum
 typedef struct
 {
  ecma_json_token_type_t type; /**< type of the current token */
-  lit_utf8_byte_t *current_p; /**< current position of the string processed by the parser */
+  const lit_utf8_byte_t *current_p; /**< current position of the string processed by the parser */
  const lit_utf8_byte_t *end_p; /**< end of the string processed by the parser */
+
+  /**
+   * Fields depending on type; only the member matching the current
+   * token type holds a meaningful value.
+   */
  union
  {
-    struct
-    {
-      const lit_utf8_byte_t *start_p; /**< when type is string_token, it contains the start of the string */
-      lit_utf8_size_t size; /**< when type is string_token, it contains the size of the string */
-    } string;
+    ecma_string_t *string_p; /**< when type is string_token, it contains the parsed string as an ecma-string */
    ecma_number_t number; /**< when type is number_token, it contains the value of the number */
  } u;
 } ecma_json_token_t;
@@ -93,19 +94,21 @@ typedef struct
 * @return true if the match is successful
 */
 static bool
-ecma_builtin_json_check_id (lit_utf8_byte_t *string_p, /**< start position */
-                            const char *id_p) /**< string identifier */
+ecma_builtin_json_check_id (const lit_utf8_byte_t *string_p, /**< start position */
+                            const lit_utf8_byte_t *end_p, /**< input end */
+                            const char *string_id_p) /**< string identifier */
 {
  /*
   * String comparison must not depend on lit_utf8_byte_t definition.
   */
-  JERRY_ASSERT (*string_p == *id_p);
+  JERRY_ASSERT (*string_p == *string_id_p);

-  do
+  string_p++;
+  string_id_p++;
+
+  while (string_p < end_p)
  {
-    string_p++;
-    id_p++;
-    if (*id_p == LIT_CHAR_NULL)
+    if (*string_id_p == LIT_CHAR_NULL)
    {
      /* JSON lexer accepts input strings such as falsenull and
       * returns with multiple tokens (false and null in this case).
@@ -116,10 +119,17 @@ ecma_builtin_json_check_id (lit_utf8_byte_t *string_p, /**< start position */
       * type. */
      return true;
    }
-  }
-  while (*string_p == *id_p);

-  return false;
+    if (*string_p != *string_id_p)
+    {
+      return false;
+    }
+
+    string_p++;
+    string_id_p++;
+  }
+
+  return (*string_id_p == LIT_CHAR_NULL);
 } /* ecma_builtin_json_check_id */

 /**
@@ -128,51 +138,40 @@ ecma_builtin_json_check_id (lit_utf8_byte_t *string_p, /**< start position */
 static void
 ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument */
 {
-  lit_utf8_byte_t *current_p = token_p->current_p;
-  lit_utf8_byte_t *write_p = current_p;
+  const lit_utf8_byte_t *current_p = token_p->current_p;
+  const lit_utf8_byte_t *end_p = token_p->end_p;
+  bool has_escape_sequence = false;
+  lit_utf8_size_t buffer_size = 0;

-  token_p->u.string.start_p = current_p;
-
-  while (*current_p != LIT_CHAR_DOUBLE_QUOTE)
+  /* First step: syntax checking. */
+  while (true)
  {
-    if (*current_p <= 0x1f)
+    if (current_p >= end_p || *current_p <= 0x1f)
    {
      return;
    }
+
+    if (*current_p == LIT_CHAR_DOUBLE_QUOTE)
+    {
+      break;
+    }
+
    if (*current_p == LIT_CHAR_BACKSLASH)
    {
      current_p++;
+      has_escape_sequence = true;
+
      switch (*current_p)
      {
        case LIT_CHAR_DOUBLE_QUOTE:
        case LIT_CHAR_SLASH:
        case LIT_CHAR_BACKSLASH:
-        {
-          break;
-        }
        case LIT_CHAR_LOWERCASE_B:
-        {
-          *current_p = LIT_CHAR_BS;
-          break;
-        }
        case LIT_CHAR_LOWERCASE_F:
-        {
-          *current_p = LIT_CHAR_FF;
-          break;
-        }
        case LIT_CHAR_LOWERCASE_N:
-        {
-          *current_p = LIT_CHAR_LF;
-          break;
-        }
        case LIT_CHAR_LOWERCASE_R:
-        {
-          *current_p = LIT_CHAR_CR;
-          break;
-        }
        case LIT_CHAR_LOWERCASE_T:
        {
-          *current_p = LIT_CHAR_TAB;
          break;
        }
        case LIT_CHAR_LOWERCASE_U:
@@ -185,7 +184,9 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
          }

          current_p += 5;
-          write_p += lit_code_unit_to_utf8 (code_unit, write_p);
+
+          lit_utf8_byte_t char_buffer[LIT_UTF8_MAX_BYTES_IN_CODE_UNIT];
+          buffer_size += lit_code_unit_to_utf8 (code_unit, char_buffer);
          continue;
        }
        default:
@@ -194,12 +195,92 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
        }
      }
    }
+
+    buffer_size++;
+    current_p++;
+  }
+
+  token_p->type = string_token;
+
+  if (!has_escape_sequence)
+  {
+    token_p->u.string_p = ecma_new_ecma_string_from_utf8 (token_p->current_p, buffer_size);
+    token_p->current_p = current_p + 1;
+    return;
+  }
+
+  JMEM_DEFINE_LOCAL_ARRAY (buffer_p, buffer_size, lit_utf8_byte_t);
+
+  lit_utf8_byte_t *write_p = buffer_p;
+  current_p = token_p->current_p;
+
+  while (*current_p != LIT_CHAR_DOUBLE_QUOTE)
+  {
+    if (*current_p == LIT_CHAR_BACKSLASH)
+    {
+      current_p++;
+
+      lit_utf8_byte_t special_character;
+
+      switch (*current_p)
+      {
+        case LIT_CHAR_LOWERCASE_B:
+        {
+          special_character = LIT_CHAR_BS;
+          break;
+        }
+        case LIT_CHAR_LOWERCASE_F:
+        {
+          special_character = LIT_CHAR_FF;
+          break;
+        }
+        case LIT_CHAR_LOWERCASE_N:
+        {
+          special_character = LIT_CHAR_LF;
+          break;
+        }
+        case LIT_CHAR_LOWERCASE_R:
+        {
+          special_character = LIT_CHAR_CR;
+          break;
+        }
+        case LIT_CHAR_LOWERCASE_T:
+        {
+          special_character = LIT_CHAR_TAB;
+          break;
+        }
+        case LIT_CHAR_LOWERCASE_U:
+        {
+          ecma_char_t code_unit;
+
+          lit_read_code_unit_from_hex (current_p + 1, 4, &code_unit);
+
+          current_p += 5;
+          write_p += lit_code_unit_to_utf8 (code_unit, write_p);
+          continue;
+        }
+        default:
+        {
+          special_character = *current_p;
+          break;
+        }
+      }
+
+      *write_p++ = special_character;
+      current_p++;
+      continue;
+    }
+
    *write_p++ = *current_p++;
  }

-  token_p->u.string.size = (lit_utf8_size_t) (write_p - token_p->u.string.start_p);
+  JERRY_ASSERT (write_p == buffer_p + buffer_size);
+
+  token_p->u.string_p = ecma_new_ecma_string_from_utf8 (buffer_p, buffer_size);
+
+  JMEM_FINALIZE_LOCAL_ARRAY (buffer_p);
+
  token_p->current_p = current_p + 1;
-  token_p->type = string_token;
 } /* ecma_builtin_json_parse_string */

 /**
@@ -208,18 +289,27 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
 static void
 ecma_builtin_json_parse_number (ecma_json_token_t *token_p) /**< token argument */
 {
-  lit_utf8_byte_t *current_p = token_p->current_p;
-  lit_utf8_byte_t *start_p = current_p;
+  const lit_utf8_byte_t *current_p = token_p->current_p;
+  const lit_utf8_byte_t *end_p = token_p->end_p;
+  const lit_utf8_byte_t *start_p = current_p;
+
+  JERRY_ASSERT (current_p < end_p);

  if (*current_p == LIT_CHAR_MINUS)
  {
    current_p++;
  }

+  if (current_p >= end_p)
+  {
+    return;
+  }
+
  if (*current_p == LIT_CHAR_0)
  {
    current_p++;
-    if (lit_char_is_decimal_digit (*current_p))
+
+    if (current_p < end_p && lit_char_is_decimal_digit (*current_p))
    {
      return;
    }
@@ -230,13 +320,14 @@ ecma_builtin_json_parse_number (ecma_json_token_t *token_p) /**< token argument
    {
      current_p++;
    }
-    while (lit_char_is_decimal_digit (*current_p));
+    while (current_p < end_p && lit_char_is_decimal_digit (*current_p));
  }

-  if (*current_p == LIT_CHAR_DOT)
+  if (current_p < end_p && *current_p == LIT_CHAR_DOT)
  {
    current_p++;
-    if (!lit_char_is_decimal_digit (*current_p))
+
+    if (current_p >= end_p || !lit_char_is_decimal_digit (*current_p))
    {
      return;
    }
@@ -245,18 +336,19 @@ ecma_builtin_json_parse_number (ecma_json_token_t *token_p) /**< token argument
    {
      current_p++;
    }
-    while (lit_char_is_decimal_digit (*current_p));
+    while (current_p < end_p && lit_char_is_decimal_digit (*current_p));
  }

-  if (*current_p == LIT_CHAR_LOWERCASE_E || *current_p == LIT_CHAR_UPPERCASE_E)
+  if (current_p < end_p && (*current_p == LIT_CHAR_LOWERCASE_E || *current_p == LIT_CHAR_UPPERCASE_E))
  {
    current_p++;
-    if (*current_p == LIT_CHAR_PLUS || *current_p == LIT_CHAR_MINUS)
+
+    if (current_p < end_p && (*current_p == LIT_CHAR_PLUS || *current_p == LIT_CHAR_MINUS))
    {
      current_p++;
    }

-    if (!lit_char_is_decimal_digit (*current_p))
+    if (current_p >= end_p || !lit_char_is_decimal_digit (*current_p))
    {
      return;
    }
@@ -265,8 +357,9 @@ ecma_builtin_json_parse_number (ecma_json_token_t *token_p) /**< token argument
    {
      current_p++;
    }
-    while (lit_char_is_decimal_digit (*current_p));
+    while (current_p < end_p && lit_char_is_decimal_digit (*current_p));
  }
+
  token_p->type = number_token;
  token_p->u.number = ecma_utf8_string_to_number (start_p, (lit_utf8_size_t) (current_p - start_p));

@@ -280,12 +373,14 @@ ecma_builtin_json_parse_number (ecma_json_token_t *token_p) /**< token argument
 * argument and advances the string pointer.
 */
 static void
-ecma_builtin_json_parse_next_token (ecma_json_token_t *token_p) /**< token argument */
+ecma_builtin_json_parse_next_token (ecma_json_token_t *token_p, /**< token argument */
+                                    bool parse_string) /**< strings are allowed to parse */
 {
-  lit_utf8_byte_t *current_p = token_p->current_p;
+  const lit_utf8_byte_t *current_p = token_p->current_p;
+  const lit_utf8_byte_t *end_p = token_p->end_p;
  token_p->type = invalid_token;

-  while (current_p < token_p->end_p
+  while (current_p < end_p
         && (*current_p == LIT_CHAR_SP
             || *current_p == LIT_CHAR_CR
             || *current_p == LIT_CHAR_LF
@@ -294,7 +389,7 @@ ecma_builtin_json_parse_next_token (ecma_json_token_t *token_p) /**< token argum
    current_p++;
  }

-  if (current_p == token_p->end_p)
+  if (current_p == end_p)
  {
    token_p->type = end_token;
    return;
@@ -334,13 +429,16 @@ ecma_builtin_json_parse_next_token (ecma_json_token_t *token_p) /**< token argum
    }
    case LIT_CHAR_DOUBLE_QUOTE:
    {
-      token_p->current_p = current_p + 1;
-      ecma_builtin_json_parse_string (token_p);
+      if (parse_string)
+      {
+        token_p->current_p = current_p + 1;
+        ecma_builtin_json_parse_string (token_p);
+      }
      return;
    }
    case LIT_CHAR_LOWERCASE_N:
    {
-      if (ecma_builtin_json_check_id (current_p, "null"))
+      if (ecma_builtin_json_check_id (current_p, token_p->end_p, "null"))
      {
        token_p->type = null_token;
        token_p->current_p = current_p + 4;
@@ -350,7 +448,7 @@ ecma_builtin_json_parse_next_token (ecma_json_token_t *token_p) /**< token argum
    }
    case LIT_CHAR_LOWERCASE_T:
    {
-      if (ecma_builtin_json_check_id (current_p, "true"))
+      if (ecma_builtin_json_check_id (current_p, token_p->end_p, "true"))
      {
        token_p->type = true_token;
        token_p->current_p = current_p + 4;
@@ -360,7 +458,7 @@ ecma_builtin_json_parse_next_token (ecma_json_token_t *token_p) /**< token argum
    }
    case LIT_CHAR_LOWERCASE_F:
    {
-      if (ecma_builtin_json_check_id (current_p, "false"))
+      if (ecma_builtin_json_check_id (current_p, token_p->end_p, "false"))
      {
        token_p->type = false_token;
        token_p->current_p = current_p + 5;
@@ -391,24 +489,29 @@ ecma_builtin_json_parse_next_token (ecma_json_token_t *token_p) /**< token argum
 static bool
 ecma_builtin_json_check_right_square_token (ecma_json_token_t *token_p) /**< token argument */
 {
-  lit_utf8_byte_t *current_p = token_p->current_p;
+  const lit_utf8_byte_t *current_p = token_p->current_p;
+  const lit_utf8_byte_t *end_p = token_p->end_p;

  /*
   * The input is not zero terminated, so every read of *current_p
   * below must be guarded by a comparison against end_p.
   */
-  while (*current_p == LIT_CHAR_SP || *current_p == LIT_CHAR_CR
-         || *current_p == LIT_CHAR_LF || *current_p == LIT_CHAR_TAB)
+  while (current_p < end_p
+         && (*current_p == LIT_CHAR_SP
+             || *current_p == LIT_CHAR_CR
+             || *current_p == LIT_CHAR_LF
+             || *current_p == LIT_CHAR_TAB))
  {
    current_p++;
  }

  /* The skipped white space is consumed even if no right square bracket follows. */
  token_p->current_p = current_p;

-  if (*current_p == LIT_CHAR_RIGHT_SQUARE)
+  if (current_p < end_p && *current_p == LIT_CHAR_RIGHT_SQUARE)
  {
    token_p->current_p = current_p + 1;
    return true;
  }
+
  return false;
 } /* ecma_builtin_json_check_right_square_token */

@@ -444,7 +547,7 @@ ecma_builtin_json_define_value_property (ecma_object_t *obj_p, /**< this object
 static ecma_value_t
 ecma_builtin_json_parse_value (ecma_json_token_t *token_p) /**< token argument */
 {
-  ecma_builtin_json_parse_next_token (token_p);
+  ecma_builtin_json_parse_next_token (token_p, true);

  switch (token_p->type)
  {
@@ -454,8 +557,7 @@ ecma_builtin_json_parse_value (ecma_json_token_t *token_p) /**< token argument *
    }
    case string_token:
    {
-      ecma_string_t *string_p = ecma_new_ecma_string_from_utf8 (token_p->u.string.start_p, token_p->u.string.size);
-      return ecma_make_string_value (string_p);
+      return ecma_make_string_value (token_p->u.string_p);
    }
    case null_token:
    {
@@ -476,7 +578,7 @@ ecma_builtin_json_parse_value (ecma_json_token_t *token_p) /**< token argument *

      while (true)
      {
-        ecma_builtin_json_parse_next_token (token_p);
+        ecma_builtin_json_parse_next_token (token_p, !parse_comma);

        if (token_p->type == right_brace_token)
        {
@@ -489,7 +591,8 @@ ecma_builtin_json_parse_value (ecma_json_token_t *token_p) /**< token argument *
          {
            break;
          }
-          ecma_builtin_json_parse_next_token (token_p);
+
+          ecma_builtin_json_parse_next_token (token_p, true);
        }

        if (token_p->type != string_token)
@@ -497,12 +600,13 @@ ecma_builtin_json_parse_value (ecma_json_token_t *token_p) /**< token argument *
          break;
        }

-        const lit_utf8_byte_t *string_start_p = token_p->u.string.start_p;
-        lit_utf8_size_t string_size = token_p->u.string.size;
-        ecma_builtin_json_parse_next_token (token_p);
+        ecma_string_t *name_p = token_p->u.string_p;
+
+        ecma_builtin_json_parse_next_token (token_p, false);

        if (token_p->type != colon_token)
        {
+          ecma_deref_ecma_string (name_p);
          break;
        }

@@ -510,13 +614,14 @@ ecma_builtin_json_parse_value (ecma_json_token_t *token_p) /**< token argument *

        if (ecma_is_value_undefined (value))
        {
+          ecma_deref_ecma_string (name_p);
          break;
        }

-        ecma_string_t *name_p = ecma_new_ecma_string_from_utf8 (string_start_p, string_size);
        ecma_builtin_json_define_value_property (object_p, name_p, value);
        ecma_deref_ecma_string (name_p);
        ecma_free_value (value);
+
        parse_comma = true;
      }

@@ -545,7 +650,8 @@ ecma_builtin_json_parse_value (ecma_json_token_t *token_p) /**< token argument *

        if (parse_comma)
        {
-          ecma_builtin_json_parse_next_token (token_p);
+          ecma_builtin_json_parse_next_token (token_p, false);
+
          if (token_p->type != comma_token)
          {
            break;
@@ -707,15 +813,8 @@ ecma_builtin_json_parse (ecma_value_t this_arg, /**< 'this' argument */
                  ret_value);

  const ecma_string_t *string_p = ecma_get_string_from_value (string);
-  const ecma_length_t string_size = (ecma_length_t) ecma_string_get_size (string_p);
-  const lit_utf8_size_t buffer_size = sizeof (lit_utf8_byte_t) * (string_size + 1);

-  JMEM_DEFINE_LOCAL_ARRAY (str_start_p, buffer_size, lit_utf8_byte_t);
-
-  const lit_utf8_size_t sz = ecma_string_copy_to_utf8_buffer (string_p, str_start_p, buffer_size);
-  JERRY_ASSERT (sz == string_size);
-
-  str_start_p[string_size] = LIT_BYTE_NULL;
+  ECMA_STRING_TO_UTF8_STRING (string_p, str_start_p, string_size);

  ecma_json_token_t token;
  token.current_p = str_start_p;
@@ -725,7 +824,7 @@ ecma_builtin_json_parse (ecma_value_t this_arg, /**< 'this' argument */

  if (!ecma_is_value_undefined (final_result))
  {
-    ecma_builtin_json_parse_next_token (&token);
+    ecma_builtin_json_parse_next_token (&token, false);

    if (token.type != end_token)
    {
@@ -736,7 +835,7 @@ ecma_builtin_json_parse (ecma_value_t this_arg, /**< 'this' argument */

  if (ecma_is_value_undefined (final_result))
  {
-    ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("Could not parse JSON string."));
+    ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("JSON string parse error."));
  }
  else
  {
@@ -766,7 +865,7 @@ ecma_builtin_json_parse (ecma_value_t this_arg, /**< 'this' argument */
    }
  }

-  JMEM_FINALIZE_LOCAL_ARRAY (str_start_p);
+  ECMA_FINALIZE_UTF8_STRING (str_start_p, string_size);

  ECMA_FINALIZE (string);
  return ret_value;