From bcedc901cda5227a374ad530f67f754d44b12949 Mon Sep 17 00:00:00 2001
From: Zoltan Herczeg <zherczeg@inf.u-szeged.hu>
Date: Mon, 20 Jul 2015 01:03:51 -0700
Subject: [PATCH] Add \u parse support for the JSON object. Buffer overrun
 issues were fixed as well.

JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg@inf.u-szeged.hu
---
 .../builtin-objects/ecma-builtin-global.cpp   | 63 +++------------
 .../builtin-objects/ecma-builtin-json.cpp     | 79 +++++++++++++++++--
 jerry-core/lit/lit-char-helpers.cpp           | 46 +++++++++++
 jerry-core/lit/lit-char-helpers.h             |  3 +
 jerry-core/lit/lit-strings.cpp                |  1 +
 tests/jerry/json-parse.js                     | 11 +++
 6 files changed, 144 insertions(+), 59 deletions(-)

diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp b/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp
index b4cd6d9eb..bedfec337 100644
--- a/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp
+++ b/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp
@@ -24,6 +24,7 @@
 #include "ecma-helpers.h"
 #include "ecma-try-catch-macro.h"
 #include "jrt.h"
+#include "lit-char-helpers.h"
 #include "lit-magic-strings.h"
 #include "lit-strings.h"
 #include "vm.h"
@@ -517,53 +518,6 @@ static uint8_t unescaped_uri_component_set[16] =
  */
 #define URI_ENCODED_BYTE_SIZE (3)
 
-#define ECMA_BUILTIN_HEX_TO_BYTE_ERROR (0x100)
-
-/**
- * Helper function to decode a hexadecimal byte from a string.
- *
- * @return the decoded byte value
- *         It returns with ECMA_BUILTIN_HEX_TO_BYTE_ERROR if a parse error is occured.
- */
-static uint32_t
-ecma_builtin_global_object_hex_to_byte (lit_utf8_byte_t *source_p) /**< source string */
-{
-  uint32_t decoded_byte = 0;
-
-  /*
-   * Zero terminated string, so length check is not needed.
-   */
-  if (*source_p != '%')
-  {
-    return ECMA_BUILTIN_HEX_TO_BYTE_ERROR;
-  }
-
-  for (lit_utf8_size_t i = 0; i < 2; i++)
-  {
-    source_p++;
-    decoded_byte <<= 4;
-
-    if (*source_p >= '0' && *source_p <= '9')
-    {
-      decoded_byte |= (uint32_t) (*source_p - '0');
-    }
-    else if (*source_p >= 'a' && *source_p <= 'f')
-    {
-      decoded_byte |= (uint32_t) (*source_p - ('a' - 10));
-    }
-    else if (*source_p >= 'A' && *source_p <= 'F')
-    {
-      decoded_byte |= (uint32_t) (*source_p - ('A' - 10));
-    }
-    else
-    {
-      return ECMA_BUILTIN_HEX_TO_BYTE_ERROR;
-    }
-  }
-
-  return decoded_byte;
-} /* ecma_builtin_global_object_hex_to_byte */
-
 /**
  * Helper function to decode URI.
  *
@@ -586,12 +540,13 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
   lit_utf8_size_t input_size = ecma_string_get_size (input_string_p);
 
   MEM_DEFINE_LOCAL_ARRAY (input_start_p,
-                          input_size,
+                          input_size + 1,
                           lit_utf8_byte_t);
 
   ecma_string_to_utf8_string (input_string_p,
                               input_start_p,
                               (ssize_t) (input_size));
+  input_start_p[input_size] = LIT_BYTE_NULL;
 
   lit_utf8_byte_t *input_char_p = input_start_p;
   lit_utf8_byte_t *input_end_p = input_start_p + input_size;
@@ -616,8 +571,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
       continue;
     }
 
-    uint32_t decoded_byte = ecma_builtin_global_object_hex_to_byte (input_char_p);
-    if (decoded_byte == ECMA_BUILTIN_HEX_TO_BYTE_ERROR)
+    lit_code_point_t decoded_byte;
+
+    if (!lit_read_code_point_from_hex (input_char_p + 1, 2, &decoded_byte))
     {
       ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
       break;
@@ -667,7 +623,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
         continue;
       }
 
-      uint32_t decoded_byte = ecma_builtin_global_object_hex_to_byte (input_char_p);
+      lit_code_point_t decoded_byte;
+
+      lit_read_code_point_from_hex (input_char_p + 1, 2, &decoded_byte);
       input_char_p += URI_ENCODED_BYTE_SIZE;
 
       if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
@@ -704,7 +662,8 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
         ecma_char_t character = lit_utf8_iterator_read_next (&characters);
 
         /* Surrogate fragments are allowed in JS, but not accepted by URI decoding. */
-        if (character >= LIT_UTF16_HIGH_SURROGATE_MIN && character <= LIT_UTF16_LOW_SURROGATE_MAX)
+        if (lit_is_code_unit_low_surrogate (character)
+            || lit_is_code_unit_high_surrogate (character))
         {
           valid_utf8 = false;
           break;
diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp b/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp
index 689728e67..743ed5687 100644
--- a/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp
+++ b/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp
@@ -143,6 +143,11 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
         {
           break;
         }
+        case 'b':
+        {
+          *current_p = '\b';
+          break;
+        }
         case 'f':
         {
           *current_p = '\f';
@@ -163,10 +168,19 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
           *current_p = '\t';
           break;
         }
-        case 'b':
+        case 'u':
         {
-          *current_p = '\b';
-          break;
+          lit_code_point_t code_point;
+
+          if (!(lit_read_code_point_from_hex (current_p + 1, 4, &code_point)))
+          {
+            return;
+          }
+
+          current_p += 5;
+          write_p += lit_code_point_to_utf8 (code_point, write_p);
+          continue;
+          /* FALLTHRU */
         }
         default:
         {
@@ -177,6 +191,57 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
     *write_p++ = *current_p++;
   }
 
+  /*
+   * Post processing surrogate pairs.
+   *
+   * The general issue is, that surrogate fragments can come from
+   * the original stream and can be constructed by \u sequences
+   * as well. We need to construct code points from them.
+   *
+   * Example: JSON.parse ('"\\ud801\udc00"') === "\ud801\udc00"
+   *          The first \u is parsed by JSON, the second is by the lexer.
+   *
+   * The rewrite happens in-place, since the write pointer never
+   * overtakes the read pointer. We also cannot create a UTF8 iterator,
+   * because the lit_is_utf8_string_valid assertion may fail.
+   */
+
+  lit_utf8_byte_t *read_p = token_p->u.string.start_p;
+  lit_utf8_byte_t *read_end_p = write_p;
+  write_p = read_p;
+
+  while (read_p < read_end_p)
+  {
+    lit_code_point_t code_point;
+    read_p += lit_read_code_point_from_utf8 (read_p,
+                                             (lit_utf8_size_t) (read_end_p - read_p),
+                                             &code_point);
+
+    /* lit_is_code_unit_high_surrogate expects an ecma_char_t argument,
+       so code points above the maximum UTF16 code unit must not be tested. */
+    if (read_p < read_end_p
+        && code_point <= LIT_UTF16_CODE_UNIT_MAX
+        && lit_is_code_unit_high_surrogate ((ecma_char_t) code_point))
+    {
+      lit_code_point_t next_code_point;
+      lit_utf8_size_t next_code_point_size = lit_read_code_point_from_utf8 (read_p,
+                                                                            (lit_utf8_size_t) (read_end_p - read_p),
+                                                                            &next_code_point);
+
+      if (next_code_point <= LIT_UTF16_CODE_UNIT_MAX
+          && lit_is_code_unit_low_surrogate ((ecma_char_t) next_code_point))
+      {
+        code_point = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) code_point,
+                                                               (ecma_char_t) next_code_point);
+        read_p += next_code_point_size;
+      }
+    }
+    write_p += lit_code_point_to_utf8 (code_point, write_p);
+  }
+
+  JERRY_ASSERT (lit_is_utf8_string_valid (token_p->u.string.start_p,
+                                          (lit_utf8_size_t) (write_p - token_p->u.string.start_p)));
+
   token_p->u.string.size = (lit_utf8_size_t) (write_p - token_p->u.string.start_p);
   token_p->current_p = current_p + 1;
   token_p->type = string_token;
@@ -757,17 +822,17 @@ ecma_builtin_json_parse (ecma_value_t this_arg __attr_unused___, /**< 'this' arg
                   ret_value);
 
   ecma_string_t *string_p = ecma_get_string_from_value (string);
-  ecma_length_t length = (uint32_t) ecma_string_get_length (string_p);
-  size_t buffer_size = sizeof (lit_utf8_byte_t) * (length + 1);
+  ecma_length_t string_size = (uint32_t) ecma_string_get_size (string_p);
+  size_t buffer_size = sizeof (lit_utf8_byte_t) * (string_size + 1);
 
   MEM_DEFINE_LOCAL_ARRAY (str_start_p, buffer_size, lit_utf8_byte_t);
 
   ecma_string_to_utf8_string (string_p, str_start_p, (ssize_t) buffer_size);
-  str_start_p[length] = LIT_BYTE_NULL;
+  str_start_p[string_size] = LIT_BYTE_NULL;
 
   ecma_json_token_t token;
   token.current_p = str_start_p;
-  token.end_p = str_start_p + length;
+  token.end_p = str_start_p + string_size;
 
   ecma_value_t final_result = ecma_builtin_json_parse_value (&token);
 
diff --git a/jerry-core/lit/lit-char-helpers.cpp b/jerry-core/lit/lit-char-helpers.cpp
index 76b2a2a21..0f83bc05a 100644
--- a/jerry-core/lit/lit-char-helpers.cpp
+++ b/jerry-core/lit/lit-char-helpers.cpp
@@ -312,6 +312,52 @@ lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to
   }
 } /* lit_char_hex_to_int */
 
+/**
+ * Parse the next number_of_characters hexadecimal digits (lower or upper case)
+ * and construct a code point from them, most significant digit first.
+ * The buffer must be zero terminated.
+ *
+ * @return true if decoding was successful, false otherwise (the output is then left unmodified)
+ */
+bool
+lit_read_code_point_from_hex (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
+                              lit_utf8_size_t number_of_characters, /**< number of characters to be read */
+                              lit_code_point_t *out_code_point_p) /**< @out: decoded result */
+{
+  lit_code_point_t code_point = 0;
+
+  JERRY_ASSERT (number_of_characters >= 2 && number_of_characters <= 4);
+
+  for (lit_utf8_size_t i = 0; i < number_of_characters; i++)
+  {
+    code_point <<= 4; /* make room for the next 4-bit digit */
+
+    if (*buf_p >= LIT_CHAR_ASCII_DIGITS_BEGIN
+        && *buf_p <= LIT_CHAR_ASCII_DIGITS_END)
+    {
+      code_point |= (uint32_t) (*buf_p - LIT_CHAR_ASCII_DIGITS_BEGIN);
+    }
+    else if (*buf_p >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
+             && *buf_p <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
+    {
+      code_point |= (uint32_t) (*buf_p - (LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN - 10));
+    }
+    else if (*buf_p >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN
+             && *buf_p <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_END)
+    {
+      code_point |= (uint32_t) (*buf_p - (LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN - 10));
+    }
+    else
+    {
+      return false; /* the current byte is not in any of the accepted ranges */
+    }
+
+    buf_p++;
+  }
+  *out_code_point_p = code_point; /* stored only after every digit was decoded */
+  return true;
+} /* lit_read_code_point_from_hex */
+
 /**
  * Check if specified character is a word character (part of IsWordChar abstract operation)
  *
diff --git a/jerry-core/lit/lit-char-helpers.h b/jerry-core/lit/lit-char-helpers.h
index dc6f48731..66af62985 100644
--- a/jerry-core/lit/lit-char-helpers.h
+++ b/jerry-core/lit/lit-char-helpers.h
@@ -210,6 +210,9 @@ extern bool lit_char_is_decimal_digit (ecma_char_t);
 extern bool lit_char_is_hex_digit (ecma_char_t);
 extern uint32_t lit_char_hex_to_int (ecma_char_t);
 
+/* read a hex encoded code point from a zero terminated buffer */
+bool lit_read_code_point_from_hex (const lit_utf8_byte_t *, lit_utf8_size_t, lit_code_point_t *);
+
 /**
  * Null character
  */
diff --git a/jerry-core/lit/lit-strings.cpp b/jerry-core/lit/lit-strings.cpp
index 9f1c44622..c5bfc9d84 100644
--- a/jerry-core/lit/lit-strings.cpp
+++ b/jerry-core/lit/lit-strings.cpp
@@ -73,6 +73,7 @@ lit_is_utf8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
     lit_utf8_byte_t c = utf8_buf_p[idx++];
     if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
     {
+      is_prev_code_point_high_surrogate = false;
       continue;
     }
 
diff --git a/tests/jerry/json-parse.js b/tests/jerry/json-parse.js
index c15ad599e..8ef422b27 100644
--- a/tests/jerry/json-parse.js
+++ b/tests/jerry/json-parse.js
@@ -40,6 +40,14 @@ str = '"str"';
 assert (JSON.parse (str) == "str");
 str = '"\\b\\f\\n\\t\\r"'
 assert (JSON.parse (str) === "\b\f\n\t\r");
+/* Note: \u is parsed by the lexer, \\u is by the JSON parser. */
+str = '"\\u0000\\u001f"';
+assert (JSON.parse (str) === "\x00\x1f");
+str = '"\\ud801\\udc00\\ud801\udc00\ud801\\udc00\ud801\udc00"';
+assert (JSON.parse (str) === "\ud801\udc00\ud801\udc00\ud801\udc00\ud801\udc00");
+/* These surrogates do not form valid surrogate pairs. */
+str = '"\\ud801,\\udc00,\\ud801,\udc00,\ud801,\\udc00,\ud801,\udc00"';
+assert (JSON.parse (str) === "\ud801,\udc00,\ud801,\udc00,\ud801,\udc00,\ud801,\udc00");
 
 check_parse_error ('undefined');
 check_parse_error ('falses');
@@ -52,6 +60,9 @@ check_parse_error ('3e+a');
 check_parse_error ('55e4,');
 check_parse_error ('5 true');
 check_parse_error ("'str'");
+check_parse_error ('\x00');
+check_parse_error ('"\x00"');
+check_parse_error ('"\x1f"');
 
 // Checking objects
 str = ' { "x": 0, "yy": null, "zzz": { "A": 4.0, "BB": { "1": 63e-1 }, "CCC" : false } } ';