Add \u parse support for the JSON object. Buffer overrun issues were fixed as well.

JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg@inf.u-szeged.hu
2015-07-20 01:03:51 -07:00
parent bbfddea032
commit bcedc901cd
6 changed files with 144 additions and 59 deletions
@@ -143,6 +143,11 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
        {
          break;
        }
+        case 'b':
+        {
+          *current_p = '\b';
+          break;
+        }
        case 'f':
        {
          *current_p = '\f';
@@ -163,10 +168,19 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
          *current_p = '\t';
          break;
        }
-        case 'b':
+        case 'u':
        {
-          *current_p = '\b';
-          break;
+          lit_code_point_t code_point;
+
+          if (!(lit_read_code_point_from_hex (current_p + 1, 4, &code_point)))
+          {
+            return;
+          }
+
+          current_p += 5;
+          write_p += lit_code_point_to_utf8 (code_point, write_p);
+          continue;
+          /* FALLTHRU */
        }
        default:
        {
@@ -177,6 +191,57 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
    *write_p++ = *current_p++;
  }

+  /*
+   * Post processing surrogate pairs.
+   *
+   * The general issue is that surrogate fragments can come from
+   * the original stream and can also be constructed by \u sequences.
+   * We need to construct code points from them.
+   *
+   * Example: JSON.parse ('"\\ud801\udc00"') === "\ud801\udc00"
+   *          The first \u is parsed by JSON, the second is by the lexer.
+   *
+   * The rewrite happens in-place, since the write pointer always
+   * precedes the read pointer. We also cannot create a UTF-8 iterator,
+   * because the lit_is_utf8_string_valid assertion may fail.
+   */
+
+  lit_utf8_byte_t *read_p = token_p->u.string.start_p;
+  lit_utf8_byte_t *read_end_p = write_p;
+  write_p = read_p;
+
+  while (read_p < read_end_p)
+  {
+    lit_code_point_t code_point;
+    read_p += lit_read_code_point_from_utf8 (read_p,
+                                             (lit_utf8_size_t) (read_end_p - read_p),
+                                             &code_point);
+
+    /* lit_is_code_unit_high_surrogate expects an ecma_char_t argument,
+       so code points above the maximum UTF-16 code unit must not be tested. */
+    if (read_p < read_end_p
+        && code_point <= LIT_UTF16_CODE_UNIT_MAX
+        && lit_is_code_unit_high_surrogate ((ecma_char_t) code_point))
+    {
+      lit_code_point_t next_code_point;
+      lit_utf8_size_t next_code_point_size = lit_read_code_point_from_utf8 (read_p,
+                                                                            (lit_utf8_size_t) (read_end_p - read_p),
+                                                                            &next_code_point);
+
+      if (next_code_point <= LIT_UTF16_CODE_UNIT_MAX
+          && lit_is_code_unit_low_surrogate ((ecma_char_t) next_code_point))
+      {
+        code_point = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) code_point,
+                                                               (ecma_char_t) next_code_point);
+        read_p += next_code_point_size;
+      }
+    }
+    write_p += lit_code_point_to_utf8 (code_point, write_p);
+  }
+
+  JERRY_ASSERT (lit_is_utf8_string_valid (token_p->u.string.start_p,
+                                          (lit_utf8_size_t) (write_p - token_p->u.string.start_p)));
+
  token_p->u.string.size = (lit_utf8_size_t) (write_p - token_p->u.string.start_p);
  token_p->current_p = current_p + 1;
  token_p->type = string_token;
@@ -757,17 +822,17 @@ ecma_builtin_json_parse (ecma_value_t this_arg __attr_unused___, /**< 'this' arg
                  ret_value);

  ecma_string_t *string_p = ecma_get_string_from_value (string);
-  ecma_length_t length = (uint32_t) ecma_string_get_length (string_p);
-  size_t buffer_size = sizeof (lit_utf8_byte_t) * (length + 1);
+  ecma_length_t string_size = (uint32_t) ecma_string_get_size (string_p);
+  size_t buffer_size = sizeof (lit_utf8_byte_t) * (string_size + 1);

  MEM_DEFINE_LOCAL_ARRAY (str_start_p, buffer_size, lit_utf8_byte_t);

  ecma_string_to_utf8_string (string_p, str_start_p, (ssize_t) buffer_size);
-  str_start_p[length] = LIT_BYTE_NULL;
+  str_start_p[string_size] = LIT_BYTE_NULL;

  ecma_json_token_t token;
  token.current_p = str_start_p;
-  token.end_p = str_start_p + length;
+  token.end_p = str_start_p + string_size;

  ecma_value_t final_result = ecma_builtin_json_parse_value (&token);