Add \u parse support for the JSON object. Buffer overrun issues were fixed as well.

JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg@inf.u-szeged.hu
This commit is contained in:
Zoltan Herczeg
2015-07-20 01:03:51 -07:00
parent bbfddea032
commit bcedc901cd
6 changed files with 144 additions and 59 deletions
@@ -143,6 +143,11 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
{
break;
}
case 'b':
{
*current_p = '\b';
break;
}
case 'f':
{
*current_p = '\f';
@@ -163,10 +168,19 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
*current_p = '\t';
break;
}
case 'b':
case 'u':
{
*current_p = '\b';
break;
lit_code_point_t code_point;
if (!(lit_read_code_point_from_hex (current_p + 1, 4, &code_point)))
{
return;
}
current_p += 5;
write_p += lit_code_point_to_utf8 (code_point, write_p);
continue;
/* FALLTHRU */
}
default:
{
@@ -177,6 +191,57 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
*write_p++ = *current_p++;
}
/*
* Post processing surrogate pairs.
*
* The general issue is that surrogate fragments can come from
* the original stream and can also be constructed by \u sequences.
* We need to construct code points from them.
*
* Example: JSON.parse ('"\\ud801\udc00"') === "\ud801\udc00"
* The first \u is parsed by JSON, the second is by the lexer.
*
* The rewrite happens in place, since the write pointer never gets
* ahead of the read pointer. We also cannot create a UTF8 iterator,
* because the lit_is_utf8_string_valid assertion may fail.
*/
lit_utf8_byte_t *read_p = token_p->u.string.start_p;
lit_utf8_byte_t *read_end_p = write_p;
write_p = read_p;
while (read_p < read_end_p)
{
lit_code_point_t code_point;
read_p += lit_read_code_point_from_utf8 (read_p,
(lit_utf8_size_t) (read_end_p - read_p),
&code_point);
/* lit_is_code_unit_high_surrogate expects an ecma_char_t argument,
so code points above the maximum UTF16 code unit must not be tested. */
if (read_p < read_end_p
&& code_point <= LIT_UTF16_CODE_UNIT_MAX
&& lit_is_code_unit_high_surrogate ((ecma_char_t) code_point))
{
lit_code_point_t next_code_point;
lit_utf8_size_t next_code_point_size = lit_read_code_point_from_utf8 (read_p,
(lit_utf8_size_t) (read_end_p - read_p),
&next_code_point);
if (next_code_point <= LIT_UTF16_CODE_UNIT_MAX
&& lit_is_code_unit_low_surrogate ((ecma_char_t) next_code_point))
{
code_point = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) code_point,
(ecma_char_t) next_code_point);
read_p += next_code_point_size;
}
}
write_p += lit_code_point_to_utf8 (code_point, write_p);
}
JERRY_ASSERT (lit_is_utf8_string_valid (token_p->u.string.start_p,
(lit_utf8_size_t) (write_p - token_p->u.string.start_p)));
token_p->u.string.size = (lit_utf8_size_t) (write_p - token_p->u.string.start_p);
token_p->current_p = current_p + 1;
token_p->type = string_token;
@@ -757,17 +822,17 @@ ecma_builtin_json_parse (ecma_value_t this_arg __attr_unused___, /**< 'this' arg
ret_value);
ecma_string_t *string_p = ecma_get_string_from_value (string);
ecma_length_t length = (uint32_t) ecma_string_get_length (string_p);
size_t buffer_size = sizeof (lit_utf8_byte_t) * (length + 1);
ecma_length_t string_size = (uint32_t) ecma_string_get_size (string_p);
size_t buffer_size = sizeof (lit_utf8_byte_t) * (string_size + 1);
MEM_DEFINE_LOCAL_ARRAY (str_start_p, buffer_size, lit_utf8_byte_t);
ecma_string_to_utf8_string (string_p, str_start_p, (ssize_t) buffer_size);
str_start_p[length] = LIT_BYTE_NULL;
str_start_p[string_size] = LIT_BYTE_NULL;
ecma_json_token_t token;
token.current_p = str_start_p;
token.end_p = str_start_p + length;
token.end_p = str_start_p + string_size;
ecma_value_t final_result = ecma_builtin_json_parse_value (&token);