From bcedc901cda5227a374ad530f67f754d44b12949 Mon Sep 17 00:00:00 2001 From: Zoltan Herczeg Date: Mon, 20 Jul 2015 01:03:51 -0700 Subject: [PATCH] Add \u parse support for the JSON object. Buffer overrun issues were fixed as well. JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg@inf.u-szeged.hu --- .../builtin-objects/ecma-builtin-global.cpp | 63 +++------------ .../builtin-objects/ecma-builtin-json.cpp | 79 +++++++++++++++++-- jerry-core/lit/lit-char-helpers.cpp | 46 +++++++++++ jerry-core/lit/lit-char-helpers.h | 3 + jerry-core/lit/lit-strings.cpp | 1 + tests/jerry/json-parse.js | 11 +++ 6 files changed, 144 insertions(+), 59 deletions(-) diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp b/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp index b4cd6d9eb..bedfec337 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp @@ -24,6 +24,7 @@ #include "ecma-helpers.h" #include "ecma-try-catch-macro.h" #include "jrt.h" +#include "lit-char-helpers.h" #include "lit-magic-strings.h" #include "lit-strings.h" #include "vm.h" @@ -517,53 +518,6 @@ static uint8_t unescaped_uri_component_set[16] = */ #define URI_ENCODED_BYTE_SIZE (3) -#define ECMA_BUILTIN_HEX_TO_BYTE_ERROR (0x100) - -/** - * Helper function to decode a hexadecimal byte from a string. - * - * @return the decoded byte value - * It returns with ECMA_BUILTIN_HEX_TO_BYTE_ERROR if a parse error is occured. - */ -static uint32_t -ecma_builtin_global_object_hex_to_byte (lit_utf8_byte_t *source_p) /**< source string */ -{ - uint32_t decoded_byte = 0; - - /* - * Zero terminated string, so length check is not needed. - */ - if (*source_p != '%') - { - return ECMA_BUILTIN_HEX_TO_BYTE_ERROR; - } - - for (lit_utf8_size_t i = 0; i < 2; i++) - { - source_p++; - decoded_byte <<= 4; - - if (*source_p >= '0' && *source_p <= '9') - { - decoded_byte |= (uint32_t) (*source_p - '0'); - } - else if (*source_p >= 'a' && *source_p <= 'f') - { - decoded_byte |= (uint32_t) (*source_p - ('a' - 10)); - } - else if (*source_p >= 'A' && *source_p <= 'F') - { - decoded_byte |= (uint32_t) (*source_p - ('A' - 10)); - } - else - { - return ECMA_BUILTIN_HEX_TO_BYTE_ERROR; - } - } - - return decoded_byte; -} /* ecma_builtin_global_object_hex_to_byte */ - /** * Helper function to decode URI. * @@ -586,12 +540,13 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___, lit_utf8_size_t input_size = ecma_string_get_size (input_string_p); MEM_DEFINE_LOCAL_ARRAY (input_start_p, - input_size, + input_size + 1, lit_utf8_byte_t); ecma_string_to_utf8_string (input_string_p, input_start_p, (ssize_t) (input_size)); + input_start_p[input_size] = LIT_BYTE_NULL; lit_utf8_byte_t *input_char_p = input_start_p; lit_utf8_byte_t *input_end_p = input_start_p + input_size; @@ -616,8 +571,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___, continue; } - uint32_t decoded_byte = ecma_builtin_global_object_hex_to_byte (input_char_p); - if (decoded_byte == ECMA_BUILTIN_HEX_TO_BYTE_ERROR) + lit_code_point_t decoded_byte; + + if (!lit_read_code_point_from_hex (input_char_p + 1, 2, &decoded_byte)) { ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI)); break; @@ -667,7 +623,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___, continue; } - uint32_t decoded_byte = ecma_builtin_global_object_hex_to_byte (input_char_p); + lit_code_point_t decoded_byte; + + lit_read_code_point_from_hex (input_char_p + 1, 2, &decoded_byte); input_char_p += URI_ENCODED_BYTE_SIZE; if (decoded_byte <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) @@ -704,7 +662,8 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___, ecma_char_t character = lit_utf8_iterator_read_next (&characters); /* Surrogate fragments are allowed in JS, but not accepted by URI decoding. */ - if (character >= LIT_UTF16_HIGH_SURROGATE_MIN && character <= LIT_UTF16_LOW_SURROGATE_MAX) + if (lit_is_code_unit_low_surrogate (character) + || lit_is_code_unit_high_surrogate (character)) { valid_utf8 = false; break; diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp b/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp index 689728e67..743ed5687 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp @@ -143,6 +143,11 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument { break; } + case 'b': + { + *current_p = '\b'; + break; + } case 'f': { *current_p = '\f'; @@ -163,10 +168,19 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument *current_p = '\t'; break; } - case 'b': + case 'u': { - *current_p = '\b'; - break; + lit_code_point_t code_point; + + if (!(lit_read_code_point_from_hex (current_p + 1, 4, &code_point))) + { + return; + } + + current_p += 5; + write_p += lit_code_point_to_utf8 (code_point, write_p); + continue; + /* FALLTHRU */ } default: { @@ -177,6 +191,57 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument *write_p++ = *current_p++; } + /* + * Post processing surrogate pairs. + * + * The general issue is, that surrogate fragments can come from + * the original stream and can be constructed by \u sequences + * as well. We need to construct code points from them. + * + * Example: JSON.parse ('"\\ud801\udc00"') === "\ud801\udc00" + * The first \u is parsed by JSON, the second is by the lexer. + * + * The rewrite happens in-place, since the write pointer is always + * precede the read-pointer. We also cannot create an UTF8 iterator, + * because the lit_is_utf8_string_valid assertion may fail. + */ + + lit_utf8_byte_t *read_p = token_p->u.string.start_p; + lit_utf8_byte_t *read_end_p = write_p; + write_p = read_p; + + while (read_p < read_end_p) + { + lit_code_point_t code_point; + read_p += lit_read_code_point_from_utf8 (read_p, + (lit_utf8_size_t) (read_end_p - read_p), + &code_point); + + /* The lit_is_code_unit_high_surrogate expects ecma_char_t argument + so code_points above maximum UTF16 code unit must not be tested. */ + if (read_p < read_end_p + && code_point <= LIT_UTF16_CODE_UNIT_MAX + && lit_is_code_unit_high_surrogate ((ecma_char_t) code_point)) + { + lit_code_point_t next_code_point; + lit_utf8_size_t next_code_point_size = lit_read_code_point_from_utf8 (read_p, + (lit_utf8_size_t) (read_end_p - read_p), + &next_code_point); + + if (next_code_point <= LIT_UTF16_CODE_UNIT_MAX + && lit_is_code_unit_low_surrogate ((ecma_char_t) next_code_point)) + { + code_point = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) code_point, + (ecma_char_t) next_code_point); + read_p += next_code_point_size; + } + } + write_p += lit_code_point_to_utf8 (code_point, write_p); + } + + JERRY_ASSERT (lit_is_utf8_string_valid (token_p->u.string.start_p, + (lit_utf8_size_t) (write_p - token_p->u.string.start_p))); + token_p->u.string.size = (lit_utf8_size_t) (write_p - token_p->u.string.start_p); token_p->current_p = current_p + 1; token_p->type = string_token; @@ -757,17 +822,17 @@ ecma_builtin_json_parse (ecma_value_t this_arg __attr_unused___, /**< 'this' arg ret_value); ecma_string_t *string_p = ecma_get_string_from_value (string); - ecma_length_t length = (uint32_t) ecma_string_get_length (string_p); - size_t buffer_size = sizeof (lit_utf8_byte_t) * (length + 1); + ecma_length_t string_size = (uint32_t) ecma_string_get_size (string_p); + size_t buffer_size = sizeof (lit_utf8_byte_t) * (string_size + 1); MEM_DEFINE_LOCAL_ARRAY (str_start_p, buffer_size, lit_utf8_byte_t); ecma_string_to_utf8_string (string_p, str_start_p, (ssize_t) buffer_size); - str_start_p[length] = LIT_BYTE_NULL; + str_start_p[string_size] = LIT_BYTE_NULL; ecma_json_token_t token; token.current_p = str_start_p; - token.end_p = str_start_p + length; + token.end_p = str_start_p + string_size; ecma_value_t final_result = ecma_builtin_json_parse_value (&token); diff --git a/jerry-core/lit/lit-char-helpers.cpp b/jerry-core/lit/lit-char-helpers.cpp index 76b2a2a21..0f83bc05a 100644 --- a/jerry-core/lit/lit-char-helpers.cpp +++ b/jerry-core/lit/lit-char-helpers.cpp @@ -312,6 +312,52 @@ lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to } } /* lit_char_hex_to_int */ +/** + * Parse the next number_of_characters hexadecimal character, + * and construct a code point from them. The buffer must + * be zero terminated. + * + * @return true if decoding was successful, false otherwise + */ +bool +lit_read_code_point_from_hex (const lit_utf8_byte_t *buf_p, /**< buffer with characters */ + lit_utf8_size_t number_of_characters, /**< number of characters to be read */ + lit_code_point_t *out_code_point_p) /**< @out: decoded result */ +{ + lit_code_point_t code_point = 0; + + JERRY_ASSERT (number_of_characters >= 2 && number_of_characters <= 4); + + for (lit_utf8_size_t i = 0; i < number_of_characters; i++) + { + code_point <<= 4; + + if (*buf_p >= LIT_CHAR_ASCII_DIGITS_BEGIN + && *buf_p <= LIT_CHAR_ASCII_DIGITS_END) + { + code_point |= (uint32_t) (*buf_p - LIT_CHAR_ASCII_DIGITS_BEGIN); + } + else if (*buf_p >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN + && *buf_p <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END) + { + code_point |= (uint32_t) (*buf_p - (LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN - 10)); + } + else if (*buf_p >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN + && *buf_p <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_END) + { + code_point |= (uint32_t) (*buf_p - (LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN - 10)); + } + else + { + return false; + } + + buf_p++; + } + *out_code_point_p = code_point; + return true; +} /* lit_read_code_point_from_hex */ + /** * Check if specified character is a word character (part of IsWordChar abstract operation) * diff --git a/jerry-core/lit/lit-char-helpers.h b/jerry-core/lit/lit-char-helpers.h index dc6f48731..66af62985 100644 --- a/jerry-core/lit/lit-char-helpers.h +++ b/jerry-core/lit/lit-char-helpers.h @@ -210,6 +210,9 @@ extern bool lit_char_is_decimal_digit (ecma_char_t); extern bool lit_char_is_hex_digit (ecma_char_t); extern uint32_t lit_char_hex_to_int (ecma_char_t); +/* read a hex encoded code point from a zero terminated buffer */ +bool lit_read_code_point_from_hex (const lit_utf8_byte_t *, lit_utf8_size_t, lit_code_point_t *); + /** * Null character */ diff --git a/jerry-core/lit/lit-strings.cpp b/jerry-core/lit/lit-strings.cpp index 9f1c44622..c5bfc9d84 100644 --- a/jerry-core/lit/lit-strings.cpp +++ b/jerry-core/lit/lit-strings.cpp @@ -73,6 +73,7 @@ lit_is_utf8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string * lit_utf8_byte_t c = utf8_buf_p[idx++]; if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) { + is_prev_code_point_high_surrogate = false; continue; } diff --git a/tests/jerry/json-parse.js b/tests/jerry/json-parse.js index c15ad599e..8ef422b27 100644 --- a/tests/jerry/json-parse.js +++ b/tests/jerry/json-parse.js @@ -40,6 +40,14 @@ str = '"str"'; assert (JSON.parse (str) == "str"); str = '"\\b\\f\\n\\t\\r"' assert (JSON.parse (str) === "\b\f\n\t\r"); +/* Note: \u is parsed by the lexer, \\u is by the JSON parser. */ +str = '"\\u0000\\u001f"'; +assert (JSON.parse (str) === "\x00\x1f"); +str = '"\\ud801\\udc00\\ud801\udc00\ud801\\udc00\ud801\udc00"'; +assert (JSON.parse (str) === "\ud801\udc00\ud801\udc00\ud801\udc00\ud801\udc00"); +/* These surrogates do not form a valid surrogate pairs. */ +str = '"\\ud801,\\udc00,\\ud801,\udc00,\ud801,\\udc00,\ud801,\udc00"'; +assert (JSON.parse (str) === "\ud801,\udc00,\ud801,\udc00,\ud801,\udc00,\ud801,\udc00"); check_parse_error ('undefined'); check_parse_error ('falses'); @@ -52,6 +60,9 @@ check_parse_error ('3e+a'); check_parse_error ('55e4,'); check_parse_error ('5 true'); check_parse_error ("'str'"); +check_parse_error ('\x00'); +check_parse_error ('"\x00"'); +check_parse_error ('"\x1f"'); // Checking objects str = ' { "x": 0, "yy": null, "zzz": { "A": 4.0, "BB": { "1": 63e-1 }, "CCC" : false } } ';