From 2630048eccab104d00146c7aabdfea985663da82 Mon Sep 17 00:00:00 2001 From: Zoltan Herczeg Date: Mon, 3 Aug 2015 06:27:26 -0700 Subject: [PATCH] Fixes for URI decoding. JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg@inf.u-szeged.hu --- .../builtin-objects/ecma-builtin-global.cpp | 43 ++++++++++++++++--- tests/jerry/global-uri-coding.js | 9 ++++ 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp b/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp index 81621a877..2d680df26 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-global.cpp @@ -747,6 +747,13 @@ static uint8_t unescaped_uri_component_set[16] = */ #define URI_ENCODED_BYTE_SIZE (3) +/* + * These two types show whether the byte is present in + * the original stream or decoded from a %xx sequence. + */ +#define URI_DECODE_ORIGINAL_BYTE 0 +#define URI_DECODE_DECODED_BYTE 1 + /** * Helper function to decode URI. * @@ -835,23 +842,27 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___, if (ecma_is_completion_value_empty (ret_value)) { MEM_DEFINE_LOCAL_ARRAY (output_start_p, - output_size, + output_size * 2, lit_utf8_byte_t); input_char_p = input_start_p; lit_utf8_byte_t *output_char_p = output_start_p; + lit_utf8_byte_t *output_type_p = output_start_p + output_size; while (input_char_p < input_end_p) { /* Input decode. 
*/ if (*input_char_p != '%') { + *output_type_p++ = URI_DECODE_ORIGINAL_BYTE; *output_char_p = *input_char_p; output_char_p++; input_char_p++; continue; } + *output_type_p++ = URI_DECODE_DECODED_BYTE; + lit_code_point_t decoded_byte; lit_read_code_point_from_hex (input_char_p + 1, 2, &decoded_byte); @@ -886,16 +897,38 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___, if (valid_utf8) { lit_utf8_iterator_t characters = lit_utf8_iterator_create (output_start_p, output_size); + output_type_p = output_start_p + output_size; + while (!lit_utf8_iterator_is_eos (&characters)) { + bool original_byte = output_type_p[characters.buf_pos.offset] == URI_DECODE_ORIGINAL_BYTE; + ecma_char_t character = lit_utf8_iterator_read_next (&characters); /* Surrogate fragments are allowed in JS, but not accepted by URI decoding. */ - if (lit_is_code_unit_low_surrogate (character) - || lit_is_code_unit_high_surrogate (character)) + if (!original_byte) { - valid_utf8 = false; - break; + if (lit_is_code_unit_high_surrogate (character)) + { + /* Note: stray high/low surrogate pairs are not allowed in the stream. 
*/ + if (lit_utf8_iterator_is_eos (&characters)) + { + valid_utf8 = false; + break; + } + + if (output_type_p[characters.buf_pos.offset] == URI_DECODE_ORIGINAL_BYTE + || !lit_is_code_unit_low_surrogate (lit_utf8_iterator_read_next (&characters))) + { + valid_utf8 = false; + break; + } + } + else if (lit_is_code_unit_low_surrogate (character)) + { + valid_utf8 = false; + break; + } } } } diff --git a/tests/jerry/global-uri-coding.js b/tests/jerry/global-uri-coding.js index 5f7cc024c..066c7c77e 100644 --- a/tests/jerry/global-uri-coding.js +++ b/tests/jerry/global-uri-coding.js @@ -120,3 +120,12 @@ assert (decodeURI ({ x:1 }) === "[object Object]"); assert (encodeURI (void 0) === "undefined"); assert (encodeURI (216.000e1) === "2160"); +// Combining surrogate fragments + +assert (decodeURI("\ud800\udc00 \ud800 \udc00") === "\ud800\udc00 \ud800 \udc00"); +assert (decodeURI("%f0%90%80%80") === "\ud800\udc00"); +assert (decodeURI("\ud800%f0%90%80%80\ud800") === "\ud800\ud800\udc00\ud800"); +assert (decodeURI("\udc00%f0%90%80%80\udc00") === "\udc00\ud800\udc00\udc00"); + +checkDecodeURIParseError ("\ud800%ed%b0%80"); +checkDecodeURIParseError ("%ed%a0%80\udc00");