diff --git a/jerry-core/lit/lit-strings.c b/jerry-core/lit/lit-strings.c index 81d4ac072..b432a9e64 100644 --- a/jerry-core/lit/lit-strings.c +++ b/jerry-core/lit/lit-strings.c @@ -17,6 +17,15 @@ #include "jrt-libc-includes.h" +#define LIT_UTF8_SURROGATE_MARKER 0xed /**< utf8 surrogate marker */ +#define LIT_UTF8_HIGH_SURROGATE_MIN 0xa0 /**< utf8 high surrogate minimum */ +#define LIT_UTF8_HIGH_SURROGATE_MAX 0xaf /**< utf8 high surrogate maximum */ +#define LIT_UTF8_LOW_SURROGATE_MIN 0xb0 /**< utf8 low surrogate minimum */ +#define LIT_UTF8_LOW_SURROGATE_MAX 0xbf /**< utf8 low surrogate maximum */ +#define LIT_UTF8_1_BYTE_MAX 0xf4 /**< utf8 first byte maximum */ +#define LIT_UTF8_2_BYTE_MAX 0x8f /**< max second byte after 0xf4; max overlong second byte after 4 byte marker */ +#define LIT_UTF8_VALID_TWO_BYTE_START 0xc2 /**< smallest first byte of a multi-byte utf8 sequence */ + /** * Validate utf-8 string * @@ -31,89 +40,70 @@ lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string * lit_utf8_size_t buf_size, /**< string size */ bool is_strict) /**< true if surrogate pairs are not allowed */ { - lit_utf8_size_t idx = 0; + const unsigned char *end = buf_size + utf8_buf_p; - bool is_prev_code_point_high_surrogate = false; - while (idx < buf_size) + const unsigned char *idx = (const unsigned char *) utf8_buf_p; + + while (idx < end) { - lit_utf8_byte_t c = utf8_buf_p[idx++]; - if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER) + const uint8_t first_byte = *idx++; + + if (first_byte < LIT_UTF8_EXTRA_BYTE_MARKER) /* single byte character */ { - is_prev_code_point_high_surrogate = false; continue; } - lit_code_point_t code_point = 0; - lit_code_point_t min_code_point = 0; - lit_utf8_size_t extra_bytes_count; - if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER) + if (first_byte < LIT_UTF8_VALID_TWO_BYTE_START || idx >= end) /* bad first byte or truncated */ { - extra_bytes_count = 1; - min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN; - code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK)); - } - else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER) - { - extra_bytes_count = 
2; - min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN; - code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK)); - } - else if ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER) - { - extra_bytes_count = 3; - min_code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN; - code_point = ((uint32_t) (c & LIT_UTF8_LAST_3_BITS_MASK)); - } - else - { - /* utf-8 string could not contain 5- and 6-byte sequences. */ return false; } - if (idx + extra_bytes_count > buf_size) + const uint8_t second_byte = *idx++; + + if ((second_byte & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER) { - /* utf-8 string breaks in the middle */ return false; } - for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset) + if (first_byte < LIT_UTF8_3_BYTE_MARKER) /* two byte sequence is complete */ { - c = utf8_buf_p[idx + offset]; - if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER) + continue; + } + + if (idx >= end || (*idx++ & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER) + { + return false; + } + + if (first_byte < LIT_UTF8_4_BYTE_MARKER) + { + if (first_byte == LIT_UTF8_3_BYTE_MARKER && (second_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER) { - /* invalid continuation byte */ return false; } - code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES; - code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK); - } - if (code_point < min_code_point - || code_point > LIT_UNICODE_CODE_POINT_MAX) - { - /* utf-8 string doesn't encode valid unicode code point */ - return false; - } - - if (is_strict) - { - is_prev_code_point_high_surrogate = false; - - if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN - && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX) + if (is_strict + && first_byte == LIT_UTF8_SURROGATE_MARKER + && second_byte >= LIT_UTF8_HIGH_SURROGATE_MIN + && second_byte <= LIT_UTF8_HIGH_SURROGATE_MAX + && end - idx >= 3 + && idx[0] == LIT_UTF8_SURROGATE_MARKER + && idx[1] >= LIT_UTF8_LOW_SURROGATE_MIN + && idx[1] <= LIT_UTF8_LOW_SURROGATE_MAX) /* high surrogate followed by low surrogate */ { - is_prev_code_point_high_surrogate = true; - } - else if 
(code_point >= LIT_UTF16_LOW_SURROGATE_MIN - && code_point <= LIT_UTF16_LOW_SURROGATE_MAX - && is_prev_code_point_high_surrogate) - { - /* sequence of high and low surrogate is not allowed */ return false; } + continue; } - idx += extra_bytes_count; + if (idx >= end + || first_byte > LIT_UTF8_1_BYTE_MAX + || (first_byte == LIT_UTF8_4_BYTE_MARKER && second_byte <= LIT_UTF8_2_BYTE_MAX) /* overlong */ + || (first_byte == LIT_UTF8_1_BYTE_MAX && second_byte > LIT_UTF8_2_BYTE_MAX) + || (*idx++ & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER) + { + return false; + } } return true; diff --git a/tests/unit-core/test-strings.c b/tests/unit-core/test-strings.c index 7511c8640..0f756fe03 100644 --- a/tests/unit-core/test-strings.c +++ b/tests/unit-core/test-strings.c @@ -215,6 +215,46 @@ main (void) TEST_ASSERT (res_buf[1] == 0x9F); TEST_ASSERT (res_buf[2] == 0xBF); + /* Ascii string */ + lit_utf8_byte_t utf8_string_ascii[] = {'G','o','o','d','b','y','e'}; + TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_ascii, sizeof (utf8_string_ascii), true)); + + /* Control character */ + lit_utf8_byte_t utf8_string_control[] = {0x00}; + TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_control, sizeof (utf8_string_control), true)); + + /* 3 byte characters */ + lit_utf8_byte_t utf8_string_3byte[] = {0xe4, 0xbd, 0xa0, 0xe5, 0xa5, 0xbd, 0xe4, 0xb8, 0x96, 0xe7, 0x95, 0x8c}; + TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_3byte, sizeof (utf8_string_3byte), true)); + + /* 4 byte characters */ + lit_utf8_byte_t utf8_string_4byte[] = {0xf0, 0x90, 0x80, 0x80, 0xf0, 0x9f, 0xa7, 0xbf}; + TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_4byte, sizeof (utf8_string_4byte), true)); + + /* Overlong 4 byte character */ + lit_utf8_byte_t utf8_string_overlong[] = {0xf0, 0x8f, 0xbf, 0xbf}; + TEST_ASSERT (!lit_is_valid_utf8_string (utf8_string_overlong, sizeof (utf8_string_overlong), true)); + + /* Invalid continuation byte */ + lit_utf8_byte_t utf8_string_invalid[] = {0xa0}; + TEST_ASSERT (!lit_is_valid_utf8_string (utf8_string_invalid, sizeof (utf8_string_invalid), true)); + + /* Isolated high surrogate */ + lit_utf8_byte_t utf8_string_high[] = {0xed, 0xa0, 0x80}; + TEST_ASSERT 
(lit_is_valid_utf8_string (utf8_string_high, sizeof (utf8_string_high), true)); + + /* Isolated low surrogate */ + lit_utf8_byte_t utf8_string_low[] = {0xed, 0xbf, 0xbf}; + TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_low, sizeof (utf8_string_low), true)); + + /* Correct pair of surrogates in strict mode */ + lit_utf8_byte_t utf8_string_surrogates_strict[] = {0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf}; + TEST_ASSERT (!lit_is_valid_utf8_string (utf8_string_surrogates_strict, sizeof (utf8_string_surrogates_strict), true)); + + /* Correct pair of surrogates */ + lit_utf8_byte_t utf8_string_surrogates[] = {0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf}; + TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_surrogates, sizeof (utf8_string_surrogates), false)); + ecma_finalize (); jmem_finalize ();