Implement \u{hex} support. (#3447)

A large rework because surrogate pairs must be combined.

Currently, only the range 0x10C80..0x10CF2 is accepted as valid identifier characters from the non-basic (supplementary) planes.

JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg.u-szeged@partner.samsung.com
This commit is contained in:
Zoltan Herczeg
2019-12-16 11:26:02 +01:00
committed by Dániel Bátyai
parent 1db16c3a1c
commit 40d930d62c
22 changed files with 765 additions and 370 deletions
+62 -20
View File
@@ -21,6 +21,39 @@
#include "test-common.h"
/**
 * Decode a zero-terminated run of hexadecimal digits into a code point.
 *
 * Upper and lower case digits are both accepted. The first byte is always
 * examined before the terminator is checked, and each following byte is
 * consumed until a zero byte is reached.
 *
 * @return the accumulated value - if every consumed byte is a hexadecimal digit
 *         UINT32_MAX - as soon as a non-hexadecimal byte is consumed
 */
static lit_code_point_t
lexer_hex_to_character (const uint8_t *source_p) /**< current source position */
{
  lit_code_point_t code_point = 0;

  do
  {
    uint32_t ch = *source_p++;
    uint32_t digit;

    if (ch >= LIT_CHAR_0 && ch <= LIT_CHAR_9)
    {
      digit = ch - LIT_CHAR_0;
    }
    else
    {
      /* Fold the letter case so that a single range check covers A-F and a-f. */
      ch = LEXER_TO_ASCII_LOWERCASE (ch);

      if (ch < LIT_CHAR_LOWERCASE_A || ch > LIT_CHAR_LOWERCASE_F)
      {
        return UINT32_MAX;
      }

      digit = ch - (LIT_CHAR_LOWERCASE_A - 10);
    }

    /* Make room for the next nibble, then append it. */
    code_point = (code_point << 4) + digit;
  }
  while (*source_p != 0);

  return code_point;
} /* lexer_hex_to_character */
int
main (void)
{
@@ -29,50 +62,59 @@ main (void)
jmem_init ();
ecma_init ();
const uint8_t _1_byte_long1[] = "\\u007F";
const uint8_t _1_byte_long2[] = "\\u0000";
const uint8_t _1_byte_long3[] = "\\u0065";
const uint8_t _1_byte_long1[] = "007F";
const uint8_t _1_byte_long2[] = "0000";
const uint8_t _1_byte_long3[] = "0065";
const uint8_t _2_byte_long1[] = "\\u008F";
const uint8_t _2_byte_long2[] = "\\u00FF";
const uint8_t _2_byte_long3[] = "\\u07FF";
const uint8_t _2_byte_long1[] = "008F";
const uint8_t _2_byte_long2[] = "00FF";
const uint8_t _2_byte_long3[] = "07FF";
const uint8_t _3_byte_long1[] = "\\u08FF";
const uint8_t _3_byte_long2[] = "\\u0FFF";
const uint8_t _3_byte_long3[] = "\\uFFFF";
const uint8_t _3_byte_long1[] = "08FF";
const uint8_t _3_byte_long2[] = "0FFF";
const uint8_t _3_byte_long3[] = "FFFF";
const uint8_t _6_byte_long1[] = "10000";
const uint8_t _6_byte_long2[] = "10FFFF";
size_t length;
/* Test 1-byte-long unicode sequences. */
length = lit_char_get_utf8_length (lexer_hex_to_character (0, _1_byte_long1 + 2, 4));
length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_1_byte_long1));
TEST_ASSERT (length == 1);
length = lit_char_get_utf8_length (lexer_hex_to_character (0, _1_byte_long2 + 2, 4));
length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_1_byte_long2));
TEST_ASSERT (length == 1);
length = lit_char_get_utf8_length (lexer_hex_to_character (0, _1_byte_long3 + 2, 4));
length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_1_byte_long3));
TEST_ASSERT (length == 1);
/* Test 2-byte-long unicode sequences. */
length = lit_char_get_utf8_length (lexer_hex_to_character (0, _2_byte_long1 + 2, 4));
length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_2_byte_long1));
TEST_ASSERT (length == 2);
length = lit_char_get_utf8_length (lexer_hex_to_character (0, _2_byte_long2 + 2, 4));
length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_2_byte_long2));
TEST_ASSERT (length == 2);
length = lit_char_get_utf8_length (lexer_hex_to_character (0, _2_byte_long3 + 2, 4));
length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_2_byte_long3));
TEST_ASSERT (length == 2);
/* Test 3-byte-long unicode sequences. */
length = lit_char_get_utf8_length (lexer_hex_to_character (0, _3_byte_long1 + 2, 4));
TEST_ASSERT (length != 2);
length = lit_char_get_utf8_length (lexer_hex_to_character (0, _3_byte_long2 + 2, 4));
length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_3_byte_long1));
TEST_ASSERT (length == 3);
length = lit_char_get_utf8_length (lexer_hex_to_character (0, _3_byte_long3 + 2, 4));
length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_3_byte_long2));
TEST_ASSERT (length == 3);
length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_3_byte_long3));
TEST_ASSERT (length == 3);
length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_6_byte_long1));
TEST_ASSERT (length == 6);
length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_6_byte_long2));
TEST_ASSERT (length == 6);
ecma_finalize ();
jmem_finalize ();
+5 -5
View File
@@ -131,7 +131,7 @@ main (void)
while (curr_p < end_p)
{
code_units[code_units_count] = lit_utf8_peek_next (curr_p);
code_units[code_units_count] = lit_cesu8_peek_next (curr_p);
saved_positions[code_units_count] = curr_p;
code_units_count++;
calculated_length++;
@@ -147,7 +147,7 @@ main (void)
{
ecma_length_t index = (ecma_length_t) rand () % code_units_count;
curr_p = saved_positions[index];
TEST_ASSERT (lit_utf8_peek_next (curr_p) == code_units[index]);
TEST_ASSERT (lit_cesu8_peek_next (curr_p) == code_units[index]);
}
}
@@ -156,7 +156,7 @@ main (void)
{
TEST_ASSERT (code_units_count > 0);
calculated_length--;
TEST_ASSERT (code_units[calculated_length] == lit_utf8_peek_prev (curr_p));
TEST_ASSERT (code_units[calculated_length] == lit_cesu8_peek_prev (curr_p));
lit_utf8_decr (&curr_p);
}
@@ -164,7 +164,7 @@ main (void)
while (curr_p < end_p)
{
ecma_char_t code_unit = lit_utf8_read_next (&curr_p);
ecma_char_t code_unit = lit_cesu8_read_next (&curr_p);
TEST_ASSERT (code_unit == code_units[calculated_length]);
calculated_length++;
}
@@ -175,7 +175,7 @@ main (void)
{
TEST_ASSERT (code_units_count > 0);
calculated_length--;
TEST_ASSERT (code_units[calculated_length] == lit_utf8_read_prev (&curr_p));
TEST_ASSERT (code_units[calculated_length] == lit_cesu8_read_prev (&curr_p));
}
TEST_ASSERT (calculated_length == 0);
+61
View File
@@ -0,0 +1,61 @@
/* Copyright JS Foundation and other contributors, http://js.foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "jerryscript.h"
#include "test-common.h"
/**
 * Parse a zero-terminated script and report whether the parser rejected it.
 *
 * The script is only read, so string literals can be passed directly.
 * When parsing fails, the failure is asserted to be a syntax error.
 *
 * @return true - if parsing the script produced an error value
 *         false - otherwise
 */
static bool
test_syntax_error (const char *script_p) /**< script */
{
  jerry_value_t parse_result = jerry_parse (NULL,
                                            0,
                                            (const jerry_char_t *) script_p,
                                            strlen (script_p),
                                            JERRY_PARSE_NO_OPTS);

  const bool is_error = jerry_value_is_error (parse_result);

  if (is_error)
  {
    /* A rejected script must fail with a syntax error, not any other error type. */
    TEST_ASSERT (jerry_get_error_type (parse_result) == JERRY_ERROR_SYNTAX);
  }

  /* The parse result is released on both the success and the error path. */
  jerry_release_value (parse_result);

  return is_error;
} /* test_syntax_error */
/**
 * Check how code point U+10C80 is accepted in labels when it is written as
 * raw utf8 bytes, as a \u{hex} escape, as two \u escapes and as a cesu8
 * surrogate pair.
 *
 * @return 0 - the checks themselves report failures through TEST_ASSERT
 */
int
main (void)
{
  jerry_init (JERRY_INIT_EMPTY);

  /* Probe first: the checks below only run when the \u{hex} form itself parses. */
  const bool brace_escape_parses = !test_syntax_error ("\\u{61}");

  if (brace_escape_parses)
  {
    /* Label declared as a 4 byte utf8 sequence, referenced through \u{hex}. */
    TEST_ASSERT (!test_syntax_error ("\xF0\x90\xB2\x80: break \\u{10C80}"));

    /* The \u surrogate pairs are ignored. The \u{hex} form must be used. */
    TEST_ASSERT (test_syntax_error ("\xF0\x90\xB2\x80: break \\ud803\\udc80"));

    /* The utf8 code point and the cesu8 surrogate pair must match. */
    TEST_ASSERT (!test_syntax_error ("\xF0\x90\xB2\x80: break \xed\xa0\x83\xed\xb2\x80"));

    /* The same three cases with the code point in the middle of the identifier. */
    TEST_ASSERT (!test_syntax_error ("$\xF0\x90\xB2\x80$: break $\\u{10C80}$"));
    TEST_ASSERT (test_syntax_error ("$\xF0\x90\xB2\x80$: break $\\ud803\\udc80$"));
    TEST_ASSERT (!test_syntax_error ("$\xF0\x90\xB2\x80$: break $\xed\xa0\x83\xed\xb2\x80$"));
  }

  jerry_cleanup ();

  return 0;
} /* main */