Implement \u{hex} support. (#3447)

A large rework was needed because surrogate pairs must be combined. Currently only the range 0x10C80..0x10CF2 is accepted as valid identifier characters from outside the basic plane. JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg.u-szeged@partner.samsung.com
2019-12-16 11:26:02 +01:00
parent 1db16c3a1c
commit 40d930d62c
22 changed files with 765 additions and 370 deletions
@@ -21,6 +21,39 @@

 #include "test-common.h"

+static lit_code_point_t /* test helper: accumulates the hex digits of a zero-terminated byte string into one value */
+lexer_hex_to_character (const uint8_t *source_p) /**< current source position */
+{
+  lit_code_point_t result = 0;
+
+  do /* do-while: the first byte is always consumed before the terminator is checked */
+  {
+    uint32_t byte = *source_p++;
+
+    result <<= 4; /* make room for the next 4-bit digit; no limit on the digit count is enforced here */
+
+    if (byte >= LIT_CHAR_0 && byte <= LIT_CHAR_9)
+    {
+      result += byte - LIT_CHAR_0;
+    }
+    else
+    {
+      byte = LEXER_TO_ASCII_LOWERCASE (byte); /* presumably folds upper-case letters to lower-case; macro is defined elsewhere - confirm */
+      if (byte >= LIT_CHAR_LOWERCASE_A && byte <= LIT_CHAR_LOWERCASE_F)
+      {
+        result += byte - (LIT_CHAR_LOWERCASE_A - 10); /* LIT_CHAR_LOWERCASE_A contributes 10, each following letter one more */
+      }
+      else
+      {
+        return UINT32_MAX; /* byte is outside both accepted ranges: report failure */
+      }
+    }
+  }
+  while (*source_p); /* stop once the next byte is zero */
+
+  return result;
+} /* lexer_hex_to_character */
+
 int
 main (void)
 {
@@ -29,50 +62,59 @@ main (void)
  jmem_init ();
  ecma_init ();

-  const uint8_t _1_byte_long1[] = "\\u007F";
-  const uint8_t _1_byte_long2[] = "\\u0000";
-  const uint8_t _1_byte_long3[] = "\\u0065";
+  const uint8_t _1_byte_long1[] = "007F";
+  const uint8_t _1_byte_long2[] = "0000";
+  const uint8_t _1_byte_long3[] = "0065";

-  const uint8_t _2_byte_long1[] = "\\u008F";
-  const uint8_t _2_byte_long2[] = "\\u00FF";
-  const uint8_t _2_byte_long3[] = "\\u07FF";
+  const uint8_t _2_byte_long1[] = "008F";
+  const uint8_t _2_byte_long2[] = "00FF";
+  const uint8_t _2_byte_long3[] = "07FF";

-  const uint8_t _3_byte_long1[] = "\\u08FF";
-  const uint8_t _3_byte_long2[] = "\\u0FFF";
-  const uint8_t _3_byte_long3[] = "\\uFFFF";
+  const uint8_t _3_byte_long1[] = "08FF";
+  const uint8_t _3_byte_long2[] = "0FFF";
+  const uint8_t _3_byte_long3[] = "FFFF";
+
+  const uint8_t _6_byte_long1[] = "10000";
+  const uint8_t _6_byte_long2[] = "10FFFF";

  size_t length;

  /* Test 1-byte-long unicode sequences. */
-  length = lit_char_get_utf8_length (lexer_hex_to_character (0, _1_byte_long1 + 2, 4));
+  length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_1_byte_long1));
  TEST_ASSERT (length == 1);

-  length = lit_char_get_utf8_length (lexer_hex_to_character (0, _1_byte_long2 + 2, 4));
+  length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_1_byte_long2));
  TEST_ASSERT (length == 1);

-  length = lit_char_get_utf8_length (lexer_hex_to_character (0, _1_byte_long3 + 2, 4));
+  length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_1_byte_long3));
  TEST_ASSERT (length == 1);

  /* Test 2-byte-long unicode sequences. */
-  length = lit_char_get_utf8_length (lexer_hex_to_character (0, _2_byte_long1 + 2, 4));
+  length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_2_byte_long1));
  TEST_ASSERT (length == 2);

-  length = lit_char_get_utf8_length (lexer_hex_to_character (0, _2_byte_long2 + 2, 4));
+  length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_2_byte_long2));
  TEST_ASSERT (length == 2);

-  length = lit_char_get_utf8_length (lexer_hex_to_character (0, _2_byte_long3 + 2, 4));
+  length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_2_byte_long3));
  TEST_ASSERT (length == 2);

  /* Test 3-byte-long unicode sequences. */
-  length = lit_char_get_utf8_length (lexer_hex_to_character (0, _3_byte_long1 + 2, 4));
-  TEST_ASSERT (length != 2);
-
-  length = lit_char_get_utf8_length (lexer_hex_to_character (0, _3_byte_long2 + 2, 4));
+  length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_3_byte_long1));
  TEST_ASSERT (length == 3);

-  length = lit_char_get_utf8_length (lexer_hex_to_character (0, _3_byte_long3 + 2, 4));
+  length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_3_byte_long2));
  TEST_ASSERT (length == 3);

+  length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_3_byte_long3));
+  TEST_ASSERT (length == 3);
+
+  length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_6_byte_long1));
+  TEST_ASSERT (length == 6);
+
+  length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_6_byte_long2));
+  TEST_ASSERT (length == 6);
+
  ecma_finalize ();
  jmem_finalize ();

@@ -131,7 +131,7 @@ main (void)

    while (curr_p < end_p)
    {
-      code_units[code_units_count] = lit_utf8_peek_next (curr_p);
+      code_units[code_units_count] = lit_cesu8_peek_next (curr_p);
      saved_positions[code_units_count] = curr_p;
      code_units_count++;
      calculated_length++;
@@ -147,7 +147,7 @@ main (void)
      {
        ecma_length_t index = (ecma_length_t) rand () % code_units_count;
        curr_p = saved_positions[index];
-        TEST_ASSERT (lit_utf8_peek_next (curr_p) == code_units[index]);
+        TEST_ASSERT (lit_cesu8_peek_next (curr_p) == code_units[index]);
      }
    }

@@ -156,7 +156,7 @@ main (void)
    {
      TEST_ASSERT (code_units_count > 0);
      calculated_length--;
-      TEST_ASSERT (code_units[calculated_length] == lit_utf8_peek_prev (curr_p));
+      TEST_ASSERT (code_units[calculated_length] == lit_cesu8_peek_prev (curr_p));
      lit_utf8_decr (&curr_p);
    }

@@ -164,7 +164,7 @@ main (void)

    while (curr_p < end_p)
    {
-      ecma_char_t code_unit = lit_utf8_read_next (&curr_p);
+      ecma_char_t code_unit = lit_cesu8_read_next (&curr_p);
      TEST_ASSERT (code_unit == code_units[calculated_length]);
      calculated_length++;
    }
@@ -175,7 +175,7 @@ main (void)
    {
      TEST_ASSERT (code_units_count > 0);
      calculated_length--;
-      TEST_ASSERT (code_units[calculated_length] == lit_utf8_read_prev (&curr_p));
+      TEST_ASSERT (code_units[calculated_length] == lit_cesu8_read_prev (&curr_p));
    }

    TEST_ASSERT (calculated_length == 0);
@@ -0,0 +1,61 @@
+/* Copyright JS Foundation and other contributors, http://js.foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "jerryscript.h"
+#include "test-common.h"
+
+static bool /* returns true when parsing script_p yields an error value, false otherwise */
+test_syntax_error (char *script_p) /**< script */
+{
+  jerry_value_t parse_result = jerry_parse (NULL, /* NOTE(review): presumably the resource name - confirm against jerry_parse */
+                                            0, /* NOTE(review): presumably the resource name length - confirm */
+                                            (const jerry_char_t *) script_p,
+                                            strlen (script_p), /* source size excludes the terminating zero */
+                                            JERRY_PARSE_NO_OPTS);
+
+  bool result = false;
+
+  if (jerry_value_is_error (parse_result))
+  {
+    result = true;
+    TEST_ASSERT (jerry_get_error_type (parse_result) == JERRY_ERROR_SYNTAX); /* any reported error must be a syntax error */
+  }
+
+  jerry_release_value (parse_result); /* released on both the error and the non-error path */
+  return result;
+} /* test_syntax_error */
+
+int
+main (void)
+{
+  jerry_init (JERRY_INIT_EMPTY);
+
+  if (!test_syntax_error ("\\u{61}")) /* the checks below run only when a \u{hex} escape parses without error */
+  {
+    TEST_ASSERT (!test_syntax_error ("\xF0\x90\xB2\x80: break \\u{10C80}")); /* label is U+10C80 as 4-byte UTF-8; break names it via \u{hex} */
+    /* The \u surrogate pairs are ignored. The \u{hex} form must be used. */
+    TEST_ASSERT (test_syntax_error ("\xF0\x90\xB2\x80: break \\ud803\\udc80"));
+    /* The utf8 code point and the cesu8 surrogate pair must match. */
+    TEST_ASSERT (!test_syntax_error ("\xF0\x90\xB2\x80: break \xed\xa0\x83\xed\xb2\x80")); /* bytes after break encode the surrogates D803 and DC80 */
+
+    TEST_ASSERT (!test_syntax_error ("$\xF0\x90\xB2\x80$: break $\\u{10C80}$")); /* same three checks with the character between two '$' */
+    TEST_ASSERT (test_syntax_error ("$\xF0\x90\xB2\x80$: break $\\ud803\\udc80$"));
+    TEST_ASSERT (!test_syntax_error ("$\xF0\x90\xB2\x80$: break $\xed\xa0\x83\xed\xb2\x80$"));
+  }
+
+  jerry_cleanup ();
+
+  return 0;
+} /* main */