Implement \u{hex} support. (#3447)

A large rework because surrogate pairs must be combined. Currently only the 0x10C80..0x10CF2 is accepted as valid identifier character from the non-basic plane. JerryScript-DCO-1.0-Signed-off-by: Zoltan Herczeg zherczeg.u-szeged@partner.samsung.com
2019-12-16 11:26:02 +01:00
parent 1db16c3a1c
commit 40d930d62c
22 changed files with 765 additions and 370 deletions
@@ -200,73 +200,33 @@ lit_char_is_unicode_non_letter_ident_part (ecma_char_t c) /**< code unit */
                                        NUM_OF_ELEMENTS (lit_unicode_non_letter_ident_part_chars)));
 } /* lit_char_is_unicode_non_letter_ident_part */

-/**
- * Checks whether the next UTF8 character is a valid identifier start.
- *
- * @return true if it is.
- */
-bool
-lit_char_is_identifier_start (const uint8_t *src_p) /**< pointer to a vaild UTF8 character */
-{
-  if (*src_p <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
-  {
-    return lit_char_is_identifier_start_character (*src_p);
-  }
-
-  /* ECMAScript 2015 specification allows some code points in supplementary plane.
-   * However, we don't permit characters in supplementary characters as start of identifier.
-   */
-  if ((*src_p & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
-  {
-    return false;
-  }
-
-  return lit_char_is_identifier_start_character (lit_utf8_peek_next (src_p));
-} /* lit_char_is_identifier_start */
-
 /**
 * Checks whether the character is a valid identifier start.
 *
 * @return true if it is.
 */
 bool
-lit_char_is_identifier_start_character (uint16_t chr) /**< EcmaScript character */
+lit_code_point_is_identifier_start (lit_code_point_t code_point) /**< code point */
 {
  /* Fast path for ASCII-defined letters. */
-  if (chr <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
+  if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
  {
-    return ((LEXER_TO_ASCII_LOWERCASE (chr) >= LIT_CHAR_LOWERCASE_A
-             && LEXER_TO_ASCII_LOWERCASE (chr) <= LIT_CHAR_LOWERCASE_Z)
-            || chr == LIT_CHAR_DOLLAR_SIGN
-            || chr == LIT_CHAR_UNDERSCORE);
+    return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A
+             && LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z)
+            || code_point == LIT_CHAR_DOLLAR_SIGN
+            || code_point == LIT_CHAR_UNDERSCORE);
  }

-  return lit_char_is_unicode_letter (chr);
-} /* lit_char_is_identifier_start_character */
-
-/**
- * Checks whether the next UTF8 character is a valid identifier part.
- *
- * @return true if it is.
- */
-bool
-lit_char_is_identifier_part (const uint8_t *src_p) /**< pointer to a vaild UTF8 character */
-{
-  if (*src_p <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
+#if ENABLED (JERRY_ES2015)
+  if (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN)
  {
-    return lit_char_is_identifier_part_character (*src_p);
+    /* TODO: detect these ranges correctly. */
+    return (code_point >= 0x10C80 && code_point <= 0x10CF2);
  }
+#endif /* ENABLED (JERRY_ES2015) */

-  /* ECMAScript 2015 specification allows some code points in supplementary plane.
-   * However, we don't permit characters in supplementary characters as part of identifier.
-   */
-  if ((*src_p & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
-  {
-    return false;
-  }
-
-  return lit_char_is_identifier_part_character (lit_utf8_peek_next (src_p));
-} /* lit_char_is_identifier_part */
+  return lit_char_is_unicode_letter ((ecma_char_t) code_point);
+} /* lit_code_point_is_identifier_start */

 /**
 * Checks whether the character is a valid identifier part.
@@ -274,21 +234,29 @@ lit_char_is_identifier_part (const uint8_t *src_p) /**< pointer to a vaild UTF8
 * @return true if it is.
 */
 bool
-lit_char_is_identifier_part_character (uint16_t chr) /**< EcmaScript character */
+lit_code_point_is_identifier_part (lit_code_point_t code_point) /**< code point */
 {
  /* Fast path for ASCII-defined letters. */
-  if (chr <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
+  if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
  {
-    return ((LEXER_TO_ASCII_LOWERCASE (chr) >= LIT_CHAR_LOWERCASE_A
-             && LEXER_TO_ASCII_LOWERCASE (chr) <= LIT_CHAR_LOWERCASE_Z)
-            || (chr >= LIT_CHAR_0 && chr <= LIT_CHAR_9)
-            || chr == LIT_CHAR_DOLLAR_SIGN
-            || chr == LIT_CHAR_UNDERSCORE);
+    return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A
+             && LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z)
+            || (code_point >= LIT_CHAR_0 && code_point <= LIT_CHAR_9)
+            || code_point == LIT_CHAR_DOLLAR_SIGN
+            || code_point == LIT_CHAR_UNDERSCORE);
  }

-  return (lit_char_is_unicode_letter (chr)
-          || lit_char_is_unicode_non_letter_ident_part (chr));
-} /* lit_char_is_identifier_part_character */
+#if ENABLED (JERRY_ES2015)
+  if (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN)
+  {
+    /* TODO: detect these ranges correctly. */
+    return (code_point >= 0x10C80 && code_point <= 0x10CF2);
+  }
+#endif /* ENABLED (JERRY_ES2015) */
+
+  return (lit_char_is_unicode_letter ((ecma_char_t) code_point)
+          || lit_char_is_unicode_non_letter_ident_part ((ecma_char_t) code_point));
+} /* lit_code_point_is_identifier_part */

 /**
 * Check if specified character is one of OctalDigit characters (ECMA-262 v5, B.1.2)
@@ -356,30 +324,47 @@ lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to
 * @return length of the UTF8 representation.
 */
 size_t
-lit_char_to_utf8_bytes (uint8_t *dst_p, /**< destination buffer */
-                        ecma_char_t chr) /**< EcmaScript character */
+lit_code_point_to_cesu8_bytes (uint8_t *dst_p, /**< destination buffer */
+                               lit_code_point_t code_point) /**< code point */
 {
-  if (!(chr & ~LIT_UTF8_1_BYTE_CODE_POINT_MAX))
+  if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN)
  {
    /* 00000000 0xxxxxxx -> 0xxxxxxx */
-    *dst_p = (uint8_t) chr;
+    dst_p[0] = (uint8_t) code_point;
    return 1;
  }

-  if (!(chr & ~LIT_UTF8_2_BYTE_CODE_POINT_MAX))
+  if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN)
  {
    /* 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx */
-    *(dst_p++) = (uint8_t) (LIT_UTF8_2_BYTE_MARKER | ((chr >> 6) & LIT_UTF8_LAST_5_BITS_MASK));
-    *dst_p = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (chr & LIT_UTF8_LAST_6_BITS_MASK));
+    dst_p[0] = (uint8_t) (LIT_UTF8_2_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_5_BITS_MASK));
+    dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
    return 2;
  }

-  /* zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx */
-  *(dst_p++) = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | ((chr >> 12) & LIT_UTF8_LAST_4_BITS_MASK));
-  *(dst_p++) = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((chr >> 6) & LIT_UTF8_LAST_6_BITS_MASK));
-  *dst_p = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (chr & LIT_UTF8_LAST_6_BITS_MASK));
-  return 3;
-} /* lit_char_to_utf8_bytes */
+  if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN)
+  {
+    /* zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx */
+    dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | ((code_point >> 12) & LIT_UTF8_LAST_4_BITS_MASK));
+    dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_6_BITS_MASK));
+    dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
+    return 3;
+  }
+
+  JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);
+
+  code_point -= LIT_UTF8_4_BYTE_CODE_POINT_MIN;
+
+  dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd);
+  dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x20 | ((code_point >> 16) & LIT_UTF8_LAST_4_BITS_MASK));
+  dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 10) & LIT_UTF8_LAST_6_BITS_MASK));
+
+  dst_p[3] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd);
+  dst_p[4] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x30 | ((code_point >> 6) & LIT_UTF8_LAST_4_BITS_MASK));
+  dst_p[5] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
+
+  return 3 * 2;
+} /* lit_code_point_to_cesu8_bytes */

 /**
 * Returns the length of the UTF8 representation of a character.
@@ -387,23 +372,44 @@ lit_char_to_utf8_bytes (uint8_t *dst_p, /**< destination buffer */
 * @return length of the UTF8 representation.
 */
 size_t
-lit_char_get_utf8_length (ecma_char_t chr) /**< EcmaScript character */
+lit_code_point_get_cesu8_length (lit_code_point_t code_point) /**< code point */
 {
-  if (!(chr & ~LIT_UTF8_1_BYTE_CODE_POINT_MAX))
+  if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN)
  {
    /* 00000000 0xxxxxxx */
    return 1;
  }

-  if (!(chr & ~LIT_UTF8_2_BYTE_CODE_POINT_MAX))
+  if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN)
  {
    /* 00000yyy yyxxxxxx */
    return 2;
  }

-  /* zzzzyyyy yyxxxxxx */
-  return 3;
-} /* lit_char_get_utf8_length */
+  if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN)
+  {
+    /* zzzzyyyy yyxxxxxx */
+    return 3;
+  }
+
+  /* high + low surrogate */
+  return 2 * 3;
+} /* lit_code_point_get_cesu8_length */
+
+/**
+ * Convert a four byte long utf8 character to two three byte long cesu8 characters
+ */
+void
+lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, /**< destination buffer */
+                                  const uint8_t *source_p) /**< source buffer */
+{
+  lit_code_point_t code_point = ((((uint32_t) source_p[0]) & LIT_UTF8_LAST_3_BITS_MASK) << 18);
+  code_point |= ((((uint32_t) source_p[1]) & LIT_UTF8_LAST_6_BITS_MASK) << 12);
+  code_point |= ((((uint32_t) source_p[2]) & LIT_UTF8_LAST_6_BITS_MASK) << 6);
+  code_point |= (((uint32_t) source_p[3]) & LIT_UTF8_LAST_6_BITS_MASK);
+
+  lit_code_point_to_cesu8_bytes (dst_p, code_point);
+} /* lit_four_byte_utf8_char_to_cesu8 */

 /**
 * Parse the next number_of_characters hexadecimal character,
@@ -75,10 +75,8 @@ bool lit_char_is_line_terminator (ecma_char_t c);
 #define LIT_CHAR_UNDERSCORE  ((ecma_char_t) '_')  /* low line (underscore) */
 /* LIT_CHAR_BACKSLASH defined above */

-bool lit_char_is_identifier_start (const uint8_t *src_p);
-bool lit_char_is_identifier_part (const uint8_t *src_p);
-bool lit_char_is_identifier_start_character (ecma_char_t chr);
-bool lit_char_is_identifier_part_character (ecma_char_t chr);
+bool lit_code_point_is_identifier_start (lit_code_point_t code_point);
+bool lit_code_point_is_identifier_part (lit_code_point_t code_point);

 /*
 * Punctuator characters (ECMA-262 v5, 7.7)
@@ -215,8 +213,9 @@ bool lit_char_is_octal_digit (ecma_char_t c);
 bool lit_char_is_decimal_digit (ecma_char_t c);
 bool lit_char_is_hex_digit (ecma_char_t c);
 uint32_t lit_char_hex_to_int (ecma_char_t c);
-size_t lit_char_to_utf8_bytes (uint8_t *dst_p, ecma_char_t chr);
-size_t lit_char_get_utf8_length (ecma_char_t chr);
+size_t lit_code_point_to_cesu8_bytes (uint8_t *dst_p, lit_code_point_t code_point);
+size_t lit_code_point_get_cesu8_length (lit_code_point_t code_point);
+void lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, const uint8_t *source_p);

 /* read a hex encoded code point from a zero terminated buffer */
 bool lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, lit_utf8_size_t number_of_characters,
@@ -481,7 +481,7 @@ lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer wit
 * @return next code unit
 */
 ecma_char_t
-lit_utf8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
+lit_cesu8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
 {
  JERRY_ASSERT (*buf_p);
  ecma_char_t ch;
@@ -489,7 +489,7 @@ lit_utf8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with cha
  *buf_p += lit_read_code_unit_from_utf8 (*buf_p, &ch);

  return ch;
-} /* lit_utf8_read_next */
+} /* lit_cesu8_read_next */

 /**
 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
@@ -497,7 +497,7 @@ lit_utf8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with cha
 * @return previous code unit
 */
 ecma_char_t
-lit_utf8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
+lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
 {
  JERRY_ASSERT (*buf_p);
  ecma_char_t ch;
@@ -506,7 +506,7 @@ lit_utf8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with cha
  lit_read_code_unit_from_utf8 (*buf_p, &ch);

  return ch;
-} /* lit_utf8_read_prev */
+} /* lit_cesu8_read_prev */

 /**
 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
@@ -514,15 +514,15 @@ lit_utf8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with cha
 * @return next code unit
 */
 ecma_char_t
-lit_utf8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
+lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
 {
-  JERRY_ASSERT (buf_p);
+  JERRY_ASSERT (buf_p != NULL);
  ecma_char_t ch;

  lit_read_code_unit_from_utf8 (buf_p, &ch);

  return ch;
-} /* lit_utf8_peek_next */
+} /* lit_cesu8_peek_next */

 /**
 * Decodes a unicode code unit from non-empty cesu-8-encoded buffer
@@ -530,15 +530,15 @@ lit_utf8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with char
 * @return previous code unit
 */
 ecma_char_t
-lit_utf8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
+lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
 {
-  JERRY_ASSERT (buf_p);
+  JERRY_ASSERT (buf_p != NULL);
  ecma_char_t ch;

  lit_read_prev_code_unit_from_utf8 (buf_p, &ch);

  return ch;
-} /* lit_utf8_peek_prev */
+} /* lit_cesu8_peek_prev */

 /**
 * Increase cesu-8 encoded string pointer by one code unit.
@@ -46,7 +46,6 @@
 #define LIT_UTF8_2_BYTE_MARKER (0xC0)
 #define LIT_UTF8_3_BYTE_MARKER (0xE0)
 #define LIT_UTF8_4_BYTE_MARKER (0xF0)
-#define LIT_UTF8_5_BYTE_MARKER (0xF8)
 #define LIT_UTF8_EXTRA_BYTE_MARKER (0x80)

 #define LIT_UTF8_1_BYTE_MASK (0x80)
@@ -82,7 +81,7 @@
 /**
 * Byte values >= LIT_UTF8_FIRST_BYTE_MAX are not allowed in internal strings
 */
-#define LIT_UTF8_FIRST_BYTE_MAX LIT_UTF8_5_BYTE_MARKER
+#define LIT_UTF8_FIRST_BYTE_MAX (0xF8)

 /* validation */
 bool lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t buf_size);
@@ -135,10 +134,10 @@ lit_utf8_size_t lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p,
 lit_utf8_size_t lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p,
                                                   ecma_char_t *code_point);

-ecma_char_t lit_utf8_read_next (const lit_utf8_byte_t **buf_p);
-ecma_char_t lit_utf8_read_prev (const lit_utf8_byte_t **buf_p);
-ecma_char_t lit_utf8_peek_next (const lit_utf8_byte_t *buf_p);
-ecma_char_t lit_utf8_peek_prev (const lit_utf8_byte_t *buf_p);
+ecma_char_t lit_cesu8_read_next (const lit_utf8_byte_t **buf_p);
+ecma_char_t lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p);
+ecma_char_t lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p);
+ecma_char_t lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p);
 void lit_utf8_incr (const lit_utf8_byte_t **buf_p);
 void lit_utf8_decr (const lit_utf8_byte_t **buf_p);