Add preprocessor definitions for various character codes referenced in ECMA-262, and helpers for checking if a character falls into an ECMA-defined character category; remove lit_char_is_carriage_return and lit_char_is_new_line helpers.

JerryScript-DCO-1.0-Signed-off-by: Ruben Ayrapetyan r.ayrapetyan@samsung.com
2015-07-01 21:57:15 +03:00
parent 7d53133fcb
commit 2a78d24fce
3 changed files with 419 additions and 48 deletions
@@ -16,44 +16,242 @@
 #include "lit-char-helpers.h"

 /**
- * Check if specified character is the newline character
+ * Check if specified character is one of the Format-Control characters
 *
- * @return true - if the character is "<LF>" character according to ECMA-262 v5, Table 3,
+ * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 1,
 *         false - otherwise.
 */
 bool
-lit_char_is_new_line (ecma_char_t c) /**< code unit */
+lit_char_is_format_control (ecma_char_t c) /**< code unit */
 {
-  return (c == '\x0A');
-} /* lit_char_is_new_line */
+  return (c == LIT_CHAR_ZWNJ
+          || c == LIT_CHAR_ZWJ
+          || c == LIT_CHAR_BOM);
+} /* lit_char_is_format_control */

 /**
- * Check if specified character the carriage return character
+ * Check if specified character is the Space Separator character
 *
- * @return true - if the character is "<CR>" character according to ECMA-262 v5, Table 3,
+ * See also:
+ *          ECMA-262 v5, Table 2
+ *
+ * @return true - if the character falls into "Space, Separator" ("Zs") character category,
 *         false - otherwise.
 */
 bool
-lit_char_is_carriage_return (ecma_char_t c) /**< code unit */
+lit_char_is_space_separator (ecma_char_t c) /**< code unit */
 {
-  return (c == '\x0D');
-} /* lit_char_is_carriage_return */
+  /* Zs */
+#define LIT_UNICODE_RANGE_ZS(range_begin, range_end) \
+  if (c >= (range_begin) && c <= (range_end)) \
+  { \
+    return true; \
+  }
+#include "lit-unicode-ranges.inc.h"
+
+  return false;
+} /* lit_char_is_space_separator */

 /**
- * Check if specified character is one of LineTerminator (ECMA-262 v5, Table 3) characters
+ * Check if specified character is one of the Whitespace characters
 *
- * @return true - if the character is one of LineTerminator characters,
+ * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 2,
+ *         false - otherwise.
+ */
+bool
+lit_char_is_white_space (ecma_char_t c) /**< code unit */
+{
+  return (c == LIT_CHAR_TAB
+          || c == LIT_CHAR_VTAB
+          || c == LIT_CHAR_FF
+          || c == LIT_CHAR_SP
+          || c == LIT_CHAR_NBSP
+          || c == LIT_CHAR_BOM
+          || lit_char_is_space_separator (c));
+} /* lit_char_is_white_space */
+
+/**
+ * Check if specified character is one of LineTerminator characters
+ *
+ * @return true - if the character is one of characters, listed in ECMA-262 v5, Table 3,
 *         false - otherwise.
 */
 bool
 lit_char_is_line_terminator (ecma_char_t c) /**< code unit */
 {
-  /* FIXME: Handle <LS> and <PS> (ECMA-262 v5, 7.3, Table 3) when Unicode would be supported */
-
-  return (lit_char_is_carriage_return (c)
-          || lit_char_is_new_line (c));
+  return (c == LIT_CHAR_LF
+          || c == LIT_CHAR_CR
+          || c == LIT_CHAR_LS
+          || c == LIT_CHAR_PS);
 } /* lit_char_is_line_terminator */

+/**
+ * Check if specified character is a unicode letter
+ *
+ * Note:
+ *      Unicode letter is a character, included into one of the following categories:
+ *       - Uppercase letter (Lu);
+ *       - Lowercase letter (Ll);
+ *       - Titlecase letter (Lt);
+ *       - Modifier letter (Lm);
+ *       - Other letter (Lo);
+ *       - Letter number (Nl).
+ *
+ * See also:
+ *          ECMA-262 v5, 7.6
+ *
+ * @return true - if specified character falls into one of the listed categories,
+ *         false - otherwise.
+ */
+bool
+lit_char_is_unicode_letter (ecma_char_t c) /**< code unit */
+{
+  /* Fast path for ASCII-defined letters */
+  if ((c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
+      || (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END))
+  {
+    return true;
+  }
+
+  /* Lu */
+#define LIT_UNICODE_RANGE_LU(range_begin, range_end) \
+  if (c >= (range_begin) && c <= (range_end)) \
+  { \
+    return true; \
+  }
+
+  /* Ll */
+#define LIT_UNICODE_RANGE_LL(range_begin, range_end) \
+  if (c >= (range_begin) && c <= (range_end)) \
+  { \
+    return true; \
+  }
+
+  /* Lt */
+#define LIT_UNICODE_RANGE_LT(range_begin, range_end) \
+  if (c >= (range_begin) && c <= (range_end)) \
+  { \
+    return true; \
+  }
+
+  /* Lm */
+#define LIT_UNICODE_RANGE_LM(range_begin, range_end) \
+  if (c >= (range_begin) && c <= (range_end)) \
+  { \
+    return true; \
+  }
+
+  /* Lo */
+#define LIT_UNICODE_RANGE_LO(range_begin, range_end) \
+  if (c >= (range_begin) && c <= (range_end)) \
+  { \
+    return true; \
+  }
+
+  /* Nl */
+#define LIT_UNICODE_RANGE_NL(range_begin, range_end) \
+  if (c >= (range_begin) && c <= (range_end)) \
+  { \
+    return true; \
+  }
+
+#include "lit-unicode-ranges.inc.h"
+
+  return false;
+} /* lit_char_is_unicode_letter */
+
+/**
+ * Check if specified character is a unicode combining mark
+ *
+ * Note:
+ *      Unicode combining mark is a character, included into one of the following categories:
+ *       - Non-spacing mark (Mn);
+ *       - Combining spacing mark (Mc).
+ *
+ * See also:
+ *          ECMA-262 v5, 7.6
+ *
+ * @return true - if specified character falls into one of the listed categories,
+ *         false - otherwise.
+ */
+bool
+lit_char_is_unicode_combining_mark (ecma_char_t c) /**< code unit */
+{
+  /* Mn */
+#define LIT_UNICODE_RANGE_MN(range_begin, range_end) \
+  if (c >= (range_begin) && c <= (range_end)) \
+  { \
+    return true; \
+  }
+
+  /* Mc */
+#define LIT_UNICODE_RANGE_MC(range_begin, range_end) \
+  if (c >= (range_begin) && c <= (range_end)) \
+  { \
+    return true; \
+  }
+
+#include "lit-unicode-ranges.inc.h"
+
+  return false;
+} /* lit_char_is_unicode_combining_mark */
+
+/**
+ * Check if specified character is a unicode digit
+ *
+ * Note:
+ *      Unicode digit is a character, included into the following category:
+ *       - Decimal number (Nd).
+ *
+ * See also:
+ *          ECMA-262 v5, 7.6
+ *
+ * @return true - if specified character falls into the specified category,
+ *         false - otherwise.
+ */
+bool
+lit_char_is_unicode_digit (ecma_char_t c) /**< code unit */
+{
+  /* Nd */
+#define LIT_UNICODE_RANGE_ND(range_begin, range_end) \
+  if (c >= (range_begin) && c <= (range_end)) \
+  { \
+    return true; \
+  }
+
+#include "lit-unicode-ranges.inc.h"
+
+  return false;
+} /* lit_char_is_unicode_digit */
+
+/**
+ * Check if specified character is a unicode connector punctuation
+ *
+ * Note:
+ *      Unicode connector punctuation is a character, included into the following category:
+ *       - Connector punctuation (Pc).
+ *
+ * See also:
+ *          ECMA-262 v5, 7.6
+ *
+ * @return true - if specified character falls into the specified category,
+ *         false - otherwise.
+ */
+bool
+lit_char_is_unicode_connector_punctuation (ecma_char_t c) /**< code unit */
+{
+  /* Pc */
+#define LIT_UNICODE_RANGE_PC(range_begin, range_end) \
+  if (c >= (range_begin) && c <= (range_end)) \
+  { \
+    return true; \
+  }
+
+#include "lit-unicode-ranges.inc.h"
+
+  return false;
+} /* lit_char_is_unicode_connector_punctuation */
+
 /**
 * Check if specified character is a word character (part of IsWordChar abstract operation)
 *
@@ -65,10 +263,10 @@ lit_char_is_line_terminator (ecma_char_t c) /**< code unit */
 bool
 lit_char_is_word_char (ecma_char_t c) /**< code unit */
 {
-  if ((c >= 'a' && c <= 'z')
-      || (c >= 'A' && c <= 'Z')
-      || (c >= '0' && c <= '9')
-      || c == '_')
+  if ((c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
+      || (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
+      || (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
+      || c == LIT_CHAR_UNDERSCORE)
  {
    return true;
  }
@@ -85,30 +283,18 @@ uint32_t
 lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to
                                     *    one of [0-9A-Fa-f] characters */
 {
-  switch (c)
+  if (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
  {
-    case '0': return 0x0;
-    case '1': return 0x1;
-    case '2': return 0x2;
-    case '3': return 0x3;
-    case '4': return 0x4;
-    case '5': return 0x5;
-    case '6': return 0x6;
-    case '7': return 0x7;
-    case '8': return 0x8;
-    case '9': return 0x9;
-    case 'a':
-    case 'A': return 0xA;
-    case 'b':
-    case 'B': return 0xB;
-    case 'c':
-    case 'C': return 0xC;
-    case 'd':
-    case 'D': return 0xD;
-    case 'e':
-    case 'E': return 0xE;
-    case 'f':
-    case 'F': return 0xF;
-    default: JERRY_UNREACHABLE ();
+    return (uint32_t) (c - LIT_CHAR_ASCII_DIGITS_BEGIN);
+  }
+  else if (c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
+  {
+    return (uint32_t) (c - LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN + 10);
+  }
+  else
+  {
+    JERRY_ASSERT (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_END);
+
+    return (uint32_t) (c - LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN + 10);
  }
 } /* lit_char_hex_to_int */