Add core unicode functionality.

Add utf-8 processing routines.
Change ecma_char_t from char/uint16_t to uint16_t.
Apply all utf-8 processing routines.
Change char to jerry_api_char in API functions' declarations.

JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
This commit is contained in:
Andrey Shitov
2015-06-29 19:17:17 +03:00
parent c4b0cd2196
commit fd9ff8e3bd
56 changed files with 2468 additions and 1480 deletions
+14 -11
View File
@@ -20,6 +20,7 @@
#include "jrt-libc-includes.h"
#include "mem-heap.h"
#include "re-compiler.h"
#include "re-parser.h"
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN
@@ -382,7 +383,7 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
if (re_ctx_p->recursion_depth >= RE_COMPILE_RECURSION_LIMIT)
{
ret_value = ecma_raise_range_error ((const ecma_char_t *) "RegExp compiler recursion limit is exceeded.");
ret_value = ecma_raise_range_error ("RegExp compiler recursion limit is exceeded.");
return ret_value;
}
re_ctx_p->recursion_depth++;
@@ -575,7 +576,7 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
if (expect_eof)
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "Unexpected end of paren.");
ret_value = ecma_raise_syntax_error ("Unexpected end of paren.");
}
else
{
@@ -589,7 +590,7 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
{
if (!expect_eof)
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "Unexpected end of pattern.");
ret_value = ecma_raise_syntax_error ("Unexpected end of pattern.");
}
else
{
@@ -601,7 +602,7 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
}
default:
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "Unexpected RegExp token.");
ret_value = ecma_raise_syntax_error ("Unexpected RegExp token.");
return ret_value;
}
}
@@ -619,8 +620,8 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
*/
ecma_completion_value_t
re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */
ecma_string_t *pattern_str_p, /**< pattern */
uint8_t flags) /**< flags */
ecma_string_t *pattern_str_p, /**< pattern */
uint8_t flags) /**< flags */
{
ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();
re_compiler_ctx_t re_ctx;
@@ -636,10 +637,12 @@ re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */
re_ctx.bytecode_ctx_p = &bc_ctx;
ecma_length_t pattern_str_len = ecma_string_get_length (pattern_str_p);
MEM_DEFINE_LOCAL_ARRAY (pattern_start_p, pattern_str_len + 1, ecma_char_t);
ssize_t zt_str_size = (ssize_t) (sizeof (ecma_char_t) * (pattern_str_len + 1));
ecma_string_to_zt_string (pattern_str_p, pattern_start_p, zt_str_size);
lit_utf8_size_t pattern_str_size = ecma_string_get_size (pattern_str_p);
MEM_DEFINE_LOCAL_ARRAY (pattern_start_p, pattern_str_size + 1, lit_utf8_byte_t);
ecma_string_to_utf8_string (pattern_str_p, pattern_start_p, (ssize_t) pattern_str_size);
FIXME ("Update regexp compiler so that zero symbol is not needed.");
pattern_start_p[pattern_str_size] = LIT_BYTE_NULL;
re_parser_ctx_t parser_ctx;
parser_ctx.pattern_start_p = pattern_start_p;
@@ -656,7 +659,7 @@ re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */
/* 2. Check for invalid backreference */
if (re_ctx.highest_backref >= re_ctx.num_of_captures)
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "Invalid backreference.\n");
ret_value = ecma_raise_syntax_error ("Invalid backreference.\n");
}
else
{
+21 -21
View File
@@ -25,13 +25,13 @@
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN
/* FIXME: change this once Unicode support is implemented. */
#define RE_LOOKUP(str_p, lookup) (ecma_zt_string_length (str_p) > lookup ? str_p[lookup] : '\0')
/*
 * Yields the byte at index `lookup` of str_p, or '\0' when
 * lit_zt_utf8_string_size (str_p) is not greater than `lookup`.
 *
 * Both arguments are expanded more than once, so they must be free of
 * side effects. Parameters are parenthesized so that pointer expressions
 * such as `p + 1` can be passed safely.
 */
#define RE_LOOKUP(str_p, lookup) (lit_zt_utf8_string_size (str_p) > (lookup) ? (str_p)[(lookup)] : '\0')
/*
 * Moves str_p forward by `advance` elements.
 * FIXME: revisit this once Unicode support is implemented.
 */
#define RE_ADVANCE(str_p, advance) do { (str_p) += (advance); } while (0)
static ecma_char_t
get_ecma_char (ecma_char_t** char_p)
get_ecma_char (lit_utf8_byte_t **char_p)
{
/* FIXME: switch to a string iterator with Unicode support once one is implemented. */
ecma_char_t ch = **char_p;
@@ -46,7 +46,7 @@ get_ecma_char (ecma_char_t** char_p)
* Returned value must be freed with ecma_free_completion_value
*/
static ecma_completion_value_t
parse_re_iterator (ecma_char_t *pattern_p, /**< RegExp pattern */
parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */
re_token_t *re_token_p, /**< output token */
uint32_t lookup, /**< size of lookup */
uint32_t *advance_p) /**< output length of current advance */
@@ -120,7 +120,7 @@ parse_re_iterator (ecma_char_t *pattern_p, /**< RegExp pattern */
{
if (digits >= ECMA_NUMBER_MAX_DIGITS)
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: too many digits.");
ret_value = ecma_raise_syntax_error ("RegExp quantifier error: too many digits.");
return ret_value;
}
digits++;
@@ -130,14 +130,14 @@ parse_re_iterator (ecma_char_t *pattern_p, /**< RegExp pattern */
{
if (qmax != RE_ITERATOR_INFINITE)
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: double comma.");
ret_value = ecma_raise_syntax_error ("RegExp quantifier error: double comma.");
return ret_value;
}
if ((RE_LOOKUP (pattern_p, lookup + *advance_p + 1)) == '}')
{
if (digits == 0)
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: missing digits.");
ret_value = ecma_raise_syntax_error ("RegExp quantifier error: missing digits.");
return ret_value;
}
@@ -154,7 +154,7 @@ parse_re_iterator (ecma_char_t *pattern_p, /**< RegExp pattern */
{
if (digits == 0)
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: missing digits.");
ret_value = ecma_raise_syntax_error ("RegExp quantifier error: missing digits.");
return ret_value;
}
@@ -174,7 +174,7 @@ parse_re_iterator (ecma_char_t *pattern_p, /**< RegExp pattern */
}
else
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: unknown char.");
ret_value = ecma_raise_syntax_error ("RegExp quantifier error: unknown char.");
return ret_value;
}
}
@@ -206,7 +206,7 @@ parse_re_iterator (ecma_char_t *pattern_p, /**< RegExp pattern */
if (re_token_p->qmin > re_token_p->qmax)
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: qmin > qmax.");
ret_value = ecma_raise_syntax_error ("RegExp quantifier error: qmin > qmax.");
}
return ret_value;
@@ -218,13 +218,13 @@ parse_re_iterator (ecma_char_t *pattern_p, /**< RegExp pattern */
static void
re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
{
ecma_char_t *pattern_p = parser_ctx_p->pattern_start_p;
lit_utf8_byte_t *pattern_p = parser_ctx_p->pattern_start_p;
ecma_char_t ch1;
int char_class_in = 0;
parser_ctx_p->num_of_groups = 0;
ch1 = get_ecma_char (&pattern_p);
while (ch1 != '\0')
while (ch1 != ECMA_CHAR_NULL)
{
ecma_char_t ch0 = ch1;
ch1 = get_ecma_char (&pattern_p);
@@ -275,7 +275,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
re_token_t *out_token_p) /**< output token */
{
ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();
ecma_char_t **pattern_p = &(parser_ctx_p->current_char_p);
lit_utf8_byte_t **pattern_p = &(parser_ctx_p->current_char_p);
out_token_p->qmax = out_token_p->qmin = 1;
ecma_char_t start = RE_CHAR_UNDEF;
@@ -338,7 +338,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
}
else
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "invalid regexp control escape");
ret_value = ecma_raise_syntax_error ("invalid regexp control escape");
return ret_value;
}
}
@@ -433,7 +433,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
{
if (is_range)
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "invalid character class range");
ret_value = ecma_raise_syntax_error ("invalid character class range");
return ret_value;
}
else
@@ -451,7 +451,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
{
if (start > ch)
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "invalid character class range");
ret_value = ecma_raise_syntax_error ("invalid character class range");
return ret_value;
}
else
@@ -500,8 +500,8 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
{
ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();
uint32_t advance = 0;
ecma_char_t ch0 = *(parser_ctx_p->current_char_p);
ecma_char_t ch0 = *(parser_ctx_p->current_char_p);
switch (ch0)
{
case '|':
@@ -580,7 +580,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
}
else
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "invalid regexp control escape");
ret_value = ecma_raise_syntax_error ("invalid regexp control escape");
break;
}
}
@@ -640,7 +640,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
{
if (isdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 2)))
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp escape pattern error.");
ret_value = ecma_raise_syntax_error ("RegExp escape pattern error.");
break;
}
@@ -664,13 +664,13 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
{
if (index >= RE_MAX_RE_DECESC_DIGITS)
{
ret_value = ecma_raise_syntax_error ((const ecma_char_t *)
"RegExp escape pattern error: decimal escape too long.");
ret_value = ecma_raise_syntax_error ("RegExp escape pattern error: decimal escape too long.");
return ret_value;
}
advance++;
ecma_char_t digit = RE_LOOKUP (parser_ctx_p->current_char_p, advance);
ecma_char_t digit = RE_LOOKUP (parser_ctx_p->current_char_p,
advance);
if (!isdigit (digit))
{
break;
+2 -2
View File
@@ -71,8 +71,8 @@ typedef struct
typedef struct
{
ecma_char_t *pattern_start_p;
ecma_char_t *current_char_p;
lit_utf8_byte_t *pattern_start_p;
lit_utf8_byte_t *current_char_p;
int num_of_groups;
uint32_t num_of_classes;
} re_parser_ctx_t;