Switch lexer to utf8 iterators.

JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
This commit is contained in:
Andrey Shitov
2015-07-03 22:03:03 +03:00
committed by Ruben Ayrapetyan
parent d248d0944c
commit 32318137c3
+101 -72
View File
@@ -30,9 +30,10 @@ static size_t buffer_size = 0;
/* Represents the contents of a script. */ /* Represents the contents of a script. */
static const jerry_api_char_t *buffer_start = NULL; static const jerry_api_char_t *buffer_start = NULL;
static const jerry_api_char_t *buffer = NULL;
static const jerry_api_char_t *token_start; static const jerry_api_char_t *token_start;
static lit_utf8_iterator_t src_iter;
#define LA(I) (get_char (I)) #define LA(I) (get_char (I))
static bool static bool
@@ -46,7 +47,7 @@ current_locus (void)
{ {
if (token_start == NULL) if (token_start == NULL)
{ {
return (locus) (buffer - buffer_start); return lit_utf8_iterator_get_offset (&src_iter);
} }
else else
{ {
@@ -57,18 +58,26 @@ current_locus (void)
static ecma_char_t static ecma_char_t
get_char (size_t i) get_char (size_t i)
{ {
if ((buffer + i) >= (buffer_start + buffer_size)) lit_utf8_iterator_t iter = src_iter;
ecma_char_t code_unit;
do
{ {
return '\0'; if (lit_utf8_iterator_is_eos (&iter))
{
code_unit = LIT_CHAR_NULL;
break;
}
code_unit = lit_utf8_iterator_read_next (&iter);
} }
return *(buffer + i); while (i--);
return code_unit;
} }
static void static void
dump_current_line (void) dump_current_line (void)
{ {
const lit_utf8_byte_t *i;
if (!allow_dump_lines) if (!allow_dump_lines)
{ {
return; return;
@@ -76,12 +85,20 @@ dump_current_line (void)
printf ("// "); printf ("// ");
FIXME ("Unicode: properly process non-ascii characters."); lit_utf8_iterator_t iter = src_iter;
for (i = buffer; *i != '\n' && *i != 0; i++)
while (!lit_utf8_iterator_is_eos (&iter))
{ {
putchar (*i); ecma_char_t code_unit = lit_utf8_iterator_read_next (&iter);
if (code_unit == '\n')
{
break;
}
lit_put_ecma_char (code_unit);
} }
putchar ('\n');
lit_put_ecma_char ('\n');
} }
static token static token
@@ -284,22 +301,21 @@ convert_seen_num_to_token (ecma_number_t num)
static void static void
new_token (void) new_token (void)
{ {
JERRY_ASSERT (buffer); JERRY_ASSERT (lit_utf8_iterator_get_ptr (&src_iter));
token_start = buffer; token_start = lit_utf8_iterator_get_ptr (&src_iter);
} }
static void static void
consume_char (void) consume_char (void)
{ {
JERRY_ASSERT (buffer); lit_utf8_iterator_incr (&src_iter);
buffer++;
} }
#define RETURN_PUNC_EX(TOK, NUM) \ #define RETURN_PUNC_EX(TOK, NUM) \
do \ do \
{ \ { \
token tok = create_token (TOK, 0); \ token tok = create_token (TOK, 0); \
buffer += NUM; \ lit_utf8_iterator_advance (&src_iter, NUM); \
return tok; \ return tok; \
} \ } \
while (0) while (0)
@@ -444,37 +460,29 @@ convert_string_to_token_transform_escape_seq (token_type tok_type, /**< type of
} }
lit_utf8_byte_t *str_buf_p = (lit_utf8_byte_t*) jsp_mm_alloc (source_str_size); lit_utf8_byte_t *str_buf_p = (lit_utf8_byte_t*) jsp_mm_alloc (source_str_size);
const lit_utf8_byte_t *source_str_iter_p = source_str_p;
lit_utf8_byte_t *str_buf_iter_p = str_buf_p; lit_utf8_byte_t *str_buf_iter_p = str_buf_p;
/* const lit_utf8_byte_t *source_str_iter_p = source_str_p; */
lit_utf8_iterator_t source_str_iter = lit_utf8_iterator_create (source_str_p, (lit_utf8_size_t) source_str_size);
bool is_correct_sequence = true; bool is_correct_sequence = true;
bool every_char_islower = true; bool every_char_islower = true;
bool every_char_allowed_in_identifier = true; bool every_char_allowed_in_identifier = true;
while (source_str_iter_p < source_str_p + source_str_size) ecma_char_t prev_converted_char = LIT_CHAR_NULL;
while (!lit_utf8_iterator_is_eos (&source_str_iter))
{ {
ecma_char_t converted_char; ecma_char_t converted_char = lit_utf8_iterator_read_next (&source_str_iter);
if (*source_str_iter_p != '\\') if (converted_char == '\\')
{ {
converted_char = (lit_utf8_byte_t) *source_str_iter_p++; const ecma_char_t escape_character = lit_utf8_iterator_read_next (&source_str_iter);
JERRY_ASSERT (str_buf_iter_p <= str_buf_p + source_str_size);
JERRY_ASSERT (source_str_iter_p <= source_str_p + source_str_size);
}
else
{
source_str_iter_p++;
const lit_utf8_byte_t escape_character = (lit_utf8_byte_t) *source_str_iter_p++;
JERRY_ASSERT (source_str_iter_p <= source_str_p + source_str_size);
if (isdigit (escape_character)) if (isdigit (escape_character))
{ {
if (escape_character == '0') if (escape_character == '0')
{ {
JERRY_UNIMPLEMENTED ("<NUL> character is not currently supported.\n"); converted_char = LIT_CHAR_NULL;
} }
else else
{ {
@@ -488,7 +496,7 @@ convert_string_to_token_transform_escape_seq (token_type tok_type, /**< type of
{ {
const uint32_t hex_chars_num = (escape_character == 'u' ? 4u : 2u); const uint32_t hex_chars_num = (escape_character == 'u' ? 4u : 2u);
if (source_str_iter_p + hex_chars_num > source_str_p + source_str_size) if (lit_utf8_iterator_get_offset (&source_str_iter) + hex_chars_num > source_str_size)
{ {
is_correct_sequence = false; is_correct_sequence = false;
break; break;
@@ -499,9 +507,9 @@ convert_string_to_token_transform_escape_seq (token_type tok_type, /**< type of
for (uint32_t i = 0; i < hex_chars_num; i++) for (uint32_t i = 0; i < hex_chars_num; i++)
{ {
const lit_utf8_byte_t byte = (lit_utf8_byte_t) *source_str_iter_p++; const ecma_char_t next_char = lit_utf8_iterator_read_next (&source_str_iter);
if (!isxdigit (byte)) if (!isxdigit (next_char))
{ {
chars_are_hex = false; chars_are_hex = false;
break; break;
@@ -514,12 +522,12 @@ convert_string_to_token_transform_escape_seq (token_type tok_type, /**< type of
JERRY_ASSERT ((char_code & 0xF000u) == 0); JERRY_ASSERT ((char_code & 0xF000u) == 0);
char_code = (uint16_t) (char_code << 4u); char_code = (uint16_t) (char_code << 4u);
char_code = (uint16_t) (char_code + lit_char_hex_to_int (byte)); char_code = (uint16_t) (char_code + lit_char_hex_to_int (next_char));
} }
} }
JERRY_ASSERT (str_buf_iter_p <= str_buf_p + source_str_size); JERRY_ASSERT (str_buf_iter_p <= str_buf_p + source_str_size);
JERRY_ASSERT (source_str_iter_p <= source_str_p + source_str_size); JERRY_ASSERT (lit_utf8_iterator_get_offset (&source_str_iter) <= source_str_size);
if (!chars_are_hex) if (!chars_are_hex)
{ {
@@ -536,14 +544,14 @@ convert_string_to_token_transform_escape_seq (token_type tok_type, /**< type of
} }
else if (lit_char_is_line_terminator (escape_character)) else if (lit_char_is_line_terminator (escape_character))
{ {
if (source_str_iter_p + 1 <= source_str_p + source_str_size) if (str_buf_iter_p + 1 <= source_str_p + source_str_size)
{ {
lit_utf8_byte_t byte = *source_str_iter_p; ecma_char_t next_char = lit_utf8_iterator_peek_next (&source_str_iter);
if (escape_character == '\x0D' if (escape_character == '\x0D'
&& byte == '\x0A') && next_char == '\x0A')
{ {
source_str_iter_p++; lit_utf8_iterator_incr (&source_str_iter);
} }
} }
@@ -555,9 +563,20 @@ convert_string_to_token_transform_escape_seq (token_type tok_type, /**< type of
} }
} }
TODO ("Support surrogate paris.") if (lit_is_code_unit_high_surrogate (prev_converted_char)
str_buf_iter_p += lit_code_unit_to_utf8 (converted_char, str_buf_iter_p); && lit_is_code_unit_low_surrogate (converted_char))
JERRY_ASSERT (str_buf_iter_p <= str_buf_p + source_str_size); {
str_buf_iter_p -= LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (prev_converted_char, converted_char);
str_buf_iter_p += lit_code_point_to_utf8 (code_point, str_buf_iter_p);
}
else
{
str_buf_iter_p += lit_code_unit_to_utf8 (converted_char, str_buf_iter_p);
JERRY_ASSERT (str_buf_iter_p <= str_buf_p + source_str_size);
}
prev_converted_char = converted_char;
if (!islower (converted_char)) if (!islower (converted_char))
{ {
@@ -615,7 +634,7 @@ parse_name (void)
token known_token = empty_token; token known_token = empty_token;
JERRY_ASSERT (isalpha (c) || c == '$' || c == '_'); JERRY_ASSERT (isalpha (c) || c == '$' || c == '_' || c == '\\');
new_token (); new_token ();
@@ -664,9 +683,10 @@ parse_name (void)
} }
} }
const lit_utf8_size_t seq_size = (lit_utf8_size_t) (lit_utf8_iterator_get_ptr (&src_iter) - token_start);
known_token = convert_string_to_token_transform_escape_seq (TOK_NAME, known_token = convert_string_to_token_transform_escape_seq (TOK_NAME,
token_start, token_start,
(size_t) (buffer - token_start)); seq_size);
token_start = NULL; token_start = NULL;
@@ -722,10 +742,11 @@ parse_number (void)
if (isalpha (c) || c == '_' || c == '$') if (isalpha (c) || c == '_' || c == '$')
{ {
PARSE_ERROR ("Integer literal shall not contain non-digit characters", buffer - buffer_start); PARSE_ERROR ("Integer literal shall not contain non-digit characters",
lit_utf8_iterator_get_offset (&src_iter));
} }
tok_length = (size_t) (buffer - token_start); tok_length = (size_t) (lit_utf8_iterator_get_ptr (&src_iter) - token_start);
for (i = 0; i < tok_length; i++) for (i = 0; i < tok_length; i++)
{ {
@@ -776,12 +797,13 @@ parse_number (void)
if (is_fp && c == '.') if (is_fp && c == '.')
{ {
FIXME (/* This is wrong: 1..toString (). */) FIXME (/* This is wrong: 1..toString (). */)
PARSE_ERROR ("Integer literal shall not contain more than one dot character", buffer - buffer_start); PARSE_ERROR ("Integer literal shall not contain more than one dot character",
lit_utf8_iterator_get_offset (&src_iter));
} }
if (is_exp && (c == 'e' || c == 'E')) if (is_exp && (c == 'e' || c == 'E'))
{ {
PARSE_ERROR ("Integer literal shall not contain more than exponential marker ('e' or 'E')", PARSE_ERROR ("Integer literal shall not contain more than exponential marker ('e' or 'E')",
buffer - buffer_start); lit_utf8_iterator_get_offset (&src_iter));
} }
if (c == '.') if (c == '.')
@@ -789,7 +811,7 @@ parse_number (void)
if (isalpha (LA (1)) || LA (1) == '_' || LA (1) == '$') if (isalpha (LA (1)) || LA (1) == '_' || LA (1) == '$')
{ {
PARSE_ERROR ("Integer literal shall not contain non-digit character after got character", PARSE_ERROR ("Integer literal shall not contain non-digit character after got character",
buffer - buffer_start); lit_utf8_iterator_get_offset (&src_iter));
} }
is_fp = true; is_fp = true;
consume_char (); consume_char ();
@@ -805,7 +827,7 @@ parse_number (void)
if (!isdigit (LA (1))) if (!isdigit (LA (1)))
{ {
PARSE_ERROR ("Integer literal shall not contain non-digit character after exponential marker ('e' or 'E')", PARSE_ERROR ("Integer literal shall not contain non-digit character after exponential marker ('e' or 'E')",
buffer - buffer_start); lit_utf8_iterator_get_offset (&src_iter));
} }
is_exp = true; is_exp = true;
consume_char (); consume_char ();
@@ -814,7 +836,8 @@ parse_number (void)
if (isalpha (c) || c == '_' || c == '$') if (isalpha (c) || c == '_' || c == '$')
{ {
PARSE_ERROR ("Integer literal shall not contain non-digit characters", buffer - buffer_start); PARSE_ERROR ("Integer literal shall not contain non-digit characters",
lit_utf8_iterator_get_offset (&src_iter));
} }
if (!isdigit (c)) if (!isdigit (c))
@@ -825,7 +848,7 @@ parse_number (void)
consume_char (); consume_char ();
} }
tok_length = (size_t) (buffer - token_start); tok_length = (size_t) (lit_utf8_iterator_get_ptr (&src_iter) - token_start);
if (is_fp || is_exp) if (is_fp || is_exp)
{ {
ecma_number_t res = ecma_utf8_string_to_number (token_start, (jerry_api_size_t) tok_length); ecma_number_t res = ecma_utf8_string_to_number (token_start, (jerry_api_size_t) tok_length);
@@ -948,9 +971,12 @@ parse_string (void)
} }
while (c != end_char); while (c != end_char);
const lit_utf8_size_t esc_seq_size = (lit_utf8_size_t) (lit_utf8_iterator_get_ptr (&src_iter) -
token_start) - 1;
token ret = convert_string_to_token_transform_escape_seq (TOK_STRING, token ret = convert_string_to_token_transform_escape_seq (TOK_STRING,
token_start, token_start,
(size_t) (buffer - token_start) - 1u); esc_seq_size);
token_start = NULL; token_start = NULL;
@@ -1021,7 +1047,8 @@ parse_regexp (void)
result = convert_string_to_token (TOK_REGEXP, result = convert_string_to_token (TOK_REGEXP,
(const lit_utf8_byte_t *) token_start, (const lit_utf8_byte_t *) token_start,
static_cast<ecma_length_t> (buffer - token_start)); (lit_utf8_size_t) (lit_utf8_iterator_get_ptr (&src_iter) -
token_start));
token_start = NULL; token_start = NULL;
return result; return result;
@@ -1039,13 +1066,6 @@ grobble_whitespaces (void)
} }
} }
static void
lexer_set_source (const jerry_api_char_t * source)
{
buffer_start = source;
buffer = buffer_start;
}
static bool static bool
replace_comment_by_newline (void) replace_comment_by_newline (void)
{ {
@@ -1088,7 +1108,7 @@ replace_comment_by_newline (void)
} }
if (multiline && c == '\0') if (multiline && c == '\0')
{ {
PARSE_ERROR ("Unclosed multiline comment", buffer - buffer_start); PARSE_ERROR ("Unclosed multiline comment", lit_utf8_iterator_get_offset (&src_iter));
} }
consume_char (); consume_char ();
} }
@@ -1101,7 +1121,7 @@ lexer_next_token_private (void)
JERRY_ASSERT (token_start == NULL); JERRY_ASSERT (token_start == NULL);
if (isalpha (c) || c == '$' || c == '_') if (isalpha (c) || c == '$' || c == '_' || c == '\\')
{ {
return parse_name (); return parse_name ();
} }
@@ -1251,15 +1271,15 @@ lexer_next_token_private (void)
} }
break; break;
} }
default: PARSE_SORRY ("Unknown character", buffer - buffer_start); default: PARSE_SORRY ("Unknown character", lit_utf8_iterator_get_offset (&src_iter));
} }
PARSE_SORRY ("Unknown character", buffer - buffer_start); PARSE_SORRY ("Unknown character", lit_utf8_iterator_get_offset (&src_iter));
} }
token token
lexer_next_token (void) lexer_next_token (void)
{ {
if (buffer == buffer_start) if (lit_utf8_iterator_get_offset (&src_iter) == 0)
{ {
dump_current_line (); dump_current_line ();
} }
@@ -1281,7 +1301,7 @@ lexer_next_token (void)
if (prev_token.type == TOK_EOF if (prev_token.type == TOK_EOF
&& sent_token.type == TOK_EOF) && sent_token.type == TOK_EOF)
{ {
PARSE_ERROR ("Unexpected EOF", buffer - buffer_start); PARSE_ERROR ("Unexpected EOF", lit_utf8_iterator_get_offset (&src_iter));
} }
prev_token = sent_token; prev_token = sent_token;
@@ -1315,7 +1335,7 @@ lexer_seek (size_t locus)
JERRY_ASSERT (locus < buffer_size); JERRY_ASSERT (locus < buffer_size);
JERRY_ASSERT (token_start == NULL); JERRY_ASSERT (token_start == NULL);
buffer = buffer_start + locus; lit_utf8_iterator_set_offset (&src_iter, (lit_utf8_size_t) locus);
saved_token = empty_token; saved_token = empty_token;
} }
@@ -1543,8 +1563,17 @@ lexer_init (const jerry_api_char_t *source, /**< script source */
saved_token = prev_token = sent_token = empty_token; saved_token = prev_token = sent_token = empty_token;
if (!lit_is_utf8_string_valid (source, (lit_utf8_size_t) source_size))
{
PARSE_ERROR ("Invalid source encoding", 0);
}
src_iter = lit_utf8_iterator_create (source, (lit_utf8_size_t) source_size);
buffer_size = source_size; buffer_size = source_size;
lexer_set_source (source); buffer_start = source;
token_start = NULL;
lexer_set_strict_mode (false); lexer_set_strict_mode (false);
#ifndef JERRY_NDEBUG #ifndef JERRY_NDEBUG