Files
jerryscript/src/libjsparser/lexer.c
T
2014-07-03 16:23:25 +04:00

797 lines
16 KiB
C

/* Copyright 2014 Samsung Electronics Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include "error.h"
#include "lexer.h"
static token saved_token;
#ifdef DEBUG
FILE *lexer_debug_log;
#endif
/* If TOKEN represents a keyword, return decoded keyword,
if TOKEN represents a Future Reserved Word, return KW_RESERVED,
otherwise return KW_NONE. */
static keyword
decode_keyword (const char *tok)
{
assert (tok);
if (!strcmp ("break", tok))
return KW_BREAK;
else if (!strcmp ("case", tok))
return KW_CASE;
else if (!strcmp ("catch", tok))
return KW_CATCH;
else if (!strcmp ("continue", tok))
return KW_CONTINUE;
else if (!strcmp ("debugger", tok))
return KW_DEBUGGER;
else if (!strcmp ("default", tok))
return KW_DEFAULT;
else if (!strcmp ("delete", tok))
return KW_DELETE;
else if (!strcmp ("do", tok))
return KW_DO;
else if (!strcmp ("else", tok))
return KW_ELSE;
else if (!strcmp ("finally", tok))
return KW_FINALLY;
else if (!strcmp ("for", tok))
return KW_FOR;
else if (!strcmp ("function", tok))
return KW_FUNCTION;
else if (!strcmp ("if", tok))
return KW_IF;
else if (!strcmp ("in", tok))
return KW_IN;
else if (!strcmp ("instanceof", tok))
return KW_INSTANCEOF;
else if (!strcmp ("new", tok))
return KW_NEW;
else if (!strcmp ("return", tok))
return KW_RETURN;
else if (!strcmp ("switch", tok))
return KW_SWITCH;
else if (!strcmp ("this", tok))
return KW_THIS;
else if (!strcmp ("throw", tok))
return KW_THROW;
else if (!strcmp ("try", tok))
return KW_TRY;
else if (!strcmp ("typeof", tok))
return KW_TYPEOF;
else if (!strcmp ("var", tok))
return KW_VAR;
else if (!strcmp ("void", tok))
return KW_VOID;
else if (!strcmp ("while", tok))
return KW_WHILE;
else if (!strcmp ("with", tok))
return KW_WITH;
else if (!strcmp ("class", tok) || !strcmp ("const", tok)
|| !strcmp ("enum", tok) || !strcmp ("export", tok)
|| !strcmp ("extends", tok) || !strcmp ("import", tok)
|| !strcmp ("super", tok) || !strcmp ("implements", tok)
|| !strcmp ("interface", tok) || !strcmp ("let", tok)
|| !strcmp ("package", tok) || !strcmp ("private", tok)
|| !strcmp ("protected", tok) || !strcmp ("public", tok)
|| !strcmp ("static", tok) || !strcmp ("yield", tok))
return KW_RESERVED;
else
return KW_NONE;
}
static FILE *file;
/* Represents the contents of a file. */
static char *buffer = NULL;
static char *buffer_start;
static char *token_start;
#define BUFFER_SIZE 1024
static char
get_char (int i)
{
int error;
const int tail_size = BUFFER_SIZE - (buffer - buffer_start);
assert (file);
if (buffer == NULL)
{
buffer = (char *) malloc (BUFFER_SIZE);
error = fread (buffer, 1, BUFFER_SIZE, file);
if (error < 0)
fatal (ERR_IO);
if (error == 0)
return '\0';
if (error < BUFFER_SIZE)
memset (buffer + error, '\0', BUFFER_SIZE - error);
buffer_start = buffer;
}
if (tail_size <= i)
{
/* We are almost at the end of the buffer. */
if (token_start)
{
const int token_size = buffer - token_start;
/* Whole buffer contains single token. */
if (token_start == buffer_start)
fatal (ERR_BUFFER_SIZE);
/* Move parsed token and tail of buffer to head. */
memmove (buffer_start, token_start, tail_size + token_size);
/* Adjust pointers. */
token_start = buffer_start;
buffer = buffer_start + token_size;
/* Read more characters form input file. */
error = fread (buffer + tail_size, 1, BUFFER_SIZE - tail_size - token_size, file);
if (error < 0)
fatal (ERR_IO);
if (error == 0)
return '\0';
if (error < BUFFER_SIZE - tail_size - token_size)
memset (buffer + tail_size + error, '\0',
BUFFER_SIZE - tail_size - token_size - error);
}
else
{
memmove (buffer_start, buffer, tail_size);
buffer = buffer_start;
error = fread (buffer + tail_size, 1, BUFFER_SIZE - tail_size, file);
if (error < 0)
fatal (ERR_IO);
if (error == 0)
return '\0';
if (error < BUFFER_SIZE - tail_size)
memset (buffer + tail_size + error, '\0', BUFFER_SIZE - tail_size - error);
}
}
return *(buffer + i);
}
#define LA(I) (get_char (I))
static inline void
new_token ()
{
assert (buffer);
token_start = buffer;
}
static inline void
consume_char ()
{
assert (buffer);
buffer++;
}
static inline const char *
current_token ()
{
assert (buffer);
assert (token_start);
int length = buffer - token_start;
char *res = (char *) malloc (length + 1);
strncpy (res, token_start, length);
res[length] = '\0';
token_start = NULL;
return res;
}
#define RETURN_PUNC_EX(TOK, NUM) \
do \
{ \
buffer += NUM; \
return (token) { .type = TOK, .data.none = NULL }; \
} \
while (0)
#define RETURN_PUNC(TOK) RETURN_PUNC_EX(TOK, 1)
#define IF_LA_N_IS(CHAR, THEN_TOK, ELSE_TOK, NUM) \
do \
{ \
if (LA (NUM) == CHAR) \
RETURN_PUNC_EX (THEN_TOK, NUM + 1); \
else \
RETURN_PUNC_EX (ELSE_TOK, NUM); \
} \
while (0)
#define IF_LA_IS(CHAR, THEN_TOK, ELSE_TOK) \
IF_LA_N_IS (CHAR, THEN_TOK, ELSE_TOK, 1)
#define IF_LA_IS_OR(CHAR1, THEN1_TOK, CHAR2, THEN2_TOK, ELSE_TOK) \
do \
{ \
if (LA (1) == CHAR1) \
RETURN_PUNC_EX (THEN1_TOK, 2); \
else if (LA (1) == CHAR2) \
RETURN_PUNC_EX (THEN2_TOK, 2); \
else \
RETURN_PUNC (ELSE_TOK); \
} \
while (0)
static token
parse_name ()
{
char c = LA (0);
bool every_char_islower = isalpha (c) && islower (c);
const char *tok = NULL;
assert (isalpha (c) || c == '$' || c == '_');
new_token ();
consume_char ();
while (true)
{
c = LA (0);
if (c == '\0')
c = c;
if (!isalpha (c) && !isdigit (c) && c != '$' && c != '_')
break;
if (every_char_islower && (!isalpha (c) || !islower (c)))
every_char_islower = false;
consume_char ();
}
tok = current_token ();
if (every_char_islower)
{
keyword kw = decode_keyword (tok);
if (kw != KW_NONE)
{
free ((char *) tok);
return (token)
{
.type = TOK_KEYWORD, .data.kw = kw
};
}
if (!strcmp ("null", tok))
{
free ((char *) tok);
return (token)
{
.type = TOK_NULL, .data.none = NULL
};
}
if (!strcmp ("true", tok))
{
free ((char *) tok);
return (token)
{
.type = TOK_BOOL, .data.is_true = true
};
}
if (!strcmp ("false", tok))
{
free ((char *) tok);
return (token)
{
.type = TOK_BOOL, .data.is_true = false
};
}
}
return (token)
{
.type = TOK_NAME, .data.name = tok
};
}
static bool
is_hex_digit (char c)
{
return isdigit (c) || c == 'a' || c == 'A' || c == 'b' || c == 'B'
|| c == 'c' || c == 'C' || c == 'd' || c == 'D'
|| c == 'e' || c == 'E' || c == 'f' || c == 'F';
}
static int
hex_to_int (char hex)
{
switch (hex)
{
case '0': return 0x0;
case '1': return 0x1;
case '2': return 0x2;
case '3': return 0x3;
case '4': return 0x4;
case '5': return 0x5;
case '6': return 0x6;
case '7': return 0x7;
case '8': return 0x8;
case '9': return 0x9;
case 'a':
case 'A': return 0xA;
case 'b':
case 'B': return 0xB;
case 'c':
case 'C': return 0xC;
case 'd':
case 'D': return 0xD;
case 'e':
case 'E': return 0xE;
case 'f':
case 'F': return 0xF;
default: unreachable ();
}
}
/* In this function we cannot use strtol function
since there is no octal literals in ECMAscript. */
static token
parse_number ()
{
char c = LA (0);
bool is_hex = false;
bool is_fp = false;
bool is_exp = false;
const char *tok = NULL;
int tok_length = 0;
int res = 0;
assert (isdigit (c) || c == '.');
if (c == '0')
if (LA (1) == 'x' || LA (1) == 'X')
is_hex = true;
if (c == '.')
{
assert (!isalpha (LA (1)));
is_fp = true;
}
if (is_hex)
{
// Eat up '0x'
consume_char ();
consume_char ();
new_token ();
while (true)
{
c = LA (0);
if (!is_hex_digit (c))
break;
consume_char ();
}
if (isalpha (c) || c == '_' || c == '$')
fatal (ERR_INT_LITERAL);
tok_length = buffer - token_start;
tok = current_token ();
// OK, I know that integer overflow can occur here
for (int i = 0; i < tok_length; i++)
res = (res << 4) + hex_to_int (tok[i]);
free ((char *) tok);
return (token)
{
.type = TOK_INT, .data.num = res
};
}
assert (!is_hex && !is_exp);
new_token ();
// Eat up '.'
if (is_fp)
consume_char ();
while (true)
{
c = LA (0);
if (is_fp && c == '.')
fatal (ERR_INT_LITERAL);
if (is_exp && (c == 'e' || c == 'E'))
fatal (ERR_INT_LITERAL);
if (c == '.')
{
if (isalpha (LA (1)) || LA (1) == '_' || LA (1) == '$')
fatal (ERR_INT_LITERAL);
is_fp = true;
consume_char ();
continue;
}
if (c == 'e' || c == 'E')
{
if (LA (1) == '-' || LA (1) == '+')
consume_char ();
if (!isdigit (LA (1)))
fatal (ERR_INT_LITERAL);
is_exp = true;
consume_char ();
continue;
}
if (isalpha (c) || c == '_' || c == '$')
fatal (ERR_INT_LITERAL);
if (!isdigit (c))
break;
consume_char ();
}
if (is_fp || is_exp)
{
tok = current_token ();
float res = strtof (tok, NULL);
free ((char *) tok);
return (token)
{
.type = TOK_FLOAT, .data.fp_num = res
};
}
tok_length = buffer - token_start;
tok = current_token ();
for (int i = 0; i < tok_length; i++)
res = res * 10 + hex_to_int (tok[i]);
free ((char *) tok);
return (token)
{
.type = TOK_INT, .data.num = res
};
}
static char
escape_char (char c)
{
switch (c)
{
case 'b': return '\b';
case 'f': return '\f';
case 'n': return '\n';
case 'r': return '\r';
case 't': return '\t';
case 'v': return '\v';
case '\'':
case '"':
case '\\':
default: return c;
}
}
static token
parse_string ()
{
char c = LA (0);
bool is_double_quoted;
char *tok = NULL;
char *index = NULL;
int length;
assert (c == '\'' || c == '"');
is_double_quoted = (c == '"');
// Eat up '"'
consume_char ();
new_token ();
while (true)
{
c = LA (0);
if (c == '\0')
fatal (ERR_UNCLOSED);
if (c == '\n')
fatal (ERR_STRING);
if (c == '\\')
{
/* Only single escape character is allowed. */
if (LA (1) == 'x' || LA (1) == 'u' || isdigit (LA (1)))
fatal (ERR_STRING);
if ((LA (1) == '\'' && !is_double_quoted)
|| (LA (1) == '"' && is_double_quoted)
|| LA (1) == '\n')
{
consume_char ();
consume_char ();
continue;
}
}
else if ((c == '\'' && !is_double_quoted)
|| (c == '"' && is_double_quoted))
break;
consume_char ();
}
length = buffer - token_start;
tok = (char *) malloc (length);
index = tok;
for (char *i = token_start; i < buffer; i++)
{
if (*i == '\\')
{
if (*(i + 1) == '\n')
{
i++;
continue;
}
*index = escape_char (*(i + 1));
index++;
i++;
continue;
}
*index = *i;
index++;
}
memset (index, '\0', length - (index - tok));
token_start = NULL;
// Eat up '"'
consume_char ();
return (token)
{
.type = TOK_STRING, .data.str = tok
};
}
static void
grobble_whitespaces ()
{
char c = LA (0);
while ((isspace (c) && c != '\n') || c == '\0')
{
consume_char ();
c = LA (0);
}
}
void
lexer_set_file (FILE *ex_file)
{
assert (ex_file);
file = ex_file;
#ifdef DEBUG
lexer_debug_log = fopen ("lexer.log", "w");
#endif
}
static bool
replace_comment_by_newline ()
{
char c = LA (0);
bool multiline;
bool was_newlines = false;
assert (LA (0) == '/');
assert (LA (1) == '/' || LA (1) == '*');
multiline = (LA (1) == '*');
consume_char ();
consume_char ();
while (true)
{
c = LA (0);
if (!multiline && (c == '\n' || c == '\0'))
return false;
if (multiline && c == '*' && LA (1) == '/')
{
consume_char ();
consume_char ();
if (was_newlines)
return true;
else
return false;
}
if (multiline && c == '\n')
was_newlines = true;
if (multiline && c == '\0')
fatal (ERR_UNCLOSED);
consume_char ();
}
}
token
#ifdef DEBUG
lexer_next_token_private ()
#else
lexer_next_token ()
#endif
{
char c = LA (0);
if (saved_token.type != TOK_EOF)
{
token res = saved_token;
saved_token.type = TOK_EOF;
return res;
}
assert (token_start == NULL);
if (isalpha (c) || c == '$' || c == '_')
return parse_name ();
if (isdigit (c) || (c == '.' && isdigit (LA (1))))
return parse_number ();
if (c == '\n')
{
consume_char ();
return (token)
{
.type = TOK_NEWLINE, .data.none = NULL
};
}
if (c == '\0')
return (token)
{
.type = TOK_EOF, .data.none = NULL
};
if (c == '\'' || c == '"')
return parse_string ();
if (isspace (c))
{
grobble_whitespaces ();
return lexer_next_token ();
}
if (c == '/' && LA (1) == '*')
{
if (replace_comment_by_newline ())
return (token)
{
.type = TOK_NEWLINE, .data.none = NULL
};
else
return lexer_next_token ();
}
if (c == '/' && LA (1) == '/')
{
replace_comment_by_newline ();
return lexer_next_token ();
}
switch (c)
{
case '{': RETURN_PUNC (TOK_OPEN_BRACE);
case '}': RETURN_PUNC (TOK_CLOSE_BRACE);
case '(': RETURN_PUNC (TOK_OPEN_PAREN);
case ')': RETURN_PUNC (TOK_CLOSE_PAREN);
case '[': RETURN_PUNC (TOK_OPEN_SQUARE);
case ']': RETURN_PUNC (TOK_CLOSE_SQUARE);
case '.': RETURN_PUNC (TOK_DOT);
case ';': RETURN_PUNC (TOK_SEMICOLON);
case ',': RETURN_PUNC (TOK_COMMA);
case '~': RETURN_PUNC (TOK_COMPL);
case ':': RETURN_PUNC (TOK_COLON);
case '?': RETURN_PUNC (TOK_QUERY);
case '*': IF_LA_IS ('=', TOK_MULT_EQ, TOK_MULT);
case '/': IF_LA_IS ('=', TOK_DIV_EQ, TOK_DIV);
case '^': IF_LA_IS ('=', TOK_XOR_EQ, TOK_XOR);
case '%': IF_LA_IS ('=', TOK_MOD_EQ, TOK_MOD);
case '+': IF_LA_IS_OR ('+', TOK_DOUBLE_PLUS, '=', TOK_PLUS_EQ, TOK_PLUS);
case '-': IF_LA_IS_OR ('-', TOK_DOUBLE_MINUS, '=', TOK_MINUS_EQ, TOK_MINUS);
case '&': IF_LA_IS_OR ('&', TOK_DOUBLE_AND, '=', TOK_AND_EQ, TOK_AND);
case '|': IF_LA_IS_OR ('|', TOK_DOUBLE_OR, '=', TOK_OR_EQ, TOK_OR);
case '<':
switch (LA (1))
{
case '<': IF_LA_N_IS ('=', TOK_LSHIFT_EQ, TOK_LSHIFT, 2);
case '=': RETURN_PUNC_EX (TOK_LESS_EQ, 2);
default: RETURN_PUNC (TOK_LESS);
}
case '>':
switch (LA (1))
{
case '>':
switch (LA (2))
{
case '>': IF_LA_N_IS ('=', TOK_RSHIFT_EX_EQ, TOK_RSHIFT_EX, 3);
case '=': RETURN_PUNC_EX (TOK_RSHIFT_EQ, 3);
default: RETURN_PUNC_EX (TOK_RSHIFT, 2);
}
case '=': RETURN_PUNC_EX (TOK_GREATER_EQ, 2);
default: RETURN_PUNC (TOK_GREATER);
}
case '=':
if (LA (1) == '=')
IF_LA_N_IS ('=', TOK_TRIPLE_EQ, TOK_DOUBLE_EQ, 2);
else
RETURN_PUNC (TOK_EQ);
case '!':
if (LA (1) == '=')
IF_LA_N_IS ('=', TOK_NOT_DOUBLE_EQ, TOK_NOT_EQ, 2);
else
RETURN_PUNC (TOK_NOT);
default:
unreachable ();
}
fatal (ERR_NON_CHAR);
}
#ifdef DEBUG
static int i = 0;
token
lexer_next_token ()
{
token tok = lexer_next_token_private ();
if (tok.type == TOK_NEWLINE)
return tok;
if (tok.type == TOK_CLOSE_BRACE)
{
if (i == 300)
fprintf (lexer_debug_log, "lexer_next_token(%d): type=0x%x, data=%p\n", i, tok.type, tok.data.none);
i++;
}
return tok;
}
#endif
void
lexer_save_token (token tok)
{
#ifdef DEBUG
if (tok.type == TOK_CLOSE_BRACE)
fprintf (lexer_debug_log, "lexer_save_token(%d): type=0x%x, data=%p\n", i, tok.type, tok.data.none);
#endif
saved_token = tok;
}
void
lexer_dump_buffer_state ()
{
printf ("%s\n", buffer);
}