/* Copyright 2014 Samsung Electronics Co., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "mem-allocator.h"
#include "globals.h"
#include "jerry-libc.h"
#include "lexer.h"
#include "parser.h"

/* Token pushed back by lexer_save_token, returned by the next
   lexer_next_token call before any new scanning is done. */
static token saved_token;

static token empty_token =
{
  .type = TOK_EMPTY,
  .data.uid = 0
};

typedef struct
{
  const char *str;
  token tok;
} string_and_token;

/* ECMAScript keywords and reserved words, plus the literal words
   "true"/"false"/"null" which decode to dedicated token types.
   Future Reserved Words map to KW_RESERVED. */
static string_and_token keyword_tokens[] =
{
  { .str = "break", .tok = { .type = TOK_KEYWORD, .data.kw = KW_BREAK } },
  { .str = "case", .tok = { .type = TOK_KEYWORD, .data.kw = KW_CASE } },
  { .str = "catch", .tok = { .type = TOK_KEYWORD, .data.kw = KW_CATCH } },
  { .str = "class", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "const", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "continue", .tok = { .type = TOK_KEYWORD, .data.kw = KW_CONTINUE } },
  { .str = "debugger", .tok = { .type = TOK_KEYWORD, .data.kw = KW_DEBUGGER } },
  { .str = "default", .tok = { .type = TOK_KEYWORD, .data.kw = KW_DEFAULT } },
  { .str = "delete", .tok = { .type = TOK_KEYWORD, .data.kw = KW_DELETE } },
  { .str = "do", .tok = { .type = TOK_KEYWORD, .data.kw = KW_DO } },
  { .str = "else", .tok = { .type = TOK_KEYWORD, .data.kw = KW_ELSE } },
  { .str = "enum", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "export", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "extends", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "false", .tok = { .type = TOK_BOOL, .data.uid = false } },
  { .str = "finally", .tok = { .type = TOK_KEYWORD, .data.kw = KW_FINALLY } },
  { .str = "for", .tok = { .type = TOK_KEYWORD, .data.kw = KW_FOR } },
  { .str = "function", .tok = { .type = TOK_KEYWORD, .data.kw = KW_FUNCTION } },
  { .str = "if", .tok = { .type = TOK_KEYWORD, .data.kw = KW_IF } },
  { .str = "instanceof", .tok = { .type = TOK_KEYWORD, .data.kw = KW_INSTANCEOF } },
  { .str = "interface", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "in", .tok = { .type = TOK_KEYWORD, .data.kw = KW_IN } },
  { .str = "import", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "implements", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "let", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "new", .tok = { .type = TOK_KEYWORD, .data.kw = KW_NEW } },
  { .str = "null", .tok = { .type = TOK_NULL, .data.uid = 0 } },
  { .str = "package", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "private", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "protected", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "public", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "return", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RETURN } },
  { .str = "static", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "super", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } },
  { .str = "switch", .tok = { .type = TOK_KEYWORD, .data.kw = KW_SWITCH } },
  { .str = "this", .tok = { .type = TOK_KEYWORD, .data.kw = KW_THIS } },
  { .str = "throw", .tok = { .type = TOK_KEYWORD, .data.kw = KW_THROW } },
  { .str = "true", .tok = { .type = TOK_BOOL, .data.uid = true } },
  { .str = "try", .tok = { .type = TOK_KEYWORD, .data.kw = KW_TRY } },
  { .str = "typeof", .tok = { .type = TOK_KEYWORD, .data.kw = KW_TYPEOF } },
  { .str = "var", .tok = { .type = TOK_KEYWORD, .data.kw = KW_VAR } },
  { .str = "void", .tok = { .type = TOK_KEYWORD, .data.kw = KW_VOID } },
  { .str = "while", .tok = { .type = TOK_KEYWORD, .data.kw = KW_WHILE } },
  { .str = "with", .tok = { .type = TOK_KEYWORD, .data.kw = KW_WITH } },
  { .str = "yield", .tok = { .type = TOK_KEYWORD, .data.kw = KW_RESERVED } }
};

typedef struct
{
  int num;
  token tok;
} num_and_token;

#define MAX_NAMES 100
#define MAX_NUMS 25

/* Interning tables: every distinct identifier/string and every distinct
   integer literal gets a stable uid (its index in the table). */
static string_and_token seen_names[MAX_NAMES];
static uint8_t seen_names_count = 0;

/* FIX: array was declared with MAX_NAMES (100) while the overflow assert
   in add_num_to_seen_tokens checks against MAX_NUMS (25); size it to
   match the checked bound. 0 and 1 are pre-interned. */
static num_and_token seen_nums[MAX_NUMS] =
{
  [0] = { .num = 0, .tok = { .type = TOK_INT, .data.uid = 0 } },
  [1] = { .num = 1, .tok = { .type = TOK_INT, .data.uid = 1 } }
};
static uint8_t seen_nums_count = 2;

static bool
is_empty (token tok)
{
  return tok.type == TOK_EMPTY;
}

#ifdef __HOST
FILE *lexer_debug_log;
#endif

#ifdef __HOST
static FILE *file;
static char *buffer_start;

/* Represents the contents of a file. */
static char *buffer = NULL;
static char *token_start;

#define BUFFER_SIZE 1024

/* Look-ahead into the buffered file at offset I from the current position,
   refilling the buffer from FILE when the look-ahead would run past its
   end.  The token currently being scanned (token_start..buffer) is kept
   resident by moving it to the head of the buffer before refilling.
   Returns '\0' at end of input. */
static char
get_char (size_t i)
{
  size_t error;
  JERRY_ASSERT (buffer >= buffer_start);
  const size_t tail_size = BUFFER_SIZE - (size_t) (buffer - buffer_start);
  JERRY_ASSERT (file);
  if (buffer == NULL)
  {
    /* First call: allocate and fill the whole buffer. */
    buffer = (char *) mem_heap_alloc_block (BUFFER_SIZE, MEM_HEAP_ALLOC_SHORT_TERM);
    error = __fread (buffer, 1, BUFFER_SIZE, file);
    if (error == 0)
      return '\0';
    if (error < BUFFER_SIZE)
      __memset (buffer + error, '\0', BUFFER_SIZE - error);
    buffer_start = buffer;
  }
  if (tail_size <= i)
  {
    /* We are almost at the end of the buffer. */
    if (token_start)
    {
      JERRY_ASSERT (buffer >= token_start);
      const size_t token_size = (size_t) (buffer - token_start);

      /* Whole buffer contains single token. */
      if (token_start == buffer_start)
        parser_fatal (ERR_BUFFER_SIZE);

      /* Move parsed token and tail of buffer to head. */
      __memmove (buffer_start, token_start, tail_size + token_size);

      /* Adjust pointers. */
      token_start = buffer_start;
      buffer = buffer_start + token_size;

      /* Read more characters from input file. */
      error = __fread (buffer + tail_size, 1, BUFFER_SIZE - tail_size - token_size, file);
      if (error == 0)
        return '\0';
      if (error < BUFFER_SIZE - tail_size - token_size)
        __memset (buffer + tail_size + error, '\0', BUFFER_SIZE - tail_size - token_size - error);
    }
    else
    {
      /* No token in flight: just slide the unread tail to the head. */
      __memmove (buffer_start, buffer, tail_size);
      buffer = buffer_start;
      error = __fread (buffer + tail_size, 1, BUFFER_SIZE - tail_size, file);
      if (error == 0)
        return '\0';
      if (error < BUFFER_SIZE - tail_size)
        __memset (buffer + tail_size + error, '\0', BUFFER_SIZE - tail_size - error);
    }
  }
  return *(buffer + i);
}

#define LA(I) (get_char (I))

#else

/* Represents the contents of a file. */
static const char *buffer = NULL;
static const char *token_start;

#define LA(I) (*(buffer + I))

#endif /* __HOST */

/* If the current token (token_start..buffer) is a keyword, return the
   decoded keyword token; a Future Reserved Word yields KW_RESERVED;
   otherwise return empty_token. */
static token
decode_keyword (void)
{
  const size_t size = sizeof (keyword_tokens) / sizeof (string_and_token);
  const size_t length = (size_t) (buffer - token_start);
  size_t i;
  for (i = 0; i < size; i++)
  {
    /* FIX: require an exact match.  Comparing only the first `length`
       characters accepted any identifier that is a prefix of a keyword
       (e.g. "d" decoded as `debugger`, "brea" as `break`). */
    if (!__strncmp (keyword_tokens[i].str, token_start, length)
        && keyword_tokens[i].str[length] == '\0')
      return keyword_tokens[i].tok;
  }
  return empty_token;
}

/* Return the interned token for the current identifier, or empty_token
   if it has not been seen before. */
static token
convert_seen_name_to_token (void)
{
  const size_t length = (size_t) (buffer - token_start);
  size_t i;
  for (i = 0; i < seen_names_count; i++)
  {
    /* FIX: exact match, not prefix match (see decode_keyword). */
    if (!__strncmp (seen_names[i].str, token_start, length)
        && seen_names[i].str[length] == '\0')
      return seen_names[i].tok;
  }
  return empty_token;
}

static void
add_name_to_seen_tokens (string_and_token snt)
{
  JERRY_ASSERT (seen_names_count < MAX_NAMES);
  seen_names[seen_names_count++] = snt;
}

/* Return the interned token for NUM, or empty_token if unseen. */
static token
convert_seen_num_to_token (int num)
{
  size_t i;
  for (i = 0; i < seen_nums_count; i++)
  {
    if (seen_nums[i].num == num)
      return seen_nums[i].tok;
  }
  return empty_token;
}

static void
add_num_to_seen_tokens (num_and_token nat)
{
  JERRY_ASSERT (seen_nums_count < MAX_NUMS);
  seen_nums[seen_nums_count++] = nat;
}

/* Copy the interned name table into STRINGS (if non-NULL) and return its
   size.  Indices correspond to the uids stored in TOK_NAME/TOK_STRING. */
uint8_t
lexer_get_strings (const char **strings)
{
  if (strings)
  {
    int i;
    for (i = 0; i < seen_names_count; i++)
    {
      strings[i] = seen_names[i].str;
    }
  }
  return seen_names_count;
}

const char *
lexer_get_string_by_id (uint8_t id)
{
  JERRY_ASSERT (id < seen_names_count);
  return seen_names[id].str;
}

/* Copy the interned number table into NUMS and return its size. */
uint8_t
lexer_get_nums (int *nums)
{
  int i;
  for (i = 0; i < seen_nums_count; i++)
  {
    nums[i] = seen_nums[i].num;
  }
  return seen_nums_count;
}

/* Mark the current buffer position as the start of a new token. */
static void
new_token (void)
{
  JERRY_ASSERT (buffer);
  token_start = buffer;
}

static void
consume_char (void)
{
  JERRY_ASSERT (buffer);
  buffer++;
}

/* Heap-allocate a NUL-terminated copy of the current token text and
   close the token (token_start is reset).  Caller owns the string. */
static const char *
current_token (void)
{
  JERRY_ASSERT (buffer);
  JERRY_ASSERT (token_start);
  JERRY_ASSERT (token_start <= buffer);
  size_t length = (size_t) (buffer - token_start);
  char *res = (char *) mem_heap_alloc_block (length + 1, MEM_HEAP_ALLOC_SHORT_TERM);
  __strncpy (res, token_start, length);
  res[length] = '\0';
  token_start = NULL;
  return res;
}

/* Consume NUM characters and return a punctuator token of type TOK. */
#define RETURN_PUNC_EX(TOK, NUM) \
  do \
  { \
    buffer += NUM; \
    return (token) { .type = TOK, .data.uid = 0 }; \
  } \
  while (0)

#define RETURN_PUNC(TOK) RETURN_PUNC_EX (TOK, 1)

#define IF_LA_N_IS(CHAR, THEN_TOK, ELSE_TOK, NUM) \
  do \
  { \
    if (LA (NUM) == CHAR) \
      RETURN_PUNC_EX (THEN_TOK, NUM + 1); \
    else \
      RETURN_PUNC_EX (ELSE_TOK, NUM); \
  } \
  while (0)

#define IF_LA_IS(CHAR, THEN_TOK, ELSE_TOK) \
  IF_LA_N_IS (CHAR, THEN_TOK, ELSE_TOK, 1)

#define IF_LA_IS_OR(CHAR1, THEN1_TOK, CHAR2, THEN2_TOK, ELSE_TOK) \
  do \
  { \
    if (LA (1) == CHAR1) \
      RETURN_PUNC_EX (THEN1_TOK, 2); \
    else if (LA (1) == CHAR2) \
      RETURN_PUNC_EX (THEN2_TOK, 2); \
    else \
      RETURN_PUNC (ELSE_TOK); \
  } \
  while (0)

/* Scan an identifier ([A-Za-z$_][A-Za-z0-9$_]*).  All-lowercase tokens
   are first checked against the keyword table; anything else is interned
   as a TOK_NAME. */
static token
parse_name (void)
{
  char c = LA (0);
  bool every_char_islower = __islower (c);
  const char *string = NULL;
  token known_token = empty_token;

  JERRY_ASSERT (__isalpha (c) || c == '$' || c == '_');

  new_token ();
  consume_char ();
  while (true)
  {
    c = LA (0);
    if (c == '\0')
      break;
    if (!__isalpha (c) && !__isdigit (c) && c != '$' && c != '_')
      break;
    if (every_char_islower && (!__islower (c)))
      every_char_islower = false;
    consume_char ();
  }

  if (every_char_islower)
  {
    /* Only all-lowercase identifiers can possibly be keywords. */
    known_token = decode_keyword ();
    if (!is_empty (known_token))
    {
      token_start = NULL;
      return known_token;
    }
  }

  known_token = convert_seen_name_to_token ();
  if (!is_empty (known_token))
  {
    token_start = NULL;
    return known_token;
  }

  string = current_token ();
  known_token = (token) { .type = TOK_NAME, .data.uid = seen_names_count };
  add_name_to_seen_tokens ((string_and_token) { .str = string, .tok = known_token });
  return known_token;
}

static int
hex_to_int (char hex)
{
  switch (hex)
  {
    case '0': return 0x0;
    case '1': return 0x1;
    case '2': return 0x2;
    case '3': return 0x3;
    case '4': return 0x4;
    case '5': return 0x5;
    case '6': return 0x6;
    case '7': return 0x7;
    case '8': return 0x8;
    case '9': return 0x9;
    case 'a':
    case 'A': return 0xA;
    case 'b':
    case 'B': return 0xB;
    case 'c':
    case 'C': return 0xC;
    case 'd':
    case 'D': return 0xD;
    case 'e':
    case 'E': return 0xE;
    case 'f':
    case 'F': return 0xF;
    default: JERRY_UNREACHABLE ();
  }
}

/* In this function we cannot use strtol function since there are no octal
   literals in ECMAScript.  Handles hex ('0x...'), decimal integer and
   (unimplemented) floating-point / exponent forms. */
static token
parse_number (void)
{
  char c = LA (0);
  bool is_hex = false;
  bool is_fp = false;
  bool is_exp = false;
  size_t tok_length = 0, i;
  int res = 0;
  token known_token;

  JERRY_ASSERT (__isdigit (c) || c == '.');

  if (c == '0')
    if (LA (1) == 'x' || LA (1) == 'X')
      is_hex = true;

  if (c == '.')
  {
    JERRY_ASSERT (!__isalpha (LA (1)));
    is_fp = true;
  }

  if (is_hex)
  {
    /* Eat up '0x'. */
    consume_char ();
    consume_char ();
    new_token ();
    while (true)
    {
      c = LA (0);
      if (!__isxdigit (c))
        break;
      consume_char ();
    }
    if (__isalpha (c) || c == '_' || c == '$')
      parser_fatal (ERR_INT_LITERAL);

    tok_length = (size_t) (buffer - token_start);
    /* OK, I know that integer overflow can occur here. */
    for (i = 0; i < tok_length; i++)
      res = (res << 4) + hex_to_int (token_start[i]);

    token_start = NULL;
    known_token = convert_seen_num_to_token (res);
    if (!is_empty (known_token))
      return known_token;

    known_token = (token) { .type = TOK_INT, .data.uid = seen_nums_count };
    add_num_to_seen_tokens ((num_and_token) { .num = res, .tok = known_token });
    return known_token;
  }

  JERRY_ASSERT (!is_hex && !is_exp);

  new_token ();

  /* Eat up '.'. */
  if (is_fp)
    consume_char ();

  while (true)
  {
    c = LA (0);
    if (is_fp && c == '.')
      parser_fatal (ERR_INT_LITERAL);
    if (is_exp && (c == 'e' || c == 'E'))
      parser_fatal (ERR_INT_LITERAL);

    if (c == '.')
    {
      if (__isalpha (LA (1)) || LA (1) == '_' || LA (1) == '$')
        parser_fatal (ERR_INT_LITERAL);
      is_fp = true;
      consume_char ();
      continue;
    }

    if (c == 'e' || c == 'E')
    {
      if (LA (1) == '-' || LA (1) == '+')
        consume_char ();
      if (!__isdigit (LA (1)))
        parser_fatal (ERR_INT_LITERAL);
      is_exp = true;
      consume_char ();
      continue;
    }

    if (__isalpha (c) || c == '_' || c == '$')
      parser_fatal (ERR_INT_LITERAL);

    if (!__isdigit (c))
      break;

    consume_char ();
  }

  if (is_fp || is_exp)
  {
    /* Floating-point literals are not supported yet. */
    float res = __strtof (token_start, NULL);
    token_start = NULL;
    JERRY_UNIMPLEMENTED_REF_UNUSED_VARS (res);
    return empty_token;
  }

  tok_length = (size_t) (buffer - token_start);
  for (i = 0; i < tok_length; i++)
    res = res * 10 + hex_to_int (token_start[i]);

  token_start = NULL;
  known_token = convert_seen_num_to_token (res);
  if (!is_empty (known_token))
    return known_token;

  known_token = (token) { .type = TOK_INT, .data.uid = seen_nums_count };
  add_num_to_seen_tokens ((num_and_token) { .num = res, .tok = known_token });
  return known_token;
}

/* Translate the character following a backslash to the character it
   escapes; unknown escapes yield the character itself. */
static char
escape_char (char c)
{
  switch (c)
  {
    case 'b': return '\b';
    case 'f': return '\f';
    case 'n': return '\n';
    case 'r': return '\r';
    case 't': return '\t';
    case 'v': return '\v';
    case '\'':
    case '"':
    case '\\':
    default: return c;
  }
}

/* Scan a single- or double-quoted string literal, process escape
   sequences, and intern the result (strings share the name table with
   identifiers). */
static token
parse_string (void)
{
  char c = LA (0);
  bool is_double_quoted;
  char *tok = NULL;
  char *index = NULL;
  const char *i;
  size_t length, tok_length, num;
  token res = empty_token;

  JERRY_ASSERT (c == '\'' || c == '"');

  is_double_quoted = (c == '"');

  /* Eat up the opening quote. */
  consume_char ();
  new_token ();

  while (true)
  {
    c = LA (0);
    if (c == '\0')
      parser_fatal (ERR_UNCLOSED);
    if (c == '\n')
      parser_fatal (ERR_STRING);
    if (c == '\\')
    {
      /* Only single escape character is allowed. */
      if (LA (1) == 'x' || LA (1) == 'u' || __isdigit (LA (1)))
        parser_fatal (ERR_STRING);
      if ((LA (1) == '\'' && !is_double_quoted)
          || (LA (1) == '"' && is_double_quoted)
          || LA (1) == '\n')
      {
        consume_char ();
        consume_char ();
        continue;
      }
    }
    else if ((c == '\'' && !is_double_quoted)
             || (c == '"' && is_double_quoted))
      break;

    consume_char ();
  }

  length = (size_t) (buffer - token_start);
  /* FIX: allocate one extra byte.  With exactly `length` bytes a string
     containing no escapes (index ends at tok + length) got no NUL
     terminator, and the __strlen below overran the block. */
  tok = (char *) mem_heap_alloc_block (length + 1, MEM_HEAP_ALLOC_SHORT_TERM);
  index = tok;

  for (i = token_start; i < buffer; i++)
  {
    if (*i == '\\')
    {
      if (*(i + 1) == '\n')
      {
        /* Line continuation: drop the backslash-newline pair. */
        i++;
        continue;
      }
      *index = escape_char (*(i + 1));
      index++;
      i++;
      continue;
    }
    *index = *i;
    index++;
  }

  /* Zero the slack, always writing at least the terminating NUL. */
  __memset (index, '\0', length + 1 - (size_t) (index - tok));

  token_start = NULL;
  /* Eat up the closing quote. */
  consume_char ();

  tok_length = __strlen (tok);
  for (num = 0; num < seen_names_count; num++)
  {
    /* FIX: exact match, not prefix match (see decode_keyword). */
    if (!__strncmp (seen_names[num].str, tok, tok_length)
        && seen_names[num].str[tok_length] == '\0')
    {
      mem_heap_free_block ((uint8_t *) tok);
      return seen_names[num].tok;
    }
  }

  res = (token) { .type = TOK_STRING, .data.uid = seen_names_count };
  add_name_to_seen_tokens ((string_and_token) { .str = tok, .tok = res });
  return res;
}

/* Skip whitespace, but not newlines (newlines are significant for
   automatic semicolon insertion and are reported as tokens). */
static void
grobble_whitespaces (void)
{
  char c = LA (0);
  while ((__isspace (c) && c != '\n'))
  {
    consume_char ();
    c = LA (0);
  }
}

#ifdef __HOST
void
lexer_set_file (FILE *ex_file)
{
  JERRY_ASSERT (ex_file);
  file = ex_file;
  lexer_debug_log = __fopen ("lexer.log", "w");
  saved_token = empty_token;
  buffer = buffer_start = token_start = NULL;
}
#else
void
lexer_set_source (const char *source)
{
  buffer = source;
  saved_token = empty_token;
}
#endif

/* Skip a '//' or '/ *' comment.  Returns true iff the comment was
   multiline AND contained a newline, in which case the caller must
   report a TOK_NEWLINE (relevant for semicolon insertion). */
static bool
replace_comment_by_newline (void)
{
  char c = LA (0);
  bool multiline;
  bool was_newlines = false;

  JERRY_ASSERT (LA (0) == '/');
  JERRY_ASSERT (LA (1) == '/' || LA (1) == '*');

  multiline = (LA (1) == '*');

  consume_char ();
  consume_char ();

  while (true)
  {
    c = LA (0);
    if (!multiline && (c == '\n' || c == '\0'))
      return false;
    if (multiline && c == '*' && LA (1) == '/')
    {
      consume_char ();
      consume_char ();
      if (was_newlines)
        return true;
      else
        return false;
    }
    if (multiline && c == '\n')
      was_newlines = true;
    if (multiline && c == '\0')
      parser_fatal (ERR_UNCLOSED);
    consume_char ();
  }
}

/* Produce the next token: replays a saved token, then dispatches on the
   first character to the name/number/string scanners, or decodes a
   punctuator inline. */
#ifdef __HOST
static token
lexer_next_token_private (void)
#else
token
lexer_next_token (void)
#endif
{
  char c = LA (0);

  if (!is_empty (saved_token))
  {
    token res = saved_token;
    saved_token = empty_token;
    return res;
  }

  JERRY_ASSERT (token_start == NULL);

  if (__isalpha (c) || c == '$' || c == '_')
    return parse_name ();

  if (__isdigit (c) || (c == '.' && __isdigit (LA (1))))
    return parse_number ();

  if (c == '\n')
  {
    consume_char ();
    return (token) { .type = TOK_NEWLINE, .data.uid = 0 };
  }

  if (c == '\0')
    return (token) { .type = TOK_EOF, .data.uid = 0 };

  if (c == '\'' || c == '"')
    return parse_string ();

  if (__isspace (c))
  {
    grobble_whitespaces ();
    return
#ifdef __HOST
      lexer_next_token_private ();
#else
      lexer_next_token ();
#endif
  }

  if (c == '/' && LA (1) == '*')
  {
    if (replace_comment_by_newline ())
      return (token) { .type = TOK_NEWLINE, .data.uid = 0 };
    else
      return
#ifdef __HOST
        lexer_next_token_private ();
#else
        lexer_next_token ();
#endif
  }

  if (c == '/' && LA (1) == '/')
  {
    replace_comment_by_newline ();
    return
#ifdef __HOST
      lexer_next_token_private ();
#else
      lexer_next_token ();
#endif
  }

  switch (c)
  {
    case '{': RETURN_PUNC (TOK_OPEN_BRACE);
    case '}': RETURN_PUNC (TOK_CLOSE_BRACE);
    case '(': RETURN_PUNC (TOK_OPEN_PAREN);
    case ')': RETURN_PUNC (TOK_CLOSE_PAREN);
    case '[': RETURN_PUNC (TOK_OPEN_SQUARE);
    case ']': RETURN_PUNC (TOK_CLOSE_SQUARE);
    case '.': RETURN_PUNC (TOK_DOT);
    case ';': RETURN_PUNC (TOK_SEMICOLON);
    case ',': RETURN_PUNC (TOK_COMMA);
    case '~': RETURN_PUNC (TOK_COMPL);
    case ':': RETURN_PUNC (TOK_COLON);
    case '?': RETURN_PUNC (TOK_QUERY);

    case '*': IF_LA_IS ('=', TOK_MULT_EQ, TOK_MULT);
    case '/': IF_LA_IS ('=', TOK_DIV_EQ, TOK_DIV);
    case '^': IF_LA_IS ('=', TOK_XOR_EQ, TOK_XOR);
    case '%': IF_LA_IS ('=', TOK_MOD_EQ, TOK_MOD);

    case '+': IF_LA_IS_OR ('+', TOK_DOUBLE_PLUS, '=', TOK_PLUS_EQ, TOK_PLUS);
    case '-': IF_LA_IS_OR ('-', TOK_DOUBLE_MINUS, '=', TOK_MINUS_EQ, TOK_MINUS);
    case '&': IF_LA_IS_OR ('&', TOK_DOUBLE_AND, '=', TOK_AND_EQ, TOK_AND);
    case '|': IF_LA_IS_OR ('|', TOK_DOUBLE_OR, '=', TOK_OR_EQ, TOK_OR);

    case '<':
      switch (LA (1))
      {
        case '<': IF_LA_N_IS ('=', TOK_LSHIFT_EQ, TOK_LSHIFT, 2);
        case '=': RETURN_PUNC_EX (TOK_LESS_EQ, 2);
        default: RETURN_PUNC (TOK_LESS);
      }
    case '>':
      switch (LA (1))
      {
        case '>':
          switch (LA (2))
          {
            case '>': IF_LA_N_IS ('=', TOK_RSHIFT_EX_EQ, TOK_RSHIFT_EX, 3);
            case '=': RETURN_PUNC_EX (TOK_RSHIFT_EQ, 3);
            default: RETURN_PUNC_EX (TOK_RSHIFT, 2);
          }
        case '=': RETURN_PUNC_EX (TOK_GREATER_EQ, 2);
        default: RETURN_PUNC (TOK_GREATER);
      }
    case '=':
      if (LA (1) == '=')
        IF_LA_N_IS ('=', TOK_TRIPLE_EQ, TOK_DOUBLE_EQ, 2);
      else
        RETURN_PUNC (TOK_EQ);
    case '!':
      if (LA (1) == '=')
        IF_LA_N_IS ('=', TOK_NOT_DOUBLE_EQ, TOK_NOT_EQ, 2);
      else
        RETURN_PUNC (TOK_NOT);
    default:
      JERRY_UNREACHABLE ();
  }
  parser_fatal (ERR_NON_CHAR);
}

#ifdef __HOST
/* Debug counter of close-brace tokens handed out, used in the log. */
static int i = 0;

token
lexer_next_token (void)
{
  token tok = lexer_next_token_private ();
  if (tok.type == TOK_NEWLINE)
    return tok;
  if (tok.type == TOK_CLOSE_BRACE)
  {
    __fprintf (lexer_debug_log, "lexer_next_token(%d): type=%d, data=%d\n",
               i, tok.type, tok.data.uid);
    i++;
  }
  return tok;
}
#endif

/* Push TOK back; the next lexer_next_token call will return it. */
void
lexer_save_token (token tok)
{
#ifdef __HOST
  if (tok.type == TOK_CLOSE_BRACE)
    __fprintf (lexer_debug_log, "lexer_save_token(%d): type=%d, data=%d\n",
               i, tok.type, tok.data.uid);
#endif
  saved_token = tok;
}

void
lexer_dump_buffer_state (void)
{
  __printf ("%s\n", buffer);
}