Add parser and compiler of regular expressions.
JerryScript-DCO-1.0-Signed-off-by: Szilard Ledan szledan.u-szeged@partner.samsung.com JerryScript-DCO-1.0-Signed-off-by: László Langó llango.u-szeged@partner.samsung.com
This commit is contained in:
@@ -102,6 +102,7 @@ project (JerryCore CXX C ASM)
|
||||
${CMAKE_SOURCE_DIR}/jerry-core/ecma/operations
|
||||
${CMAKE_SOURCE_DIR}/jerry-core/parser/js
|
||||
${CMAKE_SOURCE_DIR}/jerry-core/parser/js/collections
|
||||
${CMAKE_SOURCE_DIR}/jerry-core/parser/regexp
|
||||
${CMAKE_SOURCE_DIR}/jerry-core/jrt)
|
||||
|
||||
# Third-party
|
||||
@@ -120,6 +121,7 @@ project (JerryCore CXX C ASM)
|
||||
file(GLOB SOURCE_CORE_ECMA_OPERATIONS ecma/operations/*.cpp)
|
||||
file(GLOB SOURCE_CORE_PARSER_JS parser/js/*.cpp)
|
||||
file(GLOB SOURCE_CORE_PARSER_JS_COLLECTIONS parser/js/collections/*.cpp)
|
||||
file(GLOB SOURCE_CORE_PARSER_REGEXP parser/regexp/*.cpp)
|
||||
file(GLOB SOURCE_CORE_JRT jrt/*.cpp)
|
||||
|
||||
set(SOURCE_CORE
|
||||
@@ -134,6 +136,7 @@ project (JerryCore CXX C ASM)
|
||||
${SOURCE_CORE_ECMA_OPERATIONS}
|
||||
${SOURCE_CORE_PARSER_JS}
|
||||
${SOURCE_CORE_PARSER_JS_COLLECTIONS}
|
||||
${SOURCE_CORE_PARSER_REGEXP}
|
||||
${SOURCE_CORE_JRT})
|
||||
|
||||
# Per-option configuration
|
||||
|
||||
@@ -0,0 +1,888 @@
|
||||
/* Copyright 2015 Samsung Electronics Co., Ltd.
|
||||
* Copyright 2015 University of Szeged.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "ecma-exceptions.h"
|
||||
#include "ecma-helpers.h"
|
||||
#include "ecma-try-catch-macro.h"
|
||||
#include "jrt-libc-includes.h"
|
||||
#include "mem-heap.h"
|
||||
#include "re-compiler.h"
|
||||
|
||||
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN
|
||||
|
||||
/**
|
||||
* FIXME:
|
||||
* Add comments to macro definitions in the component
|
||||
*/
|
||||
|
||||
/** Number of bytes by which the bytecode container grows on each reallocation */
#define REGEXP_BYTECODE_BLOCK_SIZE 256UL

/**
 * Number of bytes written so far into the bytecode container
 *
 * The argument is a pointer to a bytecode context; it is parenthesized so that
 * expressions such as '&ctx' can be passed safely.
 */
#define BYTECODE_LEN(bc_ctx_p) ((uint32_t) ((bc_ctx_p)->current_p - (bc_ctx_p)->block_start_p))
|
||||
|
||||
void
|
||||
regexp_dump_bytecode (re_bytecode_ctx_t *bc_ctx);
|
||||
|
||||
/**
|
||||
* FIXME:
|
||||
* Add missing 're' prefixes to the component's external and internal interfaces
|
||||
*/
|
||||
|
||||
/**
|
||||
* Realloc the bytecode container
|
||||
*/
|
||||
static re_bytecode_t*
|
||||
realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
|
||||
{
|
||||
JERRY_ASSERT (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p >= 0);
|
||||
size_t old_size = static_cast<size_t> (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p);
|
||||
JERRY_ASSERT (!bc_ctx_p->current_p && !bc_ctx_p->block_end_p && !bc_ctx_p->block_start_p);
|
||||
|
||||
size_t new_block_size = old_size + REGEXP_BYTECODE_BLOCK_SIZE;
|
||||
JERRY_ASSERT (bc_ctx_p->current_p - bc_ctx_p->block_start_p >= 0);
|
||||
size_t current_ptr_offset = static_cast<size_t> (bc_ctx_p->current_p - bc_ctx_p->block_start_p);
|
||||
|
||||
re_bytecode_t *new_block_start_p = (re_bytecode_t *) mem_heap_alloc_block (new_block_size,
|
||||
MEM_HEAP_ALLOC_SHORT_TERM);
|
||||
if (bc_ctx_p->current_p)
|
||||
{
|
||||
memcpy (new_block_start_p, bc_ctx_p->block_start_p, static_cast<size_t> (current_ptr_offset));
|
||||
mem_heap_free_block (bc_ctx_p->block_start_p);
|
||||
}
|
||||
bc_ctx_p->block_start_p = new_block_start_p;
|
||||
bc_ctx_p->block_end_p = new_block_start_p + new_block_size;
|
||||
bc_ctx_p->current_p = new_block_start_p + current_ptr_offset;
|
||||
|
||||
return bc_ctx_p->current_p;
|
||||
} /* realloc_regexp_bytecode_block */
|
||||
|
||||
/**
|
||||
* Append a new bytecode to the and of the bytecode container
|
||||
*/
|
||||
static void
|
||||
bytecode_list_append (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
re_bytecode_t *bytecode_p, /**< input bytecode */
|
||||
size_t length) /**< length of input */
|
||||
{
|
||||
JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE);
|
||||
|
||||
re_bytecode_t *current_p = bc_ctx_p->current_p;
|
||||
if (current_p + length > bc_ctx_p->block_end_p)
|
||||
{
|
||||
current_p = realloc_regexp_bytecode_block (bc_ctx_p);
|
||||
}
|
||||
|
||||
memcpy (current_p, bytecode_p, length);
|
||||
bc_ctx_p->current_p += length;
|
||||
} /* bytecode_list_append */
|
||||
|
||||
/**
|
||||
* Insert a new bytecode to the bytecode container
|
||||
*/
|
||||
static void
|
||||
bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
size_t offset, /**< distance from the start of the container */
|
||||
re_bytecode_t *bytecode_p, /**< input bytecode */
|
||||
size_t length) /**< length of input */
|
||||
{
|
||||
JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE);
|
||||
|
||||
re_bytecode_t *current_p = bc_ctx_p->current_p;
|
||||
if (current_p + length > bc_ctx_p->block_end_p)
|
||||
{
|
||||
realloc_regexp_bytecode_block (bc_ctx_p);
|
||||
}
|
||||
|
||||
re_bytecode_t *src_p = bc_ctx_p->block_start_p + offset;
|
||||
if ((BYTECODE_LEN (bc_ctx_p) - offset) > 0)
|
||||
{
|
||||
re_bytecode_t *dest_p = src_p + length;
|
||||
re_bytecode_t *tmp_block_start_p = (re_bytecode_t *) mem_heap_alloc_block ((BYTECODE_LEN (bc_ctx_p) - offset),
|
||||
MEM_HEAP_ALLOC_SHORT_TERM);
|
||||
memcpy (tmp_block_start_p, src_p, (size_t) (BYTECODE_LEN (bc_ctx_p) - offset));
|
||||
memcpy (dest_p, tmp_block_start_p, (size_t) (BYTECODE_LEN (bc_ctx_p) - offset));
|
||||
mem_heap_free_block (tmp_block_start_p);
|
||||
}
|
||||
memcpy (src_p, bytecode_p, length);
|
||||
|
||||
bc_ctx_p->current_p += length;
|
||||
} /* bytecode_list_insert */
|
||||
|
||||
/**
|
||||
* Append a RegExp opcode
|
||||
*/
|
||||
static void
|
||||
append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
re_opcode_t opcode) /**< input opcode */
|
||||
{
|
||||
bytecode_list_append (bc_ctx_p, (re_bytecode_t*) &opcode, sizeof (re_bytecode_t));
|
||||
} /* append_opcode */
|
||||
|
||||
/**
|
||||
* Append a parameter of a RegExp opcode
|
||||
*/
|
||||
static void
|
||||
append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
uint32_t value) /**< input value */
|
||||
{
|
||||
bytecode_list_append (bc_ctx_p, (re_bytecode_t*) &value, sizeof (uint32_t));
|
||||
} /* append_u32 */
|
||||
|
||||
/**
|
||||
* Append a jump offset parameter of a RegExp opcode
|
||||
*/
|
||||
static void
|
||||
append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
uint32_t value) /**< input value */
|
||||
{
|
||||
value += (uint32_t) (sizeof (uint32_t));
|
||||
append_u32 (bc_ctx_p, value);
|
||||
} /* append_jump_offset */
|
||||
|
||||
/**
|
||||
* Insert a RegExp opcode
|
||||
*/
|
||||
static void
|
||||
insert_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
uint32_t offset, /**< distance from the start of the container */
|
||||
re_opcode_t opcode) /**< input opcode */
|
||||
{
|
||||
bytecode_list_insert (bc_ctx_p, offset, (re_bytecode_t*) &opcode, sizeof (re_bytecode_t));
|
||||
} /* insert_opcode */
|
||||
|
||||
/**
|
||||
* Insert a parameter of a RegExp opcode
|
||||
*/
|
||||
static void
|
||||
insert_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
|
||||
uint32_t offset, /**< distance from the start of the container */
|
||||
uint32_t value) /**< input value */
|
||||
{
|
||||
bytecode_list_insert (bc_ctx_p, offset, (re_bytecode_t*) &value, sizeof (uint32_t));
|
||||
} /* insert_u32 */
|
||||
|
||||
/**
|
||||
* Get a RegExp opcode
|
||||
*/
|
||||
re_opcode_t
|
||||
re_get_opcode (re_bytecode_t **bc_p) /**< pointer to bytecode start */
|
||||
{
|
||||
re_bytecode_t bytecode = **bc_p;
|
||||
(*bc_p) += sizeof (re_bytecode_t);
|
||||
return (re_opcode_t) bytecode;
|
||||
} /* get_opcode */
|
||||
|
||||
/**
|
||||
* Get a parameter of a RegExp opcode
|
||||
*/
|
||||
uint32_t
|
||||
re_get_value (re_bytecode_t **bc_p) /**< pointer to bytecode start */
|
||||
{
|
||||
uint32_t value = *((uint32_t*) *bc_p);
|
||||
(*bc_p) += sizeof (uint32_t);
|
||||
return value;
|
||||
} /* get_value */
|
||||
|
||||
/**
|
||||
* Callback function of character class generation
|
||||
*/
|
||||
static void
|
||||
append_char_class (void* re_ctx_p, /**< RegExp compiler context */
|
||||
uint32_t start, /**< character class range from */
|
||||
uint32_t end) /**< character class range to */
|
||||
{
|
||||
/* FIXME: Handle ignore case flag and add unicode support. */
|
||||
re_compiler_ctx_t *ctx_p = (re_compiler_ctx_t*) re_ctx_p;
|
||||
append_u32 (ctx_p->bytecode_ctx_p, start);
|
||||
append_u32 (ctx_p->bytecode_ctx_p, end);
|
||||
ctx_p->parser_ctx_p->num_of_classes++;
|
||||
} /* append_char_class */
|
||||
|
||||
/**
|
||||
* Insert simple atom iterator
|
||||
*/
|
||||
static void
|
||||
insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
|
||||
uint32_t new_atom_start_offset) /**< atom start offset */
|
||||
{
|
||||
uint32_t atom_code_length;
|
||||
uint32_t offset;
|
||||
uint32_t qmin, qmax;
|
||||
|
||||
qmin = re_ctx_p->current_token.qmin;
|
||||
qmax = re_ctx_p->current_token.qmax;
|
||||
JERRY_ASSERT (qmin <= qmax);
|
||||
|
||||
/* FIXME: optimize bytecode length. Store 0 rather than INF */
|
||||
|
||||
append_opcode (re_ctx_p->bytecode_ctx_p, RE_OP_MATCH); /* complete 'sub atom' */
|
||||
uint32_t bytecode_length = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
|
||||
atom_code_length = (uint32_t) (bytecode_length - new_atom_start_offset);
|
||||
|
||||
offset = new_atom_start_offset;
|
||||
insert_u32 (re_ctx_p->bytecode_ctx_p, offset, atom_code_length);
|
||||
insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmax);
|
||||
insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmin);
|
||||
if (re_ctx_p->current_token.greedy)
|
||||
{
|
||||
insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_GREEDY_ITERATOR);
|
||||
}
|
||||
else
|
||||
{
|
||||
insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_NON_GREEDY_ITERATOR);
|
||||
}
|
||||
} /* insert_simple_iterator */
|
||||
|
||||
/**
|
||||
* Get the type of a group start
|
||||
*/
|
||||
static re_opcode_t
|
||||
get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
|
||||
bool is_capturable) /**< is capturabel group */
|
||||
{
|
||||
if (is_capturable)
|
||||
{
|
||||
if (re_ctx_p->current_token.qmin == 0)
|
||||
{
|
||||
if (re_ctx_p->current_token.greedy)
|
||||
{
|
||||
return RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START;
|
||||
}
|
||||
|
||||
return RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START;
|
||||
}
|
||||
|
||||
return RE_OP_CAPTURE_GROUP_START;
|
||||
}
|
||||
|
||||
if (re_ctx_p->current_token.qmin == 0)
|
||||
{
|
||||
if (re_ctx_p->current_token.greedy)
|
||||
{
|
||||
return RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START;
|
||||
}
|
||||
|
||||
return RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START;
|
||||
}
|
||||
|
||||
return RE_OP_NON_CAPTURE_GROUP_START;
|
||||
|
||||
JERRY_UNREACHABLE ();
|
||||
return 0;
|
||||
} /* get_start_opcode_type */
|
||||
|
||||
/**
|
||||
* Get the type of a group end
|
||||
*/
|
||||
static re_opcode_t
|
||||
get_end_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
|
||||
bool is_capturable) /**< is capturabel group */
|
||||
{
|
||||
if (is_capturable)
|
||||
{
|
||||
if (re_ctx_p->current_token.greedy)
|
||||
{
|
||||
return RE_OP_CAPTURE_GREEDY_GROUP_END;
|
||||
}
|
||||
|
||||
return RE_OP_CAPTURE_NON_GREEDY_GROUP_END;
|
||||
}
|
||||
|
||||
if (re_ctx_p->current_token.greedy)
|
||||
{
|
||||
return RE_OP_NON_CAPTURE_GREEDY_GROUP_END;
|
||||
}
|
||||
|
||||
return RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END;
|
||||
|
||||
JERRY_UNREACHABLE ();
|
||||
return 0;
|
||||
} /* get_end_opcode_type */
|
||||
|
||||
/**
|
||||
* Enclose the given bytecode to a group
|
||||
*/
|
||||
static void
|
||||
insert_into_group (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
|
||||
uint32_t group_start_offset, /**< offset of group start */
|
||||
uint32_t idx, /**< index of group */
|
||||
bool is_capturable) /**< is capturabel group */
|
||||
{
|
||||
uint32_t qmin, qmax;
|
||||
re_opcode_t start_opcode = get_start_opcode_type (re_ctx_p, is_capturable);
|
||||
re_opcode_t end_opcode = get_end_opcode_type (re_ctx_p, is_capturable);
|
||||
uint32_t start_head_offset_len;
|
||||
|
||||
qmin = re_ctx_p->current_token.qmin;
|
||||
qmax = re_ctx_p->current_token.qmax;
|
||||
JERRY_ASSERT (qmin <= qmax);
|
||||
|
||||
start_head_offset_len = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
|
||||
insert_u32 (re_ctx_p->bytecode_ctx_p, group_start_offset, idx);
|
||||
insert_opcode (re_ctx_p->bytecode_ctx_p, group_start_offset, start_opcode);
|
||||
start_head_offset_len = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - start_head_offset_len;
|
||||
append_opcode (re_ctx_p->bytecode_ctx_p, end_opcode);
|
||||
append_u32 (re_ctx_p->bytecode_ctx_p, idx);
|
||||
append_u32 (re_ctx_p->bytecode_ctx_p, qmin);
|
||||
append_u32 (re_ctx_p->bytecode_ctx_p, qmax);
|
||||
|
||||
group_start_offset += start_head_offset_len;
|
||||
append_jump_offset (re_ctx_p->bytecode_ctx_p,
|
||||
BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - group_start_offset);
|
||||
|
||||
if (start_opcode != RE_OP_CAPTURE_GROUP_START && start_opcode != RE_OP_NON_CAPTURE_GROUP_START)
|
||||
{
|
||||
insert_u32 (re_ctx_p->bytecode_ctx_p,
|
||||
group_start_offset,
|
||||
BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - group_start_offset);
|
||||
}
|
||||
} /* insert_into_group */
|
||||
|
||||
/**
|
||||
* Enclose the given bytecode to a group and inster jump value
|
||||
*/
|
||||
static void
|
||||
insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
|
||||
uint32_t group_start_offset, /**< offset of group start */
|
||||
uint32_t idx, /**< index of group */
|
||||
bool is_capturable) /**< is capturabel group */
|
||||
{
|
||||
insert_u32 (re_ctx_p->bytecode_ctx_p,
|
||||
group_start_offset,
|
||||
BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - group_start_offset);
|
||||
insert_into_group (re_ctx_p, group_start_offset, idx, is_capturable);
|
||||
} /* insert_into_group_with_jump */
|
||||
|
||||
/**
|
||||
* Parse alternatives
|
||||
*
|
||||
* @return completion value
|
||||
* Returned value must be freed with ecma_free_completion_value
|
||||
*/
|
||||
static ecma_completion_value_t
|
||||
parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
|
||||
bool expect_eof) /**< expect end of file */
|
||||
{
|
||||
uint32_t idx;
|
||||
re_bytecode_ctx_t *bc_ctx_p = re_ctx_p->bytecode_ctx_p;
|
||||
ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();
|
||||
|
||||
uint32_t alterantive_offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
|
||||
|
||||
if (re_ctx_p->recursion_depth >= RE_COMPILE_RECURSION_LIMIT)
|
||||
{
|
||||
ret_value = ecma_raise_range_error ((const ecma_char_t *) "RegExp compiler recursion limit is exceeded.");
|
||||
return ret_value;
|
||||
}
|
||||
re_ctx_p->recursion_depth++;
|
||||
|
||||
while (true)
|
||||
{
|
||||
ECMA_TRY_CATCH (empty,
|
||||
re_parse_next_token (re_ctx_p->parser_ctx_p,
|
||||
&(re_ctx_p->current_token)),
|
||||
ret_value);
|
||||
ECMA_FINALIZE (empty);
|
||||
if (!ecma_is_completion_value_empty (ret_value))
|
||||
{
|
||||
return ret_value; /* error */
|
||||
}
|
||||
uint32_t new_atom_start_offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
|
||||
|
||||
switch (re_ctx_p->current_token.type)
|
||||
{
|
||||
case RE_TOK_START_CAPTURE_GROUP:
|
||||
{
|
||||
idx = re_ctx_p->num_of_captures++;
|
||||
JERRY_DDLOG ("Compile a capture group start (idx: %d)\n", idx);
|
||||
|
||||
ret_value = parse_alternative (re_ctx_p, false);
|
||||
if (ecma_is_completion_value_empty (ret_value))
|
||||
{
|
||||
insert_into_group (re_ctx_p, new_atom_start_offset, idx, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
return ret_value; /* error */
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RE_TOK_START_NON_CAPTURE_GROUP:
|
||||
{
|
||||
idx = re_ctx_p->num_of_non_captures++;
|
||||
JERRY_DDLOG ("Compile a non-capture group start (idx: %d)\n", idx);
|
||||
|
||||
ret_value = parse_alternative (re_ctx_p, false);
|
||||
if (ecma_is_completion_value_empty (ret_value))
|
||||
{
|
||||
insert_into_group (re_ctx_p, new_atom_start_offset, idx, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
return ret_value; /* error */
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RE_TOK_CHAR:
|
||||
{
|
||||
JERRY_DDLOG ("Compile character token: %c, qmin: %d, qmax: %d\n",
|
||||
re_ctx_p->current_token.value, re_ctx_p->current_token.qmin, re_ctx_p->current_token.qmax);
|
||||
|
||||
append_opcode (bc_ctx_p, RE_OP_CHAR);
|
||||
append_u32 (bc_ctx_p, re_ctx_p->current_token.value);
|
||||
|
||||
if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1))
|
||||
{
|
||||
insert_simple_iterator (re_ctx_p, new_atom_start_offset);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RE_TOK_PERIOD:
|
||||
{
|
||||
JERRY_DDLOG ("Compile a period\n");
|
||||
append_opcode (bc_ctx_p, RE_OP_PERIOD);
|
||||
|
||||
if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1))
|
||||
{
|
||||
insert_simple_iterator (re_ctx_p, new_atom_start_offset);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ALTERNATIVE:
|
||||
{
|
||||
JERRY_DDLOG ("Compile an alternative\n");
|
||||
insert_u32 (bc_ctx_p, alterantive_offset, BYTECODE_LEN (bc_ctx_p) - alterantive_offset);
|
||||
append_opcode (bc_ctx_p, RE_OP_ALTERNATIVE);
|
||||
alterantive_offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ASSERT_START:
|
||||
{
|
||||
JERRY_DDLOG ("Compile a start assertion\n");
|
||||
append_opcode (bc_ctx_p, RE_OP_ASSERT_START);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ASSERT_END:
|
||||
{
|
||||
JERRY_DDLOG ("Compile an end assertion\n");
|
||||
append_opcode (bc_ctx_p, RE_OP_ASSERT_END);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ASSERT_WORD_BOUNDARY:
|
||||
{
|
||||
JERRY_DDLOG ("Compile a word boundary assertion\n");
|
||||
append_opcode (bc_ctx_p, RE_OP_ASSERT_WORD_BOUNDARY);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ASSERT_NOT_WORD_BOUNDARY:
|
||||
{
|
||||
JERRY_DDLOG ("Compile a not word boundary assertion\n");
|
||||
append_opcode (bc_ctx_p, RE_OP_ASSERT_NOT_WORD_BOUNDARY);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ASSERT_START_POS_LOOKAHEAD:
|
||||
{
|
||||
JERRY_DDLOG ("Compile a positive lookahead assertion\n");
|
||||
idx = re_ctx_p->num_of_non_captures++;
|
||||
append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_POS);
|
||||
|
||||
ret_value = parse_alternative (re_ctx_p, false);
|
||||
if (ecma_is_completion_value_empty (ret_value))
|
||||
{
|
||||
append_opcode (bc_ctx_p, RE_OP_MATCH);
|
||||
|
||||
insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
return ret_value; /* error */
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RE_TOK_ASSERT_START_NEG_LOOKAHEAD:
|
||||
{
|
||||
JERRY_DDLOG ("Compile a negative lookahead assertion\n");
|
||||
idx = re_ctx_p->num_of_non_captures++;
|
||||
append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_NEG);
|
||||
|
||||
ret_value = parse_alternative (re_ctx_p, false);
|
||||
if (ecma_is_completion_value_empty (ret_value))
|
||||
{
|
||||
append_opcode (bc_ctx_p, RE_OP_MATCH);
|
||||
|
||||
insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
return ret_value; /* error */
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RE_TOK_BACKREFERENCE:
|
||||
{
|
||||
uint32_t backref = (uint32_t) re_ctx_p->current_token.value;
|
||||
idx = re_ctx_p->num_of_non_captures++;
|
||||
if (backref > re_ctx_p->highest_backref)
|
||||
{
|
||||
re_ctx_p->highest_backref = backref;
|
||||
}
|
||||
JERRY_DDLOG ("Compile a backreference: %d\n", backref);
|
||||
append_opcode (bc_ctx_p, RE_OP_BACKREFERENCE);
|
||||
append_u32 (bc_ctx_p, backref);
|
||||
|
||||
insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_START_CHAR_CLASS:
|
||||
case RE_TOK_START_INV_CHAR_CLASS:
|
||||
{
|
||||
JERRY_DDLOG ("Compile a character class\n");
|
||||
append_opcode (bc_ctx_p,
|
||||
re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS
|
||||
? RE_OP_CHAR_CLASS
|
||||
: RE_OP_INV_CHAR_CLASS);
|
||||
uint32_t offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
|
||||
|
||||
ECMA_TRY_CATCH (empty,
|
||||
re_parse_char_class (re_ctx_p->parser_ctx_p,
|
||||
append_char_class,
|
||||
re_ctx_p,
|
||||
&(re_ctx_p->current_token)),
|
||||
ret_value);
|
||||
insert_u32 (bc_ctx_p, offset, re_ctx_p->parser_ctx_p->num_of_classes);
|
||||
|
||||
if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1))
|
||||
{
|
||||
insert_simple_iterator (re_ctx_p, new_atom_start_offset);
|
||||
}
|
||||
ECMA_FINALIZE (empty);
|
||||
break;
|
||||
}
|
||||
case RE_TOK_END_GROUP:
|
||||
{
|
||||
JERRY_DDLOG ("Compile a group end\n");
|
||||
|
||||
if (expect_eof)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "Unexpected end of paren.");
|
||||
}
|
||||
else
|
||||
{
|
||||
insert_u32 (bc_ctx_p, alterantive_offset, BYTECODE_LEN (bc_ctx_p) - alterantive_offset);
|
||||
re_ctx_p->recursion_depth--;
|
||||
}
|
||||
|
||||
return ret_value;
|
||||
}
|
||||
case RE_TOK_EOF:
|
||||
{
|
||||
if (!expect_eof)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "Unexpected end of pattern.");
|
||||
}
|
||||
else
|
||||
{
|
||||
insert_u32 (bc_ctx_p, alterantive_offset, BYTECODE_LEN (bc_ctx_p) - alterantive_offset);
|
||||
re_ctx_p->recursion_depth--;
|
||||
}
|
||||
|
||||
return ret_value;
|
||||
}
|
||||
default:
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "Unexpected RegExp token.");
|
||||
return ret_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
JERRY_UNREACHABLE ();
|
||||
return ret_value;
|
||||
} /* parse_alternative */
|
||||
|
||||
/**
|
||||
* Compilation of RegExp bytecode
|
||||
*
|
||||
* @return completion value
|
||||
* Returned value must be freed with ecma_free_completion_value
|
||||
*/
|
||||
ecma_completion_value_t
|
||||
re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */
|
||||
ecma_string_t *pattern_str_p, /**< pattern */
|
||||
uint8_t flags) /**< flags */
|
||||
{
|
||||
ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();
|
||||
re_compiler_ctx_t re_ctx;
|
||||
re_ctx.flags = flags;
|
||||
re_ctx.highest_backref = 0;
|
||||
re_ctx.num_of_non_captures = 0;
|
||||
re_ctx.recursion_depth = 0;
|
||||
|
||||
re_bytecode_ctx_t bc_ctx;
|
||||
bc_ctx.block_start_p = NULL;
|
||||
bc_ctx.block_end_p = NULL;
|
||||
bc_ctx.current_p = NULL;
|
||||
|
||||
re_ctx.bytecode_ctx_p = &bc_ctx;
|
||||
|
||||
int32_t pattern_str_len = ecma_string_get_length (pattern_str_p);
|
||||
MEM_DEFINE_LOCAL_ARRAY (pattern_start_p, pattern_str_len + 1, ecma_char_t);
|
||||
ssize_t zt_str_size = (ssize_t) sizeof (ecma_char_t) * (pattern_str_len + 1);
|
||||
ecma_string_to_zt_string (pattern_str_p, pattern_start_p, zt_str_size);
|
||||
|
||||
re_parser_ctx_t parser_ctx;
|
||||
parser_ctx.pattern_start_p = pattern_start_p;
|
||||
parser_ctx.current_char_p = pattern_start_p;
|
||||
parser_ctx.num_of_groups = -1;
|
||||
re_ctx.parser_ctx_p = &parser_ctx;
|
||||
|
||||
/* 1. Parse RegExp pattern */
|
||||
re_ctx.num_of_captures = 1;
|
||||
append_opcode (&bc_ctx, RE_OP_SAVE_AT_START);
|
||||
|
||||
ECMA_TRY_CATCH (empty, parse_alternative (&re_ctx, true), ret_value);
|
||||
|
||||
/* 2. Check for invalid backreference */
|
||||
if (re_ctx.highest_backref >= re_ctx.num_of_captures)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "Invalid backreference.\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
append_opcode (&bc_ctx, RE_OP_SAVE_AND_MATCH);
|
||||
append_opcode (&bc_ctx, RE_OP_EOF);
|
||||
|
||||
/* 3. Insert extra informations for bytecode header */
|
||||
insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.num_of_non_captures);
|
||||
insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.num_of_captures * 2);
|
||||
insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.flags);
|
||||
}
|
||||
ECMA_FINALIZE (empty);
|
||||
|
||||
/* The RegExp bytecode contains at least a RE_OP_SAVE_AT_START opdoce, so it cannot be NULL. */
|
||||
JERRY_ASSERT (bc_ctx.block_start_p != NULL);
|
||||
ECMA_SET_POINTER (bytecode_p->u.internal_property.value, bc_ctx.block_start_p);
|
||||
|
||||
MEM_FINALIZE_LOCAL_ARRAY (pattern_start_p);
|
||||
|
||||
#ifdef JERRY_ENABLE_LOG
|
||||
regexp_dump_bytecode (&bc_ctx);
|
||||
#endif
|
||||
|
||||
return ret_value;
|
||||
} /* re_compile_bytecode */
|
||||
|
||||
#ifdef JERRY_ENABLE_LOG
/**
 * RegExp bytecode dumper
 *
 * Logs the three header values of the bytecode, then walks the opcodes
 * and logs each of them together with its parameters until RE_OP_EOF is read.
 */
void
regexp_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p)
{
  re_bytecode_t *cursor_p = bc_ctx_p->block_start_p;

  /* Bytecode header */
  JERRY_DLOG ("%d ", re_get_value (&cursor_p));
  JERRY_DLOG ("%d ", re_get_value (&cursor_p));
  JERRY_DLOG ("%d | ", re_get_value (&cursor_p));

  re_opcode_t opcode = re_get_opcode (&cursor_p);

  while (opcode != RE_OP_EOF)
  {
    switch (opcode)
    {
      case RE_OP_MATCH:
      {
        JERRY_DLOG ("MATCH, ");
        break;
      }
      case RE_OP_CHAR:
      {
        JERRY_DLOG ("CHAR ");
        JERRY_DLOG ("%c, ", (char) re_get_value (&cursor_p));
        break;
      }
      case RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
      {
        JERRY_DLOG ("N");
        /* FALLTHRU */
      }
      case RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START:
      {
        JERRY_DLOG ("GZ_START ");
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_CAPTURE_GROUP_START:
      {
        JERRY_DLOG ("START ");
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_CAPTURE_NON_GREEDY_GROUP_END:
      {
        JERRY_DLOG ("N");
        /* FALLTHRU */
      }
      case RE_OP_CAPTURE_GREEDY_GROUP_END:
      {
        JERRY_DLOG ("G_END ");
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
      {
        JERRY_DLOG ("N");
        /* FALLTHRU */
      }
      case RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START:
      {
        JERRY_DLOG ("GZ_NC_START ");
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_NON_CAPTURE_GROUP_START:
      {
        JERRY_DLOG ("NC_START ");
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END:
      {
        JERRY_DLOG ("N");
        /* FALLTHRU */
      }
      case RE_OP_NON_CAPTURE_GREEDY_GROUP_END:
      {
        JERRY_DLOG ("G_NC_END ");
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_SAVE_AT_START:
      {
        JERRY_DLOG ("RE_START ");
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_SAVE_AND_MATCH:
      {
        JERRY_DLOG ("RE_END, ");
        break;
      }
      case RE_OP_GREEDY_ITERATOR:
      {
        JERRY_DLOG ("GREEDY_ITERATOR ");
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_NON_GREEDY_ITERATOR:
      {
        JERRY_DLOG ("NON_GREEDY_ITERATOR ");
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_PERIOD:
      {
        JERRY_DLOG ("PERIOD ");
        break;
      }
      case RE_OP_ALTERNATIVE:
      {
        JERRY_DLOG ("ALTERNATIVE ");
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_ASSERT_START:
      {
        JERRY_DLOG ("ASSERT_START ");
        break;
      }
      case RE_OP_ASSERT_END:
      {
        JERRY_DLOG ("ASSERT_END ");
        break;
      }
      case RE_OP_ASSERT_WORD_BOUNDARY:
      {
        JERRY_DLOG ("ASSERT_WORD_BOUNDARY ");
        break;
      }
      case RE_OP_ASSERT_NOT_WORD_BOUNDARY:
      {
        JERRY_DLOG ("ASSERT_NOT_WORD_BOUNDARY ");
        break;
      }
      case RE_OP_LOOKAHEAD_POS:
      {
        JERRY_DLOG ("LOOKAHEAD_POS ");
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_LOOKAHEAD_NEG:
      {
        JERRY_DLOG ("LOOKAHEAD_NEG ");
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_BACKREFERENCE:
      {
        JERRY_DLOG ("BACKREFERENCE ");
        JERRY_DLOG ("%d, ", re_get_value (&cursor_p));
        break;
      }
      case RE_OP_INV_CHAR_CLASS:
      {
        JERRY_DLOG ("INV_");
        /* FALLTHRU */
      }
      case RE_OP_CHAR_CLASS:
      {
        JERRY_DLOG ("CHAR_CLASS ");
        /* The range count is followed by that many (start, end) pairs */
        uint32_t ranges_left = re_get_value (&cursor_p);
        JERRY_DLOG ("%d", ranges_left);
        for (; ranges_left != 0; ranges_left--)
        {
          JERRY_DLOG (" %d", re_get_value (&cursor_p));
          JERRY_DLOG ("-%d", re_get_value (&cursor_p));
        }
        JERRY_DLOG (", ");
        break;
      }
      default:
      {
        JERRY_DLOG ("UNKNOWN(%d), ", (uint32_t) opcode);
        break;
      }
    }

    opcode = re_get_opcode (&cursor_p);
  }

  JERRY_DLOG ("EOF\n");
} /* regexp_dump_bytecode */
#endif /* JERRY_ENABLE_LOG */
|
||||
|
||||
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
|
||||
@@ -0,0 +1,108 @@
|
||||
/* Copyright 2015 Samsung Electronics Co., Ltd.
|
||||
* Copyright 2015 University of Szeged.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef RE_COMPILER_H
|
||||
#define RE_COMPILER_H
|
||||
|
||||
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN
|
||||
|
||||
#include "ecma-globals.h"
|
||||
#include "re-parser.h"
|
||||
|
||||
/* RegExp opcodes
 * The order of the group opcodes is important, because RE_IS_CAPTURE_GROUP is based on it.
 * Change it carefully. The capture opcodes have to come first.
 */
#define RE_OP_EOF                                      0

/* Capture group opcodes */
#define RE_OP_CAPTURE_GROUP_START                      1
#define RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START          2
#define RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START      3
#define RE_OP_CAPTURE_GREEDY_GROUP_END                 4
#define RE_OP_CAPTURE_NON_GREEDY_GROUP_END             5

/* Non-capture group opcodes */
#define RE_OP_NON_CAPTURE_GROUP_START                  6
#define RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START      7
#define RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START  8
#define RE_OP_NON_CAPTURE_GREEDY_GROUP_END             9
#define RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END         10

/* Remaining opcodes */
#define RE_OP_MATCH                                    11
#define RE_OP_CHAR                                     12
#define RE_OP_SAVE_AT_START                            13
#define RE_OP_SAVE_AND_MATCH                           14
#define RE_OP_PERIOD                                   15
#define RE_OP_ALTERNATIVE                              16
#define RE_OP_GREEDY_ITERATOR                          17
#define RE_OP_NON_GREEDY_ITERATOR                      18
#define RE_OP_ASSERT_START                             19
#define RE_OP_ASSERT_END                               20
#define RE_OP_ASSERT_WORD_BOUNDARY                     21
#define RE_OP_ASSERT_NOT_WORD_BOUNDARY                 22
#define RE_OP_LOOKAHEAD_POS                            23
#define RE_OP_LOOKAHEAD_NEG                            24
#define RE_OP_BACKREFERENCE                            25
#define RE_OP_CHAR_CLASS                               26
#define RE_OP_INV_CHAR_CLASS                           27
|
||||
|
||||
#define RE_COMPILE_RECURSION_LIMIT 100
|
||||
|
||||
#define RE_IS_CAPTURE_GROUP(x) (((x) < RE_OP_NON_CAPTURE_GROUP_START) ? 1 : 0)
|
||||
|
||||
typedef uint8_t re_opcode_t; /* type of RegExp opcodes */
|
||||
typedef uint8_t re_bytecode_t; /* type of standard bytecode elements (ex.: opcode parameters) */
|
||||
|
||||
/**
|
||||
* Context of RegExp bytecode container
|
||||
*
|
||||
* FIXME:
|
||||
* Add comments with description of the structure members
|
||||
* (the member notes below are tentative and need to be confirmed against re-compiler.cpp)
*/
|
||||
typedef struct
|
||||
{
|
||||
re_bytecode_t *block_start_p; /* presumably the first byte of the bytecode buffer - TODO confirm */
|
||||
re_bytecode_t *block_end_p; /* presumably the end of the bytecode buffer - TODO confirm */
|
||||
re_bytecode_t *current_p; /* presumably the current write position inside the buffer - TODO confirm */
|
||||
} re_bytecode_ctx_t;
|
||||
|
||||
/**
|
||||
* Context of RegExp compiler
|
||||
*
|
||||
* FIXME:
|
||||
* Add comments with description of the structure members
|
||||
* (the member notes below are tentative and need to be confirmed against re-compiler.cpp)
*/
|
||||
typedef struct
|
||||
{
|
||||
uint8_t flags; /* presumably the RegExp flags handed to re_compile_bytecode - TODO confirm */
|
||||
uint32_t recursion_depth; /* presumably checked against RE_COMPILE_RECURSION_LIMIT - TODO confirm */
|
||||
uint32_t num_of_captures; /* presumably the number of capture groups seen so far - TODO confirm */
|
||||
uint32_t num_of_non_captures; /* presumably the number of non-capture groups seen so far - TODO confirm */
|
||||
uint32_t highest_backref; /* presumably the largest backreference index used in the pattern - TODO confirm */
|
||||
re_bytecode_ctx_t *bytecode_ctx_p; /* bytecode container context */
|
||||
re_token_t current_token; /* token storage, of the type filled in by re_parse_next_token */
|
||||
re_parser_ctx_t *parser_ctx_p; /* RegExp parser context */
|
||||
} re_compiler_ctx_t;
|
||||
|
||||
/* Compile the pattern string with the given flags.
 * NOTE(review): presumably the resulting bytecode is stored through bytecode_p - confirm in re-compiler.cpp. */
ecma_completion_value_t
|
||||
re_compile_bytecode (ecma_property_t *bytecode_p, ecma_string_t *pattern_str_p, uint8_t flags);
|
||||
|
||||
/* Read an opcode through the bytecode cursor *bc_p.
 * NOTE(review): presumably advances *bc_p past the opcode - confirm in re-compiler.cpp. */
re_opcode_t
|
||||
re_get_opcode (re_bytecode_t **bc_p);
|
||||
|
||||
/* Read an opcode operand through the bytecode cursor *bc_p.
 * The bytecode dumper calls this back-to-back to print consecutive operands,
 * so each call presumably advances *bc_p past the value it returns - TODO confirm. */
uint32_t
|
||||
re_get_value (re_bytecode_t **bc_p);
|
||||
|
||||
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
|
||||
#endif /* RE_COMPILER_H */
|
||||
@@ -0,0 +1,808 @@
|
||||
/* Copyright 2015 Samsung Electronics Co., Ltd.
|
||||
* Copyright 2015 University of Szeged.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "ecma-exceptions.h"
|
||||
#include "ecma-globals.h"
|
||||
#include "ecma-helpers.h"
|
||||
#include "ecma-try-catch-macro.h"
|
||||
#include "jrt-libc-includes.h"
|
||||
#include "re-parser.h"
|
||||
#include "syntax-errors.h"
|
||||
|
||||
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN
|
||||
|
||||
/* FIXME: change it, when unicode support would be implemented */
/*
 * Peek at the character 'lookup' positions after str_p without consuming it.
 * Evaluates to '\0' when 'lookup' is at or beyond the zero-terminated length of str_p.
 *
 * Both arguments are parenthesized, so a dereferenced pointer argument is indexed
 * as a whole instead of the index binding to the inner pointer first.
 * The arguments are evaluated more than once: they must be free of side effects.
 */
#define RE_LOOKUP(str_p, lookup) (ecma_zt_string_length (str_p) > (lookup) ? (str_p)[(lookup)] : '\0')

/* FIXME: change it, when unicode support would be implemented */
/* Move the pattern pointer str_p forward by 'advance' characters. */
#define RE_ADVANCE(str_p, advance) do { (str_p) += (advance); } while (0)
|
||||
|
||||
static ecma_char_t
|
||||
get_ecma_char (ecma_char_t** char_p)
|
||||
{
|
||||
/* FIXME: change to string iterator with unicode support, when it would be implemented */
|
||||
ecma_char_t ch = **char_p;
|
||||
RE_ADVANCE (*char_p, 1);
|
||||
return ch;
|
||||
} /* get_ecma_char */
|
||||
|
||||
/**
|
||||
* Parse RegExp iterators
|
||||
*
|
||||
* @return completion value
|
||||
* Returned value must be freed with ecma_free_completion_value
|
||||
*/
|
||||
static ecma_completion_value_t
|
||||
parse_re_iterator (ecma_char_t *pattern_p, /**< RegExp pattern */
|
||||
re_token_t *re_token_p, /**< output token */
|
||||
uint32_t lookup, /**< size of lookup */
|
||||
uint32_t *advance_p) /**< output length of current advance */
|
||||
{
|
||||
ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();
|
||||
|
||||
ecma_char_t ch0 = RE_LOOKUP (pattern_p, lookup);
|
||||
ecma_char_t ch1 = RE_LOOKUP (pattern_p, lookup + 1);
|
||||
|
||||
switch (ch0)
|
||||
{
|
||||
case '?':
|
||||
{
|
||||
re_token_p->qmin = 0;
|
||||
re_token_p->qmax = 1;
|
||||
if (ch1 == '?')
|
||||
{
|
||||
*advance_p = 2;
|
||||
re_token_p->greedy = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
*advance_p = 1;
|
||||
re_token_p->greedy = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case '*':
|
||||
{
|
||||
re_token_p->qmin = 0;
|
||||
re_token_p->qmax = RE_ITERATOR_INFINITE;
|
||||
if (ch1 == '?')
|
||||
{
|
||||
*advance_p = 2;
|
||||
re_token_p->greedy = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
*advance_p = 1;
|
||||
re_token_p->greedy = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case '+':
|
||||
{
|
||||
re_token_p->qmin = 1;
|
||||
re_token_p->qmax = RE_ITERATOR_INFINITE;
|
||||
if (ch1 == '?')
|
||||
{
|
||||
*advance_p = 2;
|
||||
re_token_p->greedy = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
*advance_p = 1;
|
||||
re_token_p->greedy = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case '{':
|
||||
{
|
||||
uint32_t qmin = 0;
|
||||
uint32_t qmax = RE_ITERATOR_INFINITE;
|
||||
uint32_t digits = 0;
|
||||
while (true)
|
||||
{
|
||||
(*advance_p)++;
|
||||
ch1 = RE_LOOKUP (pattern_p, lookup + *advance_p);
|
||||
|
||||
if (isdigit (ch1))
|
||||
{
|
||||
if (digits >= ECMA_NUMBER_MAX_DIGITS)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: too many digits.");
|
||||
return ret_value;
|
||||
}
|
||||
digits++;
|
||||
qmin = qmin * 10 + ecma_char_hex_to_int (ch1);
|
||||
}
|
||||
else if (ch1 == ',')
|
||||
{
|
||||
if (qmax != RE_ITERATOR_INFINITE)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: double comma.");
|
||||
return ret_value;
|
||||
}
|
||||
if ((RE_LOOKUP (pattern_p, lookup + *advance_p + 1)) == '}')
|
||||
{
|
||||
if (digits == 0)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: missing digits.");
|
||||
return ret_value;
|
||||
}
|
||||
|
||||
re_token_p->qmin = qmin;
|
||||
re_token_p->qmax = RE_ITERATOR_INFINITE;
|
||||
*advance_p += 2;
|
||||
break;
|
||||
}
|
||||
qmax = qmin;
|
||||
qmin = 0;
|
||||
digits = 0;
|
||||
}
|
||||
else if (ch1 == '}')
|
||||
{
|
||||
if (digits == 0)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: missing digits.");
|
||||
return ret_value;
|
||||
}
|
||||
|
||||
if (qmax != RE_ITERATOR_INFINITE)
|
||||
{
|
||||
re_token_p->qmin = qmax;
|
||||
re_token_p->qmax = qmin;
|
||||
}
|
||||
else
|
||||
{
|
||||
re_token_p->qmin = qmin;
|
||||
re_token_p->qmax = qmin;
|
||||
}
|
||||
|
||||
*advance_p += 1;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: unknown char.");
|
||||
return ret_value;
|
||||
}
|
||||
}
|
||||
|
||||
if ((RE_LOOKUP (pattern_p, lookup + *advance_p)) == '?')
|
||||
{
|
||||
re_token_p->greedy = false;
|
||||
*advance_p += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
re_token_p->greedy = true;
|
||||
}
|
||||
break;
|
||||
|
||||
JERRY_UNREACHABLE ();
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
re_token_p->qmin = 1;
|
||||
re_token_p->qmax = 1;
|
||||
re_token_p->greedy = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
JERRY_ASSERT (ecma_is_completion_value_empty (ret_value));
|
||||
|
||||
if (re_token_p->qmin > re_token_p->qmax)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: qmin > qmax.");
|
||||
}
|
||||
|
||||
return ret_value;
|
||||
} /* parse_re_iterator */
|
||||
|
||||
/**
|
||||
* Count the number of groups in pattern
|
||||
*/
|
||||
static void
|
||||
re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
|
||||
{
|
||||
ecma_char_t *pattern_p = parser_ctx_p->pattern_start_p;
|
||||
ecma_char_t ch1;
|
||||
int char_class_in = 0;
|
||||
parser_ctx_p->num_of_groups = 0;
|
||||
|
||||
ch1 = get_ecma_char (&pattern_p);
|
||||
while (ch1 != '\0')
|
||||
{
|
||||
ecma_char_t ch0 = ch1;
|
||||
ch1 = get_ecma_char (&pattern_p);
|
||||
switch (ch0)
|
||||
{
|
||||
case '\\':
|
||||
{
|
||||
ch1 = get_ecma_char (&pattern_p);
|
||||
break;
|
||||
}
|
||||
case '[':
|
||||
{
|
||||
char_class_in++;
|
||||
break;
|
||||
}
|
||||
case ']':
|
||||
{
|
||||
if (!char_class_in)
|
||||
{
|
||||
char_class_in--;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case '(':
|
||||
{
|
||||
if (ch1 != '?' && !char_class_in)
|
||||
{
|
||||
parser_ctx_p->num_of_groups++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} /* re_count_num_of_groups */
|
||||
|
||||
/**
|
||||
* Read the input pattern and parse the range of character class
|
||||
*
|
||||
* @return completion value
|
||||
* Returned value must be freed with ecma_free_completion_value
|
||||
*/
|
||||
ecma_completion_value_t
|
||||
re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
|
||||
re_char_class_callback append_char_class, /**< callback function,
|
||||
* which adds the char-ranges
|
||||
* to the bytecode */
|
||||
void* re_ctx_p, /**< regexp compiler context */
|
||||
re_token_t *out_token_p) /**< output token */
|
||||
{
|
||||
ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();
|
||||
ecma_char_t **pattern_p = &(parser_ctx_p->current_char_p);
|
||||
|
||||
out_token_p->qmax = out_token_p->qmin = 1;
|
||||
ecma_char_t start = RE_CHAR_UNDEF;
|
||||
bool is_range = false;
|
||||
parser_ctx_p->num_of_classes = 0;
|
||||
|
||||
do
|
||||
{
|
||||
ecma_char_t ch = get_ecma_char (pattern_p);
|
||||
if (ch == ']')
|
||||
{
|
||||
if (start != RE_CHAR_UNDEF)
|
||||
{
|
||||
append_char_class (re_ctx_p, start, start);
|
||||
}
|
||||
break;
|
||||
}
|
||||
else if (ch == '-')
|
||||
{
|
||||
if (start != RE_CHAR_UNDEF && !is_range && RE_LOOKUP (*pattern_p, 0) != ']')
|
||||
{
|
||||
is_range = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if (ch == '\\')
|
||||
{
|
||||
ch = get_ecma_char (pattern_p);
|
||||
|
||||
if (ch == 'b')
|
||||
{
|
||||
ch = RE_CONTROL_CHAR_BEL;
|
||||
}
|
||||
else if (ch == 'f')
|
||||
{
|
||||
ch = RE_CONTROL_CHAR_FF;
|
||||
}
|
||||
else if (ch == 'n')
|
||||
{
|
||||
ch = RE_CONTROL_CHAR_EOL;
|
||||
}
|
||||
else if (ch == 't')
|
||||
{
|
||||
ch = RE_CONTROL_CHAR_TAB;
|
||||
}
|
||||
else if (ch == 'r')
|
||||
{
|
||||
ch = RE_CONTROL_CHAR_CR;
|
||||
}
|
||||
else if (ch == 'v')
|
||||
{
|
||||
ch = RE_CONTROL_CHAR_VT;
|
||||
}
|
||||
else if (ch == 'c')
|
||||
{
|
||||
ch = get_ecma_char (pattern_p);
|
||||
if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
|
||||
{
|
||||
ch = (ch % 32);
|
||||
}
|
||||
else
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "invalid regexp control escape");
|
||||
return ret_value;
|
||||
}
|
||||
}
|
||||
else if (ch == 'x')
|
||||
{
|
||||
/* FIXME: get unicode char from hex-digits */
|
||||
/* ch = ...; */
|
||||
}
|
||||
else if (ch == 'u')
|
||||
{
|
||||
/* FIXME: get unicode char from digits */
|
||||
/* ch = ...; */
|
||||
}
|
||||
else if (ch == 'd')
|
||||
{
|
||||
/* append digits from '0' to '9'. */
|
||||
append_char_class (re_ctx_p, 0x0030UL, 0x0039UL);
|
||||
ch = RE_CHAR_UNDEF;
|
||||
}
|
||||
else if (ch == 'D')
|
||||
{
|
||||
append_char_class (re_ctx_p, 0x0000UL, 0x002FUL);
|
||||
append_char_class (re_ctx_p, 0x003AUL, 0xFFFFUL);
|
||||
ch = RE_CHAR_UNDEF;
|
||||
}
|
||||
else if (ch == 's')
|
||||
{
|
||||
append_char_class (re_ctx_p, 0x0009UL, 0x000DUL);
|
||||
append_char_class (re_ctx_p, 0x0020UL, 0x0020UL);
|
||||
append_char_class (re_ctx_p, 0x00A0UL, 0x00A0UL);
|
||||
append_char_class (re_ctx_p, 0x1680UL, 0x1680UL);
|
||||
append_char_class (re_ctx_p, 0x180EUL, 0x180EUL);
|
||||
append_char_class (re_ctx_p, 0x2000UL, 0x200AUL);
|
||||
append_char_class (re_ctx_p, 0x2028UL, 0x2029UL);
|
||||
append_char_class (re_ctx_p, 0x202FUL, 0x202FUL);
|
||||
append_char_class (re_ctx_p, 0x205FUL, 0x205FUL);
|
||||
append_char_class (re_ctx_p, 0x3000UL, 0x3000UL);
|
||||
append_char_class (re_ctx_p, 0xFEFFUL, 0xFEFFUL);
|
||||
ch = RE_CHAR_UNDEF;
|
||||
}
|
||||
else if (ch == 'S')
|
||||
{
|
||||
append_char_class (re_ctx_p, 0x0000UL, 0x0008UL);
|
||||
append_char_class (re_ctx_p, 0x000EUL, 0x001FUL);
|
||||
append_char_class (re_ctx_p, 0x0021UL, 0x009FUL);
|
||||
append_char_class (re_ctx_p, 0x00A1UL, 0x167FUL);
|
||||
append_char_class (re_ctx_p, 0x1681UL, 0x180DUL);
|
||||
append_char_class (re_ctx_p, 0x180FUL, 0x1FFFUL);
|
||||
append_char_class (re_ctx_p, 0x200BUL, 0x2027UL);
|
||||
append_char_class (re_ctx_p, 0x202AUL, 0x202EUL);
|
||||
append_char_class (re_ctx_p, 0x2030UL, 0x205EUL);
|
||||
append_char_class (re_ctx_p, 0x2060UL, 0x2FFFUL);
|
||||
append_char_class (re_ctx_p, 0x3001UL, 0xFEFEUL);
|
||||
append_char_class (re_ctx_p, 0xFF00UL, 0xFFFFUL);
|
||||
ch = RE_CHAR_UNDEF;
|
||||
}
|
||||
else if (ch == 'w')
|
||||
{
|
||||
append_char_class (re_ctx_p, 0x0030UL, 0x0039UL);
|
||||
append_char_class (re_ctx_p, 0x0041UL, 0x005AUL);
|
||||
append_char_class (re_ctx_p, 0x005FUL, 0x005FUL);
|
||||
append_char_class (re_ctx_p, 0x0061UL, 0x007AUL);
|
||||
ch = RE_CHAR_UNDEF;
|
||||
}
|
||||
else if (ch == 'W')
|
||||
{
|
||||
append_char_class (re_ctx_p, 0x0000UL, 0x002FUL);
|
||||
append_char_class (re_ctx_p, 0x003AUL, 0x0040UL);
|
||||
append_char_class (re_ctx_p, 0x005BUL, 0x005EUL);
|
||||
append_char_class (re_ctx_p, 0x0060UL, 0x0060UL);
|
||||
append_char_class (re_ctx_p, 0x007BUL, 0xFFFFUL);
|
||||
ch = RE_CHAR_UNDEF;
|
||||
}
|
||||
else if (isdigit (ch))
|
||||
{
|
||||
if (ch != '\0' || isdigit (RE_LOOKUP (*pattern_p, 1)))
|
||||
{
|
||||
/* FIXME: octal support */
|
||||
}
|
||||
}
|
||||
/* FIXME: depends on the unicode support
|
||||
else if (!jerry_unicode_identifier (ch))
|
||||
{
|
||||
JERRY_ERROR_MSG ("RegExp escape pattern error. (Char class)");
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
if (ch == RE_CHAR_UNDEF)
|
||||
{
|
||||
if (start != RE_CHAR_UNDEF)
|
||||
{
|
||||
if (is_range)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "invalid character class range");
|
||||
return ret_value;
|
||||
}
|
||||
else
|
||||
{
|
||||
append_char_class (re_ctx_p, start, start);
|
||||
start = RE_CHAR_UNDEF;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (start != RE_CHAR_UNDEF)
|
||||
{
|
||||
if (is_range)
|
||||
{
|
||||
if (start > ch)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "invalid character class range");
|
||||
return ret_value;
|
||||
}
|
||||
else
|
||||
{
|
||||
append_char_class (re_ctx_p, start, ch);
|
||||
start = RE_CHAR_UNDEF;
|
||||
is_range = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
append_char_class (re_ctx_p, start, start);
|
||||
start = ch;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
start = ch;
|
||||
}
|
||||
}
|
||||
}
|
||||
while (true);
|
||||
|
||||
uint32_t advance = 0;
|
||||
ECMA_TRY_CATCH (empty,
|
||||
parse_re_iterator (parser_ctx_p->current_char_p,
|
||||
out_token_p,
|
||||
0,
|
||||
&advance),
|
||||
ret_value);
|
||||
RE_ADVANCE (parser_ctx_p->current_char_p, advance);
|
||||
ECMA_FINALIZE (empty);
|
||||
|
||||
return ret_value;
|
||||
} /* re_parse_char_class */
|
||||
|
||||
/**
|
||||
* Read the input pattern and parse the next token for the RegExp compiler
|
||||
*
|
||||
* @return completion value
|
||||
* Returned value must be freed with ecma_free_completion_value
|
||||
*/
|
||||
ecma_completion_value_t
|
||||
re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */
|
||||
re_token_t *out_token_p) /**< output token */
|
||||
{
|
||||
ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();
|
||||
uint32_t advance = 0;
|
||||
ecma_char_t ch0 = *(parser_ctx_p->current_char_p);
|
||||
|
||||
switch (ch0)
|
||||
{
|
||||
case '|':
|
||||
{
|
||||
advance = 1;
|
||||
out_token_p->type = RE_TOK_ALTERNATIVE;
|
||||
break;
|
||||
}
|
||||
case '^':
|
||||
{
|
||||
advance = 1;
|
||||
out_token_p->type = RE_TOK_ASSERT_START;
|
||||
break;
|
||||
}
|
||||
case '$':
|
||||
{
|
||||
advance = 1;
|
||||
out_token_p->type = RE_TOK_ASSERT_END;
|
||||
break;
|
||||
}
|
||||
case '.':
|
||||
{
|
||||
ECMA_TRY_CATCH (empty,
|
||||
parse_re_iterator (parser_ctx_p->current_char_p,
|
||||
out_token_p,
|
||||
1,
|
||||
&advance),
|
||||
ret_value);
|
||||
advance += 1;
|
||||
out_token_p->type = RE_TOK_PERIOD;
|
||||
ECMA_FINALIZE (empty);
|
||||
break;
|
||||
}
|
||||
case '\\':
|
||||
{
|
||||
advance = 2;
|
||||
out_token_p->type = RE_TOK_CHAR;
|
||||
ecma_char_t ch1 = RE_LOOKUP (parser_ctx_p->current_char_p, 1);
|
||||
|
||||
if (ch1 == 'b')
|
||||
{
|
||||
out_token_p->type = RE_TOK_ASSERT_WORD_BOUNDARY;
|
||||
}
|
||||
else if (ch1 == 'B')
|
||||
{
|
||||
out_token_p->type = RE_TOK_ASSERT_NOT_WORD_BOUNDARY;
|
||||
}
|
||||
else if (ch1 == 'f')
|
||||
{
|
||||
out_token_p->value = RE_CONTROL_CHAR_FF;
|
||||
}
|
||||
else if (ch1 == 'n')
|
||||
{
|
||||
out_token_p->value = RE_CONTROL_CHAR_EOL;
|
||||
}
|
||||
else if (ch1 == 't')
|
||||
{
|
||||
out_token_p->value = RE_CONTROL_CHAR_TAB;
|
||||
}
|
||||
else if (ch1 == 'r')
|
||||
{
|
||||
out_token_p->value = RE_CONTROL_CHAR_CR;
|
||||
}
|
||||
else if (ch1 == 'v')
|
||||
{
|
||||
out_token_p->value = RE_CONTROL_CHAR_VT;
|
||||
}
|
||||
else if (ch1 == 'c')
|
||||
{
|
||||
ecma_char_t ch2 = RE_LOOKUP (parser_ctx_p->current_char_p, 2);
|
||||
if ((ch2 >= 'A' && ch2 <= 'Z') || (ch2 >= 'a' && ch2 <= 'z'))
|
||||
{
|
||||
advance = 3;
|
||||
out_token_p->type = RE_TOK_CHAR;
|
||||
out_token_p->value = (ch2 % 32);
|
||||
}
|
||||
else
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "invalid regexp control escape");
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (ch1 == 'x'
|
||||
&& isxdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 2))
|
||||
&& isxdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 3)))
|
||||
{
|
||||
advance = 4;
|
||||
out_token_p->type = RE_TOK_CHAR;
|
||||
/* FIXME: get unicode char from hex-digits */
|
||||
/* result.value = ...; */
|
||||
}
|
||||
else if (ch1 == 'u'
|
||||
&& isxdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 2))
|
||||
&& isxdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 3))
|
||||
&& isxdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 4))
|
||||
&& isxdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 5)))
|
||||
{
|
||||
advance = 4;
|
||||
out_token_p->type = RE_TOK_CHAR;
|
||||
/* FIXME: get unicode char from digits */
|
||||
/* result.value = ...; */
|
||||
}
|
||||
else if (ch1 == 'd')
|
||||
{
|
||||
advance = 2;
|
||||
out_token_p->type = RE_TOK_DIGIT;
|
||||
}
|
||||
else if (ch1 == 'D')
|
||||
{
|
||||
advance = 2;
|
||||
out_token_p->type = RE_TOK_NOT_DIGIT;
|
||||
}
|
||||
else if (ch1 == 's')
|
||||
{
|
||||
advance = 2;
|
||||
out_token_p->type = RE_TOK_WHITE;
|
||||
}
|
||||
else if (ch1 == 'S')
|
||||
{
|
||||
advance = 2;
|
||||
out_token_p->type = RE_TOK_NOT_WHITE;
|
||||
}
|
||||
else if (ch1 == 'w')
|
||||
{
|
||||
advance = 2;
|
||||
out_token_p->type = RE_TOK_WORD_CHAR;
|
||||
}
|
||||
else if (ch1 == 'W')
|
||||
{
|
||||
advance = 2;
|
||||
out_token_p->type = RE_TOK_NOT_WORD_CHAR;
|
||||
}
|
||||
else if (isdigit (ch1))
|
||||
{
|
||||
if (ch1 == '0')
|
||||
{
|
||||
if (isdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 2)))
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp escape pattern error.");
|
||||
break;
|
||||
}
|
||||
|
||||
advance = 2;
|
||||
out_token_p->value = RE_CONTROL_CHAR_NUL;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (parser_ctx_p->num_of_groups == -1)
|
||||
{
|
||||
re_count_num_of_groups (parser_ctx_p);
|
||||
}
|
||||
|
||||
if (parser_ctx_p->num_of_groups)
|
||||
{
|
||||
uint32_t number = 0;
|
||||
int index = 0;
|
||||
advance = 0;
|
||||
|
||||
do
|
||||
{
|
||||
if (index >= RE_MAX_RE_DECESC_DIGITS)
|
||||
{
|
||||
ret_value = ecma_raise_syntax_error ((const ecma_char_t *)
|
||||
"RegExp escape pattern error: decimal escape too long.");
|
||||
return ret_value;
|
||||
}
|
||||
|
||||
advance++;
|
||||
ecma_char_t digit = RE_LOOKUP (parser_ctx_p->current_char_p, advance);
|
||||
if (!isdigit (digit))
|
||||
{
|
||||
break;
|
||||
}
|
||||
number = number * 10 + ecma_char_hex_to_int (digit);
|
||||
index++;
|
||||
}
|
||||
while (true);
|
||||
|
||||
if ((int) number <= parser_ctx_p->num_of_groups)
|
||||
{
|
||||
out_token_p->type = RE_TOK_BACKREFERENCE;
|
||||
}
|
||||
|
||||
out_token_p->value = number;
|
||||
}
|
||||
else
|
||||
{
|
||||
out_token_p->value = ch1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
out_token_p->value = ch1;
|
||||
}
|
||||
|
||||
uint32_t iter_adv = 0;
|
||||
ECMA_TRY_CATCH (empty,
|
||||
parse_re_iterator (parser_ctx_p->current_char_p,
|
||||
out_token_p,
|
||||
advance,
|
||||
&iter_adv),
|
||||
ret_value);
|
||||
advance += iter_adv;
|
||||
ECMA_FINALIZE (empty);
|
||||
break;
|
||||
}
|
||||
case '(':
|
||||
{
|
||||
if (RE_LOOKUP (parser_ctx_p->current_char_p, 1) == '?')
|
||||
{
|
||||
ecma_char_t ch2 = RE_LOOKUP (parser_ctx_p->current_char_p, 2);
|
||||
if (ch2 == '=')
|
||||
{
|
||||
/* (?= */
|
||||
advance = 3;
|
||||
out_token_p->type = RE_TOK_ASSERT_START_POS_LOOKAHEAD;
|
||||
}
|
||||
else if (ch2 == '!')
|
||||
{
|
||||
/* (?! */
|
||||
advance = 3;
|
||||
out_token_p->type = RE_TOK_ASSERT_START_NEG_LOOKAHEAD;
|
||||
}
|
||||
else if (ch2 == ':')
|
||||
{
|
||||
/* (?: */
|
||||
advance = 3;
|
||||
out_token_p->type = RE_TOK_START_NON_CAPTURE_GROUP;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* ( */
|
||||
advance = 1;
|
||||
out_token_p->type = RE_TOK_START_CAPTURE_GROUP;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ')':
|
||||
{
|
||||
ECMA_TRY_CATCH (empty,
|
||||
parse_re_iterator (parser_ctx_p->current_char_p,
|
||||
out_token_p,
|
||||
1,
|
||||
&advance),
|
||||
ret_value);
|
||||
advance += 1;
|
||||
out_token_p->type = RE_TOK_END_GROUP;
|
||||
ECMA_FINALIZE (empty);
|
||||
break;
|
||||
}
|
||||
case '[':
|
||||
{
|
||||
advance = 1;
|
||||
out_token_p->type = RE_TOK_START_CHAR_CLASS;
|
||||
if (RE_LOOKUP (parser_ctx_p->current_char_p, 1) == '^')
|
||||
{
|
||||
advance = 2;
|
||||
out_token_p->type = RE_TOK_START_INV_CHAR_CLASS;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ']':
|
||||
case '}':
|
||||
case '?':
|
||||
case '*':
|
||||
case '+':
|
||||
case '{':
|
||||
{
|
||||
JERRY_UNREACHABLE ();
|
||||
break;
|
||||
}
|
||||
case '\0':
|
||||
{
|
||||
advance = 0;
|
||||
out_token_p->type = RE_TOK_EOF;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
ECMA_TRY_CATCH (empty,
|
||||
parse_re_iterator (parser_ctx_p->current_char_p,
|
||||
out_token_p,
|
||||
1,
|
||||
&advance),
|
||||
ret_value);
|
||||
advance += 1;
|
||||
out_token_p->type = RE_TOK_CHAR;
|
||||
out_token_p->value = ch0;
|
||||
ECMA_FINALIZE (empty);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ecma_is_completion_value_empty (ret_value))
|
||||
{
|
||||
RE_ADVANCE (parser_ctx_p->current_char_p, advance);
|
||||
}
|
||||
|
||||
return ret_value;
|
||||
} /* re_parse_next_token */
|
||||
|
||||
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
|
||||
@@ -0,0 +1,91 @@
|
||||
/* Copyright 2015 Samsung Electronics Co., Ltd.
|
||||
* Copyright 2015 University of Szeged.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef RE_PARSER_H
|
||||
#define RE_PARSER_H
|
||||
|
||||
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN
|
||||
|
||||
#include "opcodes-dumper.h"
|
||||
|
||||
/* Type of the RE_TOK_* token identifiers below */
typedef uint8_t token_type_t;
|
||||
|
||||
#define RE_TOK_EOF 0 /* EOF */
|
||||
#define RE_TOK_BACKREFERENCE 1 /* \[0..9] */
|
||||
#define RE_TOK_CHAR 2 /* any character */
|
||||
#define RE_TOK_ALTERNATIVE 3 /* | */
|
||||
#define RE_TOK_ASSERT_START 4 /* ^ */
|
||||
#define RE_TOK_ASSERT_END 5 /* $ */
|
||||
#define RE_TOK_PERIOD 6 /* . */
|
||||
#define RE_TOK_START_CAPTURE_GROUP 7 /* ( */
|
||||
#define RE_TOK_START_NON_CAPTURE_GROUP 8 /* (?: */
|
||||
#define RE_TOK_END_GROUP 9 /* ')' */
|
||||
#define RE_TOK_ASSERT_START_POS_LOOKAHEAD 10 /* (?= */
|
||||
#define RE_TOK_ASSERT_START_NEG_LOOKAHEAD 11 /* (?! */
|
||||
#define RE_TOK_ASSERT_WORD_BOUNDARY 12 /* \b */
|
||||
#define RE_TOK_ASSERT_NOT_WORD_BOUNDARY 13 /* \B */
|
||||
#define RE_TOK_DIGIT 14 /* \d */
|
||||
#define RE_TOK_NOT_DIGIT 15 /* \D */
|
||||
#define RE_TOK_WHITE 16 /* \s */
|
||||
#define RE_TOK_NOT_WHITE 17 /* \S */
|
||||
#define RE_TOK_WORD_CHAR 18 /* \w */
|
||||
#define RE_TOK_NOT_WORD_CHAR 19 /* \W */
|
||||
#define RE_TOK_START_CHAR_CLASS 20 /* [ ] */
|
||||
#define RE_TOK_START_INV_CHAR_CLASS 21 /* [^ ] */
|
||||
|
||||
/* Upper bound stored in re_token_t.qmax for the unbounded quantifiers '*', '+' and '{n,}' */
#define RE_ITERATOR_INFINITE ((uint32_t)-1)
|
||||
/* Maximum number of digits re_parse_next_token accepts in a decimal escape */
#define RE_MAX_RE_DECESC_DIGITS 9
|
||||
|
||||
/* FIXME: depends on unicode support */
|
||||
/* Marker used by re_parse_char_class for "no character": no pending range start,
 * or an atom (such as \d) that has already been appended as ranges */
#define RE_CHAR_UNDEF ((ecma_char_t)-1)
|
||||
|
||||
#define RE_CONTROL_CHAR_NUL 0x0000 /* \0 */
|
||||
/* Despite the BEL in its name, 0x0008 is the backspace character, which is what \b means in a char class */
#define RE_CONTROL_CHAR_BEL 0x0008 /* \b */
|
||||
#define RE_CONTROL_CHAR_TAB 0x0009 /* \t */
|
||||
#define RE_CONTROL_CHAR_EOL 0x000a /* \n */
|
||||
#define RE_CONTROL_CHAR_VT 0x000b /* \v */
|
||||
#define RE_CONTROL_CHAR_FF 0x000c /* \f */
|
||||
#define RE_CONTROL_CHAR_CR 0x000d /* \r */
|
||||
|
||||
/* Token produced by the RegExp parser */
typedef struct
|
||||
{
|
||||
token_type_t type; /* one of the RE_TOK_* identifiers */
|
||||
uint32_t value; /* character code for RE_TOK_CHAR, parsed decimal number for a decimal escape */
|
||||
uint32_t qmin; /* minimum repetition count of the quantifier */
|
||||
uint32_t qmax; /* maximum repetition count, RE_ITERATOR_INFINITE when unbounded */
|
||||
bool greedy; /* false when the quantifier is followed by '?' */
|
||||
} re_token_t;
|
||||
|
||||
/* Context of the RegExp parser */
typedef struct
|
||||
{
|
||||
ecma_char_t *pattern_start_p; /* start of the pattern, read when the groups are counted */
|
||||
ecma_char_t *current_char_p; /* parse cursor, moved forward as tokens are consumed */
|
||||
int num_of_groups; /* number of capture groups; -1 means they have not been counted yet */
|
||||
uint32_t num_of_classes; /* reset to 0 by re_parse_char_class; presumably updated by the callback - TODO confirm */
|
||||
} re_parser_ctx_t;
|
||||
|
||||
/* Callback invoked by re_parse_char_class for every character range of a class;
 * a single character is reported with start equal to end. */
typedef void (*re_char_class_callback) (void *re_ctx_p, uint32_t start, uint32_t end);
|
||||
|
||||
/* Parse the body of a character class up to its closing ']' plus an optional quantifier,
 * reporting the ranges through append_char_class and the quantifier through out_token_p. */
ecma_completion_value_t
|
||||
re_parse_char_class (re_parser_ctx_t *parser_ctx_p,
|
||||
re_char_class_callback append_char_class,
|
||||
void *re_ctx_p, re_token_t *out_token_p);
|
||||
|
||||
/* Parse the token at the parser cursor into out_token_p and move the cursor past it. */
ecma_completion_value_t
|
||||
re_parse_next_token (re_parser_ctx_t *parser_ctx_p, re_token_t *out_token_p);
|
||||
|
||||
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
|
||||
#endif /* RE_PARSER_H */
|
||||
Reference in New Issue
Block a user