From 4ffcb4d4645cd8a9027cc2efa2fa77f99c396056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A1szl=C3=B3=20Lang=C3=B3?= Date: Thu, 25 Jun 2015 23:51:34 +0300 Subject: [PATCH] Add parser and compiler of regular expressions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JerryScript-DCO-1.0-Signed-off-by: Szilard Ledan szledan.u-szeged@partner.samsung.com JerryScript-DCO-1.0-Signed-off-by: László Langó llango.u-szeged@partner.samsung.com --- jerry-core/CMakeLists.txt | 3 + jerry-core/parser/regexp/re-compiler.cpp | 888 +++++++++++++++++++++++ jerry-core/parser/regexp/re-compiler.h | 108 +++ jerry-core/parser/regexp/re-parser.cpp | 808 +++++++++++++++++++++ jerry-core/parser/regexp/re-parser.h | 91 +++ 5 files changed, 1898 insertions(+) create mode 100644 jerry-core/parser/regexp/re-compiler.cpp create mode 100644 jerry-core/parser/regexp/re-compiler.h create mode 100644 jerry-core/parser/regexp/re-parser.cpp create mode 100644 jerry-core/parser/regexp/re-parser.h diff --git a/jerry-core/CMakeLists.txt b/jerry-core/CMakeLists.txt index b33115d3b..2a67fc140 100644 --- a/jerry-core/CMakeLists.txt +++ b/jerry-core/CMakeLists.txt @@ -102,6 +102,7 @@ project (JerryCore CXX C ASM) ${CMAKE_SOURCE_DIR}/jerry-core/ecma/operations ${CMAKE_SOURCE_DIR}/jerry-core/parser/js ${CMAKE_SOURCE_DIR}/jerry-core/parser/js/collections + ${CMAKE_SOURCE_DIR}/jerry-core/parser/regexp ${CMAKE_SOURCE_DIR}/jerry-core/jrt) # Third-party @@ -120,6 +121,7 @@ project (JerryCore CXX C ASM) file(GLOB SOURCE_CORE_ECMA_OPERATIONS ecma/operations/*.cpp) file(GLOB SOURCE_CORE_PARSER_JS parser/js/*.cpp) file(GLOB SOURCE_CORE_PARSER_JS_COLLECTIONS parser/js/collections/*.cpp) + file(GLOB SOURCE_CORE_PARSER_REGEXP parser/regexp/*.cpp) file(GLOB SOURCE_CORE_JRT jrt/*.cpp) set(SOURCE_CORE @@ -134,6 +136,7 @@ project (JerryCore CXX C ASM) ${SOURCE_CORE_ECMA_OPERATIONS} ${SOURCE_CORE_PARSER_JS} ${SOURCE_CORE_PARSER_JS_COLLECTIONS} + ${SOURCE_CORE_PARSER_REGEXP} 
${SOURCE_CORE_JRT}) # Per-option configuration diff --git a/jerry-core/parser/regexp/re-compiler.cpp b/jerry-core/parser/regexp/re-compiler.cpp new file mode 100644 index 000000000..f9f5145bc --- /dev/null +++ b/jerry-core/parser/regexp/re-compiler.cpp @@ -0,0 +1,888 @@ +/* Copyright 2015 Samsung Electronics Co., Ltd. + * Copyright 2015 University of Szeged. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ecma-exceptions.h" +#include "ecma-helpers.h" +#include "ecma-try-catch-macro.h" +#include "jrt-libc-includes.h" +#include "mem-heap.h" +#include "re-compiler.h" + +#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN + +/** + * FIXME: + * Add comments to macro definitions in the component + */ + +#define REGEXP_BYTECODE_BLOCK_SIZE 256UL +#define BYTECODE_LEN(bc_ctx_p) ((uint32_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p)) + +void +regexp_dump_bytecode (re_bytecode_ctx_t *bc_ctx); + +/** + * FIXME: + * Add missing 're' prefixes to the component's external and internal interfaces + */ + +/** + * Realloc the bytecode container + */ +static re_bytecode_t* +realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */ +{ + JERRY_ASSERT (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p >= 0); + size_t old_size = static_cast (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p); + JERRY_ASSERT (!bc_ctx_p->current_p && !bc_ctx_p->block_end_p && !bc_ctx_p->block_start_p); + + size_t new_block_size = old_size + 
REGEXP_BYTECODE_BLOCK_SIZE; + JERRY_ASSERT (bc_ctx_p->current_p - bc_ctx_p->block_start_p >= 0); + size_t current_ptr_offset = static_cast (bc_ctx_p->current_p - bc_ctx_p->block_start_p); + + re_bytecode_t *new_block_start_p = (re_bytecode_t *) mem_heap_alloc_block (new_block_size, + MEM_HEAP_ALLOC_SHORT_TERM); + if (bc_ctx_p->current_p) + { + memcpy (new_block_start_p, bc_ctx_p->block_start_p, static_cast (current_ptr_offset)); + mem_heap_free_block (bc_ctx_p->block_start_p); + } + bc_ctx_p->block_start_p = new_block_start_p; + bc_ctx_p->block_end_p = new_block_start_p + new_block_size; + bc_ctx_p->current_p = new_block_start_p + current_ptr_offset; + + return bc_ctx_p->current_p; +} /* realloc_regexp_bytecode_block */ + +/** + * Append a new bytecode to the and of the bytecode container + */ +static void +bytecode_list_append (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + re_bytecode_t *bytecode_p, /**< input bytecode */ + size_t length) /**< length of input */ +{ + JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE); + + re_bytecode_t *current_p = bc_ctx_p->current_p; + if (current_p + length > bc_ctx_p->block_end_p) + { + current_p = realloc_regexp_bytecode_block (bc_ctx_p); + } + + memcpy (current_p, bytecode_p, length); + bc_ctx_p->current_p += length; +} /* bytecode_list_append */ + +/** + * Insert a new bytecode to the bytecode container + */ +static void +bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + size_t offset, /**< distance from the start of the container */ + re_bytecode_t *bytecode_p, /**< input bytecode */ + size_t length) /**< length of input */ +{ + JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE); + + re_bytecode_t *current_p = bc_ctx_p->current_p; + if (current_p + length > bc_ctx_p->block_end_p) + { + realloc_regexp_bytecode_block (bc_ctx_p); + } + + re_bytecode_t *src_p = bc_ctx_p->block_start_p + offset; + if ((BYTECODE_LEN (bc_ctx_p) - offset) > 0) + { + re_bytecode_t *dest_p = 
src_p + length; + re_bytecode_t *tmp_block_start_p = (re_bytecode_t *) mem_heap_alloc_block ((BYTECODE_LEN (bc_ctx_p) - offset), + MEM_HEAP_ALLOC_SHORT_TERM); + memcpy (tmp_block_start_p, src_p, (size_t) (BYTECODE_LEN (bc_ctx_p) - offset)); + memcpy (dest_p, tmp_block_start_p, (size_t) (BYTECODE_LEN (bc_ctx_p) - offset)); + mem_heap_free_block (tmp_block_start_p); + } + memcpy (src_p, bytecode_p, length); + + bc_ctx_p->current_p += length; +} /* bytecode_list_insert */ + +/** + * Append a RegExp opcode + */ +static void +append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + re_opcode_t opcode) /**< input opcode */ +{ + bytecode_list_append (bc_ctx_p, (re_bytecode_t*) &opcode, sizeof (re_bytecode_t)); +} /* append_opcode */ + +/** + * Append a parameter of a RegExp opcode + */ +static void +append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + uint32_t value) /**< input value */ +{ + bytecode_list_append (bc_ctx_p, (re_bytecode_t*) &value, sizeof (uint32_t)); +} /* append_u32 */ + +/** + * Append a jump offset parameter of a RegExp opcode + */ +static void +append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + uint32_t value) /**< input value */ +{ + value += (uint32_t) (sizeof (uint32_t)); + append_u32 (bc_ctx_p, value); +} /* append_jump_offset */ + +/** + * Insert a RegExp opcode + */ +static void +insert_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + uint32_t offset, /**< distance from the start of the container */ + re_opcode_t opcode) /**< input opcode */ +{ + bytecode_list_insert (bc_ctx_p, offset, (re_bytecode_t*) &opcode, sizeof (re_bytecode_t)); +} /* insert_opcode */ + +/** + * Insert a parameter of a RegExp opcode + */ +static void +insert_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + uint32_t offset, /**< distance from the start of the container */ + uint32_t value) /**< input value */ +{ + bytecode_list_insert (bc_ctx_p, offset, 
(re_bytecode_t*) &value, sizeof (uint32_t)); +} /* insert_u32 */ + +/** + * Get a RegExp opcode + */ +re_opcode_t +re_get_opcode (re_bytecode_t **bc_p) /**< pointer to bytecode start */ +{ + re_bytecode_t bytecode = **bc_p; + (*bc_p) += sizeof (re_bytecode_t); + return (re_opcode_t) bytecode; +} /* get_opcode */ + +/** + * Get a parameter of a RegExp opcode + */ +uint32_t +re_get_value (re_bytecode_t **bc_p) /**< pointer to bytecode start */ +{ + uint32_t value = *((uint32_t*) *bc_p); + (*bc_p) += sizeof (uint32_t); + return value; +} /* get_value */ + +/** + * Callback function of character class generation + */ +static void +append_char_class (void* re_ctx_p, /**< RegExp compiler context */ + uint32_t start, /**< character class range from */ + uint32_t end) /**< character class range to */ +{ + /* FIXME: Handle ignore case flag and add unicode support. */ + re_compiler_ctx_t *ctx_p = (re_compiler_ctx_t*) re_ctx_p; + append_u32 (ctx_p->bytecode_ctx_p, start); + append_u32 (ctx_p->bytecode_ctx_p, end); + ctx_p->parser_ctx_p->num_of_classes++; +} /* append_char_class */ + +/** + * Insert simple atom iterator + */ +static void +insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + uint32_t new_atom_start_offset) /**< atom start offset */ +{ + uint32_t atom_code_length; + uint32_t offset; + uint32_t qmin, qmax; + + qmin = re_ctx_p->current_token.qmin; + qmax = re_ctx_p->current_token.qmax; + JERRY_ASSERT (qmin <= qmax); + + /* FIXME: optimize bytecode length. 
Store 0 rather than INF */ + + append_opcode (re_ctx_p->bytecode_ctx_p, RE_OP_MATCH); /* complete 'sub atom' */ + uint32_t bytecode_length = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p); + atom_code_length = (uint32_t) (bytecode_length - new_atom_start_offset); + + offset = new_atom_start_offset; + insert_u32 (re_ctx_p->bytecode_ctx_p, offset, atom_code_length); + insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmax); + insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmin); + if (re_ctx_p->current_token.greedy) + { + insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_GREEDY_ITERATOR); + } + else + { + insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_NON_GREEDY_ITERATOR); + } +} /* insert_simple_iterator */ + +/** + * Get the type of a group start + */ +static re_opcode_t +get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + bool is_capturable) /**< is capturabel group */ +{ + if (is_capturable) + { + if (re_ctx_p->current_token.qmin == 0) + { + if (re_ctx_p->current_token.greedy) + { + return RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START; + } + + return RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START; + } + + return RE_OP_CAPTURE_GROUP_START; + } + + if (re_ctx_p->current_token.qmin == 0) + { + if (re_ctx_p->current_token.greedy) + { + return RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START; + } + + return RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START; + } + + return RE_OP_NON_CAPTURE_GROUP_START; + + JERRY_UNREACHABLE (); + return 0; +} /* get_start_opcode_type */ + +/** + * Get the type of a group end + */ +static re_opcode_t +get_end_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + bool is_capturable) /**< is capturabel group */ +{ + if (is_capturable) + { + if (re_ctx_p->current_token.greedy) + { + return RE_OP_CAPTURE_GREEDY_GROUP_END; + } + + return RE_OP_CAPTURE_NON_GREEDY_GROUP_END; + } + + if (re_ctx_p->current_token.greedy) + { + return RE_OP_NON_CAPTURE_GREEDY_GROUP_END; + } + + return 
/**
 * Enclose the given bytecode to a group
 *
 * A group start header (opcode followed by the group index) is inserted at
 * group_start_offset, and a group end record (opcode, index, qmin, qmax, jump offset)
 * is appended to the end of the container. For the "zero" start opcodes an additional
 * 32 bit length value is inserted right behind the start header.
 */
static void
insert_into_group (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
                   uint32_t group_start_offset, /**< offset of group start */
                   uint32_t idx, /**< index of group */
                   bool is_capturable) /**< is capturable group */
{
  uint32_t qmin, qmax;
  re_opcode_t start_opcode = get_start_opcode_type (re_ctx_p, is_capturable);
  re_opcode_t end_opcode = get_end_opcode_type (re_ctx_p, is_capturable);
  uint32_t start_head_offset_len;

  qmin = re_ctx_p->current_token.qmin;
  qmax = re_ctx_p->current_token.qmax;
  JERRY_ASSERT (qmin <= qmax);

  /* Both inserts use the same offset: the index goes in first and the opcode second,
   * so the opcode ends up in front of the index. */
  start_head_offset_len = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
  insert_u32 (re_ctx_p->bytecode_ctx_p, group_start_offset, idx);
  insert_opcode (re_ctx_p->bytecode_ctx_p, group_start_offset, start_opcode);
  /* Number of bytes the start header has just added. */
  start_head_offset_len = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - start_head_offset_len;
  /* Group end record: opcode, index and the repeat bounds. */
  append_opcode (re_ctx_p->bytecode_ctx_p, end_opcode);
  append_u32 (re_ctx_p->bytecode_ctx_p, idx);
  append_u32 (re_ctx_p->bytecode_ctx_p, qmin);
  append_u32 (re_ctx_p->bytecode_ctx_p, qmax);

  /* From here on group_start_offset refers to the first byte behind the start header. */
  group_start_offset += start_head_offset_len;
  /* append_jump_offset adds the size of the offset field itself, so the stored value is the
   * distance between the first byte behind the start header and the end of this field. */
  append_jump_offset (re_ctx_p->bytecode_ctx_p,
                      BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - group_start_offset);

  if (start_opcode != RE_OP_CAPTURE_GROUP_START && start_opcode != RE_OP_NON_CAPTURE_GROUP_START)
  {
    /* The remaining start opcodes are the *_ZERO_GROUP_START ones. They get one more value
     * behind the start header: the number of bytes between the header and the current end
     * of the bytecode, measured before this value is inserted. */
    insert_u32 (re_ctx_p->bytecode_ctx_p,
                group_start_offset,
                BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - group_start_offset);
  }
} /* insert_into_group */
(re_ctx_p->bytecode_ctx_p, + group_start_offset, + BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - group_start_offset); + insert_into_group (re_ctx_p, group_start_offset, idx, is_capturable); +} /* insert_into_group_with_jump */ + +/** + * Parse alternatives + * + * @return completion value + * Returned value must be freed with ecma_free_completion_value + */ +static ecma_completion_value_t +parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + bool expect_eof) /**< expect end of file */ +{ + uint32_t idx; + re_bytecode_ctx_t *bc_ctx_p = re_ctx_p->bytecode_ctx_p; + ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); + + uint32_t alterantive_offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p); + + if (re_ctx_p->recursion_depth >= RE_COMPILE_RECURSION_LIMIT) + { + ret_value = ecma_raise_range_error ((const ecma_char_t *) "RegExp compiler recursion limit is exceeded."); + return ret_value; + } + re_ctx_p->recursion_depth++; + + while (true) + { + ECMA_TRY_CATCH (empty, + re_parse_next_token (re_ctx_p->parser_ctx_p, + &(re_ctx_p->current_token)), + ret_value); + ECMA_FINALIZE (empty); + if (!ecma_is_completion_value_empty (ret_value)) + { + return ret_value; /* error */ + } + uint32_t new_atom_start_offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p); + + switch (re_ctx_p->current_token.type) + { + case RE_TOK_START_CAPTURE_GROUP: + { + idx = re_ctx_p->num_of_captures++; + JERRY_DDLOG ("Compile a capture group start (idx: %d)\n", idx); + + ret_value = parse_alternative (re_ctx_p, false); + if (ecma_is_completion_value_empty (ret_value)) + { + insert_into_group (re_ctx_p, new_atom_start_offset, idx, true); + } + else + { + return ret_value; /* error */ + } + break; + } + case RE_TOK_START_NON_CAPTURE_GROUP: + { + idx = re_ctx_p->num_of_non_captures++; + JERRY_DDLOG ("Compile a non-capture group start (idx: %d)\n", idx); + + ret_value = parse_alternative (re_ctx_p, false); + if (ecma_is_completion_value_empty (ret_value)) + { + 
insert_into_group (re_ctx_p, new_atom_start_offset, idx, false); + } + else + { + return ret_value; /* error */ + } + break; + } + case RE_TOK_CHAR: + { + JERRY_DDLOG ("Compile character token: %c, qmin: %d, qmax: %d\n", + re_ctx_p->current_token.value, re_ctx_p->current_token.qmin, re_ctx_p->current_token.qmax); + + append_opcode (bc_ctx_p, RE_OP_CHAR); + append_u32 (bc_ctx_p, re_ctx_p->current_token.value); + + if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1)) + { + insert_simple_iterator (re_ctx_p, new_atom_start_offset); + } + break; + } + case RE_TOK_PERIOD: + { + JERRY_DDLOG ("Compile a period\n"); + append_opcode (bc_ctx_p, RE_OP_PERIOD); + + if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1)) + { + insert_simple_iterator (re_ctx_p, new_atom_start_offset); + } + break; + } + case RE_TOK_ALTERNATIVE: + { + JERRY_DDLOG ("Compile an alternative\n"); + insert_u32 (bc_ctx_p, alterantive_offset, BYTECODE_LEN (bc_ctx_p) - alterantive_offset); + append_opcode (bc_ctx_p, RE_OP_ALTERNATIVE); + alterantive_offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p); + break; + } + case RE_TOK_ASSERT_START: + { + JERRY_DDLOG ("Compile a start assertion\n"); + append_opcode (bc_ctx_p, RE_OP_ASSERT_START); + break; + } + case RE_TOK_ASSERT_END: + { + JERRY_DDLOG ("Compile an end assertion\n"); + append_opcode (bc_ctx_p, RE_OP_ASSERT_END); + break; + } + case RE_TOK_ASSERT_WORD_BOUNDARY: + { + JERRY_DDLOG ("Compile a word boundary assertion\n"); + append_opcode (bc_ctx_p, RE_OP_ASSERT_WORD_BOUNDARY); + break; + } + case RE_TOK_ASSERT_NOT_WORD_BOUNDARY: + { + JERRY_DDLOG ("Compile a not word boundary assertion\n"); + append_opcode (bc_ctx_p, RE_OP_ASSERT_NOT_WORD_BOUNDARY); + break; + } + case RE_TOK_ASSERT_START_POS_LOOKAHEAD: + { + JERRY_DDLOG ("Compile a positive lookahead assertion\n"); + idx = re_ctx_p->num_of_non_captures++; + append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_POS); + + ret_value = parse_alternative (re_ctx_p, 
false); + if (ecma_is_completion_value_empty (ret_value)) + { + append_opcode (bc_ctx_p, RE_OP_MATCH); + + insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); + } + else + { + return ret_value; /* error */ + } + break; + } + case RE_TOK_ASSERT_START_NEG_LOOKAHEAD: + { + JERRY_DDLOG ("Compile a negative lookahead assertion\n"); + idx = re_ctx_p->num_of_non_captures++; + append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_NEG); + + ret_value = parse_alternative (re_ctx_p, false); + if (ecma_is_completion_value_empty (ret_value)) + { + append_opcode (bc_ctx_p, RE_OP_MATCH); + + insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); + } + else + { + return ret_value; /* error */ + } + break; + } + case RE_TOK_BACKREFERENCE: + { + uint32_t backref = (uint32_t) re_ctx_p->current_token.value; + idx = re_ctx_p->num_of_non_captures++; + if (backref > re_ctx_p->highest_backref) + { + re_ctx_p->highest_backref = backref; + } + JERRY_DDLOG ("Compile a backreference: %d\n", backref); + append_opcode (bc_ctx_p, RE_OP_BACKREFERENCE); + append_u32 (bc_ctx_p, backref); + + insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); + break; + } + case RE_TOK_START_CHAR_CLASS: + case RE_TOK_START_INV_CHAR_CLASS: + { + JERRY_DDLOG ("Compile a character class\n"); + append_opcode (bc_ctx_p, + re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS + ? 
RE_OP_CHAR_CLASS + : RE_OP_INV_CHAR_CLASS); + uint32_t offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p); + + ECMA_TRY_CATCH (empty, + re_parse_char_class (re_ctx_p->parser_ctx_p, + append_char_class, + re_ctx_p, + &(re_ctx_p->current_token)), + ret_value); + insert_u32 (bc_ctx_p, offset, re_ctx_p->parser_ctx_p->num_of_classes); + + if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1)) + { + insert_simple_iterator (re_ctx_p, new_atom_start_offset); + } + ECMA_FINALIZE (empty); + break; + } + case RE_TOK_END_GROUP: + { + JERRY_DDLOG ("Compile a group end\n"); + + if (expect_eof) + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "Unexpected end of paren."); + } + else + { + insert_u32 (bc_ctx_p, alterantive_offset, BYTECODE_LEN (bc_ctx_p) - alterantive_offset); + re_ctx_p->recursion_depth--; + } + + return ret_value; + } + case RE_TOK_EOF: + { + if (!expect_eof) + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "Unexpected end of pattern."); + } + else + { + insert_u32 (bc_ctx_p, alterantive_offset, BYTECODE_LEN (bc_ctx_p) - alterantive_offset); + re_ctx_p->recursion_depth--; + } + + return ret_value; + } + default: + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "Unexpected RegExp token."); + return ret_value; + } + } + } + + JERRY_UNREACHABLE (); + return ret_value; +} /* parse_alternative */ + +/** + * Compilation of RegExp bytecode + * + * @return completion value + * Returned value must be freed with ecma_free_completion_value + */ +ecma_completion_value_t +re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */ + ecma_string_t *pattern_str_p, /**< pattern */ + uint8_t flags) /**< flags */ +{ + ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); + re_compiler_ctx_t re_ctx; + re_ctx.flags = flags; + re_ctx.highest_backref = 0; + re_ctx.num_of_non_captures = 0; + re_ctx.recursion_depth = 0; + + re_bytecode_ctx_t bc_ctx; + bc_ctx.block_start_p = NULL; + 
bc_ctx.block_end_p = NULL; + bc_ctx.current_p = NULL; + + re_ctx.bytecode_ctx_p = &bc_ctx; + + int32_t pattern_str_len = ecma_string_get_length (pattern_str_p); + MEM_DEFINE_LOCAL_ARRAY (pattern_start_p, pattern_str_len + 1, ecma_char_t); + ssize_t zt_str_size = (ssize_t) sizeof (ecma_char_t) * (pattern_str_len + 1); + ecma_string_to_zt_string (pattern_str_p, pattern_start_p, zt_str_size); + + re_parser_ctx_t parser_ctx; + parser_ctx.pattern_start_p = pattern_start_p; + parser_ctx.current_char_p = pattern_start_p; + parser_ctx.num_of_groups = -1; + re_ctx.parser_ctx_p = &parser_ctx; + + /* 1. Parse RegExp pattern */ + re_ctx.num_of_captures = 1; + append_opcode (&bc_ctx, RE_OP_SAVE_AT_START); + + ECMA_TRY_CATCH (empty, parse_alternative (&re_ctx, true), ret_value); + + /* 2. Check for invalid backreference */ + if (re_ctx.highest_backref >= re_ctx.num_of_captures) + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "Invalid backreference.\n"); + } + else + { + append_opcode (&bc_ctx, RE_OP_SAVE_AND_MATCH); + append_opcode (&bc_ctx, RE_OP_EOF); + + /* 3. Insert extra informations for bytecode header */ + insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.num_of_non_captures); + insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.num_of_captures * 2); + insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.flags); + } + ECMA_FINALIZE (empty); + + /* The RegExp bytecode contains at least a RE_OP_SAVE_AT_START opdoce, so it cannot be NULL. 
*/ + JERRY_ASSERT (bc_ctx.block_start_p != NULL); + ECMA_SET_POINTER (bytecode_p->u.internal_property.value, bc_ctx.block_start_p); + + MEM_FINALIZE_LOCAL_ARRAY (pattern_start_p); + +#ifdef JERRY_ENABLE_LOG + regexp_dump_bytecode (&bc_ctx); +#endif + + return ret_value; +} /* re_compile_bytecode */ + +#ifdef JERRY_ENABLE_LOG +/** + * RegExp bytecode dumper + */ +void +regexp_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p) +{ + re_bytecode_t *bytecode_p = bc_ctx_p->block_start_p; + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d | ", re_get_value (&bytecode_p)); + + re_opcode_t op; + while ((op = re_get_opcode (&bytecode_p))) + { + switch (op) + { + case RE_OP_MATCH: + { + JERRY_DLOG ("MATCH, "); + break; + } + case RE_OP_CHAR: + { + JERRY_DLOG ("CHAR "); + JERRY_DLOG ("%c, ", (char) re_get_value (&bytecode_p)); + break; + } + case RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START: + { + JERRY_DLOG ("N"); + /* FALLTHRU */ + } + case RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START: + { + JERRY_DLOG ("GZ_START "); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + break; + } + case RE_OP_CAPTURE_GROUP_START: + { + JERRY_DLOG ("START "); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + break; + } + case RE_OP_CAPTURE_NON_GREEDY_GROUP_END: + { + JERRY_DLOG ("N"); + /* FALLTHRU */ + } + case RE_OP_CAPTURE_GREEDY_GROUP_END: + { + JERRY_DLOG ("G_END "); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + break; + } + case RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START: + { + JERRY_DLOG ("N"); + /* FALLTHRU */ + } + case RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START: + { + JERRY_DLOG ("GZ_NC_START "); + JERRY_DLOG ("%d ", 
re_get_value (&bytecode_p)); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + break; + } + case RE_OP_NON_CAPTURE_GROUP_START: + { + JERRY_DLOG ("NC_START "); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + break; + } + case RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END: + { + JERRY_DLOG ("N"); + /* FALLTHRU */ + } + case RE_OP_NON_CAPTURE_GREEDY_GROUP_END: + { + JERRY_DLOG ("G_NC_END "); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + break; + } + case RE_OP_SAVE_AT_START: + { + JERRY_DLOG ("RE_START "); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + break; + } + case RE_OP_SAVE_AND_MATCH: + { + JERRY_DLOG ("RE_END, "); + break; + } + case RE_OP_GREEDY_ITERATOR: + { + JERRY_DLOG ("GREEDY_ITERATOR "); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + break; + } + case RE_OP_NON_GREEDY_ITERATOR: + { + JERRY_DLOG ("NON_GREEDY_ITERATOR "); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + break; + } + case RE_OP_PERIOD: + { + JERRY_DLOG ("PERIOD "); + break; + } + case RE_OP_ALTERNATIVE: + { + JERRY_DLOG ("ALTERNATIVE "); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + break; + } + case RE_OP_ASSERT_START: + { + JERRY_DLOG ("ASSERT_START "); + break; + } + case RE_OP_ASSERT_END: + { + JERRY_DLOG ("ASSERT_END "); + break; + } + case RE_OP_ASSERT_WORD_BOUNDARY: + { + JERRY_DLOG ("ASSERT_WORD_BOUNDARY "); + break; + } + case RE_OP_ASSERT_NOT_WORD_BOUNDARY: + { + JERRY_DLOG ("ASSERT_NOT_WORD_BOUNDARY "); + break; + } + case RE_OP_LOOKAHEAD_POS: + { + JERRY_DLOG ("LOOKAHEAD_POS "); + JERRY_DLOG ("%d, ", 
re_get_value (&bytecode_p)); + break; + } + case RE_OP_LOOKAHEAD_NEG: + { + JERRY_DLOG ("LOOKAHEAD_NEG "); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + break; + } + case RE_OP_BACKREFERENCE: + { + JERRY_DLOG ("BACKREFERENCE "); + JERRY_DLOG ("%d, ", re_get_value (&bytecode_p)); + break; + } + case RE_OP_INV_CHAR_CLASS: + { + JERRY_DLOG ("INV_"); + /* FALLTHRU */ + } + case RE_OP_CHAR_CLASS: + { + JERRY_DLOG ("CHAR_CLASS "); + uint32_t num_of_class = re_get_value (&bytecode_p); + JERRY_DLOG ("%d", num_of_class); + while (num_of_class) + { + JERRY_DLOG (" %d", re_get_value (&bytecode_p)); + JERRY_DLOG ("-%d", re_get_value (&bytecode_p)); + num_of_class--; + } + JERRY_DLOG (", "); + break; + } + default: + { + JERRY_DLOG ("UNKNOWN(%d), ", (uint32_t) op); + break; + } + } + } + JERRY_DLOG ("EOF\n"); +} /* regexp_dump_bytecode */ +#endif /* JERRY_ENABLE_LOG */ + +#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */ diff --git a/jerry-core/parser/regexp/re-compiler.h b/jerry-core/parser/regexp/re-compiler.h new file mode 100644 index 000000000..73e4eedab --- /dev/null +++ b/jerry-core/parser/regexp/re-compiler.h @@ -0,0 +1,108 @@ +/* Copyright 2015 Samsung Electronics Co., Ltd. + * Copyright 2015 University of Szeged. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef RE_COMPILER_H +#define RE_COMPILER_H + +#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN + +#include "ecma-globals.h" +#include "re-parser.h" + +/* RegExp opcodes + * Group opcode order is important, because RE_IS_CAPTURE_GROUP is based on it. + * Change it carfully. Capture opcodes should be at first. + */ +#define RE_OP_EOF 0 + +#define RE_OP_CAPTURE_GROUP_START 1 +#define RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START 2 +#define RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START 3 +#define RE_OP_CAPTURE_GREEDY_GROUP_END 4 +#define RE_OP_CAPTURE_NON_GREEDY_GROUP_END 5 +#define RE_OP_NON_CAPTURE_GROUP_START 6 +#define RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START 7 +#define RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START 8 +#define RE_OP_NON_CAPTURE_GREEDY_GROUP_END 9 +#define RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END 10 + +#define RE_OP_MATCH 11 +#define RE_OP_CHAR 12 +#define RE_OP_SAVE_AT_START 13 +#define RE_OP_SAVE_AND_MATCH 14 +#define RE_OP_PERIOD 15 +#define RE_OP_ALTERNATIVE 16 +#define RE_OP_GREEDY_ITERATOR 17 +#define RE_OP_NON_GREEDY_ITERATOR 18 +#define RE_OP_ASSERT_START 19 +#define RE_OP_ASSERT_END 20 +#define RE_OP_ASSERT_WORD_BOUNDARY 21 +#define RE_OP_ASSERT_NOT_WORD_BOUNDARY 22 +#define RE_OP_LOOKAHEAD_POS 23 +#define RE_OP_LOOKAHEAD_NEG 24 +#define RE_OP_BACKREFERENCE 25 +#define RE_OP_CHAR_CLASS 26 +#define RE_OP_INV_CHAR_CLASS 27 + +#define RE_COMPILE_RECURSION_LIMIT 100 + +#define RE_IS_CAPTURE_GROUP(x) (((x) < RE_OP_NON_CAPTURE_GROUP_START) ? 
/**
 * Context of RegExp bytecode container
 */
typedef struct
{
  re_bytecode_t *block_start_p; /**< first byte of the allocated bytecode block */
  re_bytecode_t *block_end_p; /**< one past the last byte of the allocated block */
  re_bytecode_t *current_p; /**< position where the next appended bytecode is written */
} re_bytecode_ctx_t;
/**
 * Peek at the character 'lookup' positions behind str_p.
 * Evaluates to '\0' when that position is not inside the zero terminated string.
 *
 * Both arguments are parenthesized, because callers pass expressions such as
 * 'lookup + 1'. Note that each argument is evaluated more than once.
 *
 * FIXME: change it, when unicode support would be implemented
 */
#define RE_LOOKUP(str_p, lookup) \
  (ecma_zt_string_length (str_p) > (lookup) ? (str_p)[(lookup)] : '\0')
re_token_p->qmin = 1; + re_token_p->qmax = RE_ITERATOR_INFINITE; + if (ch1 == '?') + { + *advance_p = 2; + re_token_p->greedy = false; + } + else + { + *advance_p = 1; + re_token_p->greedy = true; + } + break; + } + case '{': + { + uint32_t qmin = 0; + uint32_t qmax = RE_ITERATOR_INFINITE; + uint32_t digits = 0; + while (true) + { + (*advance_p)++; + ch1 = RE_LOOKUP (pattern_p, lookup + *advance_p); + + if (isdigit (ch1)) + { + if (digits >= ECMA_NUMBER_MAX_DIGITS) + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: too many digits."); + return ret_value; + } + digits++; + qmin = qmin * 10 + ecma_char_hex_to_int (ch1); + } + else if (ch1 == ',') + { + if (qmax != RE_ITERATOR_INFINITE) + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: double comma."); + return ret_value; + } + if ((RE_LOOKUP (pattern_p, lookup + *advance_p + 1)) == '}') + { + if (digits == 0) + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: missing digits."); + return ret_value; + } + + re_token_p->qmin = qmin; + re_token_p->qmax = RE_ITERATOR_INFINITE; + *advance_p += 2; + break; + } + qmax = qmin; + qmin = 0; + digits = 0; + } + else if (ch1 == '}') + { + if (digits == 0) + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: missing digits."); + return ret_value; + } + + if (qmax != RE_ITERATOR_INFINITE) + { + re_token_p->qmin = qmax; + re_token_p->qmax = qmin; + } + else + { + re_token_p->qmin = qmin; + re_token_p->qmax = qmin; + } + + *advance_p += 1; + break; + } + else + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: unknown char."); + return ret_value; + } + } + + if ((RE_LOOKUP (pattern_p, lookup + *advance_p)) == '?') + { + re_token_p->greedy = false; + *advance_p += 1; + } + else + { + re_token_p->greedy = true; + } + break; + + JERRY_UNREACHABLE (); + break; + } + default: + { + 
re_token_p->qmin = 1; + re_token_p->qmax = 1; + re_token_p->greedy = true; + break; + } + } + + JERRY_ASSERT (ecma_is_completion_value_empty (ret_value)); + + if (re_token_p->qmin > re_token_p->qmax) + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp quantifier error: qmin > qmax."); + } + + return ret_value; +} /* parse_re_iterator */ + +/** + * Count the number of groups in pattern + */ +static void +re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */ +{ + ecma_char_t *pattern_p = parser_ctx_p->pattern_start_p; + ecma_char_t ch1; + int char_class_in = 0; + parser_ctx_p->num_of_groups = 0; + + ch1 = get_ecma_char (&pattern_p); + while (ch1 != '\0') + { + ecma_char_t ch0 = ch1; + ch1 = get_ecma_char (&pattern_p); + switch (ch0) + { + case '\\': + { + ch1 = get_ecma_char (&pattern_p); + break; + } + case '[': + { + char_class_in++; + break; + } + case ']': + { + if (!char_class_in) + { + char_class_in--; + } + break; + } + case '(': + { + if (ch1 != '?' 
&& !char_class_in) + { + parser_ctx_p->num_of_groups++; + } + break; + } + } + } +} /* re_count_num_of_groups */ + +/** + * Read the input pattern and parse the range of character class + * + * @return completion value + * Returned value must be freed with ecma_free_completion_value + */ +ecma_completion_value_t +re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */ + re_char_class_callback append_char_class, /**< callback function, + * which adds the char-ranges + * to the bytecode */ + void* re_ctx_p, /**< regexp compiler context */ + re_token_t *out_token_p) /**< output token */ +{ + ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); + ecma_char_t **pattern_p = &(parser_ctx_p->current_char_p); + + out_token_p->qmax = out_token_p->qmin = 1; + ecma_char_t start = RE_CHAR_UNDEF; + bool is_range = false; + parser_ctx_p->num_of_classes = 0; + + do + { + ecma_char_t ch = get_ecma_char (pattern_p); + if (ch == ']') + { + if (start != RE_CHAR_UNDEF) + { + append_char_class (re_ctx_p, start, start); + } + break; + } + else if (ch == '-') + { + if (start != RE_CHAR_UNDEF && !is_range && RE_LOOKUP (*pattern_p, 0) != ']') + { + is_range = true; + continue; + } + } + else if (ch == '\\') + { + ch = get_ecma_char (pattern_p); + + if (ch == 'b') + { + ch = RE_CONTROL_CHAR_BEL; + } + else if (ch == 'f') + { + ch = RE_CONTROL_CHAR_FF; + } + else if (ch == 'n') + { + ch = RE_CONTROL_CHAR_EOL; + } + else if (ch == 't') + { + ch = RE_CONTROL_CHAR_TAB; + } + else if (ch == 'r') + { + ch = RE_CONTROL_CHAR_CR; + } + else if (ch == 'v') + { + ch = RE_CONTROL_CHAR_VT; + } + else if (ch == 'c') + { + ch = get_ecma_char (pattern_p); + if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) + { + ch = (ch % 32); + } + else + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "invalid regexp control escape"); + return ret_value; + } + } + else if (ch == 'x') + { + /* FIXME: get unicode char from hex-digits */ + /* ch = ...; */ + } 
+ else if (ch == 'u') + { + /* FIXME: get unicode char from digits */ + /* ch = ...; */ + } + else if (ch == 'd') + { + /* append digits from '0' to '9'. */ + append_char_class (re_ctx_p, 0x0030UL, 0x0039UL); + ch = RE_CHAR_UNDEF; + } + else if (ch == 'D') + { + append_char_class (re_ctx_p, 0x0000UL, 0x002FUL); + append_char_class (re_ctx_p, 0x003AUL, 0xFFFFUL); + ch = RE_CHAR_UNDEF; + } + else if (ch == 's') + { + append_char_class (re_ctx_p, 0x0009UL, 0x000DUL); + append_char_class (re_ctx_p, 0x0020UL, 0x0020UL); + append_char_class (re_ctx_p, 0x00A0UL, 0x00A0UL); + append_char_class (re_ctx_p, 0x1680UL, 0x1680UL); + append_char_class (re_ctx_p, 0x180EUL, 0x180EUL); + append_char_class (re_ctx_p, 0x2000UL, 0x200AUL); + append_char_class (re_ctx_p, 0x2028UL, 0x2029UL); + append_char_class (re_ctx_p, 0x202FUL, 0x202FUL); + append_char_class (re_ctx_p, 0x205FUL, 0x205FUL); + append_char_class (re_ctx_p, 0x3000UL, 0x3000UL); + append_char_class (re_ctx_p, 0xFEFFUL, 0xFEFFUL); + ch = RE_CHAR_UNDEF; + } + else if (ch == 'S') + { + append_char_class (re_ctx_p, 0x0000UL, 0x0008UL); + append_char_class (re_ctx_p, 0x000EUL, 0x001FUL); + append_char_class (re_ctx_p, 0x0021UL, 0x009FUL); + append_char_class (re_ctx_p, 0x00A1UL, 0x167FUL); + append_char_class (re_ctx_p, 0x1681UL, 0x180DUL); + append_char_class (re_ctx_p, 0x180FUL, 0x1FFFUL); + append_char_class (re_ctx_p, 0x200BUL, 0x2027UL); + append_char_class (re_ctx_p, 0x202AUL, 0x202EUL); + append_char_class (re_ctx_p, 0x2030UL, 0x205EUL); + append_char_class (re_ctx_p, 0x2060UL, 0x2FFFUL); + append_char_class (re_ctx_p, 0x3001UL, 0xFEFEUL); + append_char_class (re_ctx_p, 0xFF00UL, 0xFFFFUL); + ch = RE_CHAR_UNDEF; + } + else if (ch == 'w') + { + append_char_class (re_ctx_p, 0x0030UL, 0x0039UL); + append_char_class (re_ctx_p, 0x0041UL, 0x005AUL); + append_char_class (re_ctx_p, 0x005FUL, 0x005FUL); + append_char_class (re_ctx_p, 0x0061UL, 0x007AUL); + ch = RE_CHAR_UNDEF; + } + else if (ch == 'W') + { + append_char_class 
(re_ctx_p, 0x0000UL, 0x002FUL); + append_char_class (re_ctx_p, 0x003AUL, 0x0040UL); + append_char_class (re_ctx_p, 0x005BUL, 0x005EUL); + append_char_class (re_ctx_p, 0x0060UL, 0x0060UL); + append_char_class (re_ctx_p, 0x007BUL, 0xFFFFUL); + ch = RE_CHAR_UNDEF; + } + else if (isdigit (ch)) + { + if (ch != '\0' || isdigit (RE_LOOKUP (*pattern_p, 1))) + { + /* FIXME: octal support */ + } + } + /* FIXME: depends on the unicode support + else if (!jerry_unicode_identifier (ch)) + { + JERRY_ERROR_MSG ("RegExp escape pattern error. (Char class)"); + } + */ + } + + if (ch == RE_CHAR_UNDEF) + { + if (start != RE_CHAR_UNDEF) + { + if (is_range) + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "invalid character class range"); + return ret_value; + } + else + { + append_char_class (re_ctx_p, start, start); + start = RE_CHAR_UNDEF; + } + } + } + else + { + if (start != RE_CHAR_UNDEF) + { + if (is_range) + { + if (start > ch) + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "invalid character class range"); + return ret_value; + } + else + { + append_char_class (re_ctx_p, start, ch); + start = RE_CHAR_UNDEF; + is_range = false; + } + } + else + { + append_char_class (re_ctx_p, start, start); + start = ch; + } + } + else + { + start = ch; + } + } + } + while (true); + + uint32_t advance = 0; + ECMA_TRY_CATCH (empty, + parse_re_iterator (parser_ctx_p->current_char_p, + out_token_p, + 0, + &advance), + ret_value); + RE_ADVANCE (parser_ctx_p->current_char_p, advance); + ECMA_FINALIZE (empty); + + return ret_value; +} /* re_parse_char_class */ + +/** + * Read the input pattern and parse the next token for the RegExp compiler + * + * @return completion value + * Returned value must be freed with ecma_free_completion_value + */ +ecma_completion_value_t +re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */ + re_token_t *out_token_p) /**< output token */ +{ + ecma_completion_value_t ret_value = 
ecma_make_empty_completion_value (); + uint32_t advance = 0; + ecma_char_t ch0 = *(parser_ctx_p->current_char_p); + + switch (ch0) + { + case '|': + { + advance = 1; + out_token_p->type = RE_TOK_ALTERNATIVE; + break; + } + case '^': + { + advance = 1; + out_token_p->type = RE_TOK_ASSERT_START; + break; + } + case '$': + { + advance = 1; + out_token_p->type = RE_TOK_ASSERT_END; + break; + } + case '.': + { + ECMA_TRY_CATCH (empty, + parse_re_iterator (parser_ctx_p->current_char_p, + out_token_p, + 1, + &advance), + ret_value); + advance += 1; + out_token_p->type = RE_TOK_PERIOD; + ECMA_FINALIZE (empty); + break; + } + case '\\': + { + advance = 2; + out_token_p->type = RE_TOK_CHAR; + ecma_char_t ch1 = RE_LOOKUP (parser_ctx_p->current_char_p, 1); + + if (ch1 == 'b') + { + out_token_p->type = RE_TOK_ASSERT_WORD_BOUNDARY; + } + else if (ch1 == 'B') + { + out_token_p->type = RE_TOK_ASSERT_NOT_WORD_BOUNDARY; + } + else if (ch1 == 'f') + { + out_token_p->value = RE_CONTROL_CHAR_FF; + } + else if (ch1 == 'n') + { + out_token_p->value = RE_CONTROL_CHAR_EOL; + } + else if (ch1 == 't') + { + out_token_p->value = RE_CONTROL_CHAR_TAB; + } + else if (ch1 == 'r') + { + out_token_p->value = RE_CONTROL_CHAR_CR; + } + else if (ch1 == 'v') + { + out_token_p->value = RE_CONTROL_CHAR_VT; + } + else if (ch1 == 'c') + { + ecma_char_t ch2 = RE_LOOKUP (parser_ctx_p->current_char_p, 2); + if ((ch2 >= 'A' && ch2 <= 'Z') || (ch2 >= 'a' && ch2 <= 'z')) + { + advance = 3; + out_token_p->type = RE_TOK_CHAR; + out_token_p->value = (ch2 % 32); + } + else + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "invalid regexp control escape"); + break; + } + } + else if (ch1 == 'x' + && isxdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 2)) + && isxdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 3))) + { + advance = 4; + out_token_p->type = RE_TOK_CHAR; + /* FIXME: get unicode char from hex-digits */ + /* result.value = ...; */ + } + else if (ch1 == 'u' + && isxdigit (RE_LOOKUP 
(parser_ctx_p->current_char_p, 2)) + && isxdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 3)) + && isxdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 4)) + && isxdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 5))) + { + advance = 4; + out_token_p->type = RE_TOK_CHAR; + /* FIXME: get unicode char from digits */ + /* result.value = ...; */ + } + else if (ch1 == 'd') + { + advance = 2; + out_token_p->type = RE_TOK_DIGIT; + } + else if (ch1 == 'D') + { + advance = 2; + out_token_p->type = RE_TOK_NOT_DIGIT; + } + else if (ch1 == 's') + { + advance = 2; + out_token_p->type = RE_TOK_WHITE; + } + else if (ch1 == 'S') + { + advance = 2; + out_token_p->type = RE_TOK_NOT_WHITE; + } + else if (ch1 == 'w') + { + advance = 2; + out_token_p->type = RE_TOK_WORD_CHAR; + } + else if (ch1 == 'W') + { + advance = 2; + out_token_p->type = RE_TOK_NOT_WORD_CHAR; + } + else if (isdigit (ch1)) + { + if (ch1 == '0') + { + if (isdigit (RE_LOOKUP (parser_ctx_p->current_char_p, 2))) + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) "RegExp escape pattern error."); + break; + } + + advance = 2; + out_token_p->value = RE_CONTROL_CHAR_NUL; + } + else + { + if (parser_ctx_p->num_of_groups == -1) + { + re_count_num_of_groups (parser_ctx_p); + } + + if (parser_ctx_p->num_of_groups) + { + uint32_t number = 0; + int index = 0; + advance = 0; + + do + { + if (index >= RE_MAX_RE_DECESC_DIGITS) + { + ret_value = ecma_raise_syntax_error ((const ecma_char_t *) + "RegExp escape pattern error: decimal escape too long."); + return ret_value; + } + + advance++; + ecma_char_t digit = RE_LOOKUP (parser_ctx_p->current_char_p, advance); + if (!isdigit (digit)) + { + break; + } + number = number * 10 + ecma_char_hex_to_int (digit); + index++; + } + while (true); + + if ((int) number <= parser_ctx_p->num_of_groups) + { + out_token_p->type = RE_TOK_BACKREFERENCE; + } + + out_token_p->value = number; + } + else + { + out_token_p->value = ch1; + } + } + } + else + { + out_token_p->value = ch1; + } + + 
uint32_t iter_adv = 0; + ECMA_TRY_CATCH (empty, + parse_re_iterator (parser_ctx_p->current_char_p, + out_token_p, + advance, + &iter_adv), + ret_value); + advance += iter_adv; + ECMA_FINALIZE (empty); + break; + } + case '(': + { + if (RE_LOOKUP (parser_ctx_p->current_char_p, 1) == '?') + { + ecma_char_t ch2 = RE_LOOKUP (parser_ctx_p->current_char_p, 2); + if (ch2 == '=') + { + /* (?= */ + advance = 3; + out_token_p->type = RE_TOK_ASSERT_START_POS_LOOKAHEAD; + } + else if (ch2 == '!') + { + /* (?! */ + advance = 3; + out_token_p->type = RE_TOK_ASSERT_START_NEG_LOOKAHEAD; + } + else if (ch2 == ':') + { + /* (?: */ + advance = 3; + out_token_p->type = RE_TOK_START_NON_CAPTURE_GROUP; + } + } + else + { + /* ( */ + advance = 1; + out_token_p->type = RE_TOK_START_CAPTURE_GROUP; + } + break; + } + case ')': + { + ECMA_TRY_CATCH (empty, + parse_re_iterator (parser_ctx_p->current_char_p, + out_token_p, + 1, + &advance), + ret_value); + advance += 1; + out_token_p->type = RE_TOK_END_GROUP; + ECMA_FINALIZE (empty); + break; + } + case '[': + { + advance = 1; + out_token_p->type = RE_TOK_START_CHAR_CLASS; + if (RE_LOOKUP (parser_ctx_p->current_char_p, 1) == '^') + { + advance = 2; + out_token_p->type = RE_TOK_START_INV_CHAR_CLASS; + } + break; + } + case ']': + case '}': + case '?': + case '*': + case '+': + case '{': + { + JERRY_UNREACHABLE (); + break; + } + case '\0': + { + advance = 0; + out_token_p->type = RE_TOK_EOF; + break; + } + default: + { + ECMA_TRY_CATCH (empty, + parse_re_iterator (parser_ctx_p->current_char_p, + out_token_p, + 1, + &advance), + ret_value); + advance += 1; + out_token_p->type = RE_TOK_CHAR; + out_token_p->value = ch0; + ECMA_FINALIZE (empty); + break; + } + } + + if (ecma_is_completion_value_empty (ret_value)) + { + RE_ADVANCE (parser_ctx_p->current_char_p, advance); + } + + return ret_value; +} /* re_parse_next_token */ + +#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */ diff --git a/jerry-core/parser/regexp/re-parser.h 
b/jerry-core/parser/regexp/re-parser.h new file mode 100644 index 000000000..160cbce7c --- /dev/null +++ b/jerry-core/parser/regexp/re-parser.h @@ -0,0 +1,91 @@ +/* Copyright 2015 Samsung Electronics Co., Ltd. + * Copyright 2015 University of Szeged. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RE_PARSER_H +#define RE_PARSER_H + +#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN + +#include "opcodes-dumper.h" + +typedef uint8_t token_type_t; /* holds one of the RE_TOK_* values below */ + +#define RE_TOK_EOF 0 /* EOF */ +#define RE_TOK_BACKREFERENCE 1 /* \[0..9] */ +#define RE_TOK_CHAR 2 /* any character */ +#define RE_TOK_ALTERNATIVE 3 /* | */ +#define RE_TOK_ASSERT_START 4 /* ^ */ +#define RE_TOK_ASSERT_END 5 /* $ */ +#define RE_TOK_PERIOD 6 /* . */ +#define RE_TOK_START_CAPTURE_GROUP 7 /* ( */ +#define RE_TOK_START_NON_CAPTURE_GROUP 8 /* (?: */ +#define RE_TOK_END_GROUP 9 /* ')' */ +#define RE_TOK_ASSERT_START_POS_LOOKAHEAD 10 /* (?= */ +#define RE_TOK_ASSERT_START_NEG_LOOKAHEAD 11 /* (?!
*/ +#define RE_TOK_ASSERT_WORD_BOUNDARY 12 /* \b */ +#define RE_TOK_ASSERT_NOT_WORD_BOUNDARY 13 /* \B */ +#define RE_TOK_DIGIT 14 /* \d */ +#define RE_TOK_NOT_DIGIT 15 /* \D */ +#define RE_TOK_WHITE 16 /* \s */ +#define RE_TOK_NOT_WHITE 17 /* \S */ +#define RE_TOK_WORD_CHAR 18 /* \w */ +#define RE_TOK_NOT_WORD_CHAR 19 /* \W */ +#define RE_TOK_START_CHAR_CLASS 20 /* [ ] */ +#define RE_TOK_START_INV_CHAR_CLASS 21 /* [^ ] */ + +#define RE_ITERATOR_INFINITE ((uint32_t)-1) /* qmax value of the unbounded quantifiers: '*', '+' and '{n,}' */ +#define RE_MAX_RE_DECESC_DIGITS 9 /* maximum number of digits accepted in a decimal escape */ + +/* Marks "no character" in the character class parser. */ +/* FIXME: depends on unicode support */ +#define RE_CHAR_UNDEF ((ecma_char_t)-1) + +#define RE_CONTROL_CHAR_NUL 0x0000 /* \0 */ +#define RE_CONTROL_CHAR_BEL 0x0008 /* \b inside a character class; 0x08 is backspace (BS), despite the BEL name */ +#define RE_CONTROL_CHAR_TAB 0x0009 /* \t */ +#define RE_CONTROL_CHAR_EOL 0x000a /* \n */ +#define RE_CONTROL_CHAR_VT 0x000b /* \v */ +#define RE_CONTROL_CHAR_FF 0x000c /* \f */ +#define RE_CONTROL_CHAR_CR 0x000d /* \r */ + +/** + * Token produced by the RegExp parser + */ +typedef struct +{ + token_type_t type; /**< one of the RE_TOK_* values */ + uint32_t value; /**< character code of a RE_TOK_CHAR token, or the parsed number of a decimal escape */ + uint32_t qmin; /**< minimum repetition count; 1 when no quantifier follows the token */ + uint32_t qmax; /**< maximum repetition count; RE_ITERATOR_INFINITE when unbounded */ + bool greedy; /**< false when the quantifier is followed by '?', true otherwise */ +} re_token_t; + +/** + * Context of the RegExp parser + */ +typedef struct +{ + ecma_char_t *pattern_start_p; /**< first character of the zero-terminated pattern */ + ecma_char_t *current_char_p; /**< next character of the pattern to be parsed */ + int num_of_groups; /**< number of capture groups in the pattern; -1 until they are counted */ + uint32_t num_of_classes; /**< set to 0 by re_parse_char_class; NOTE(review): no other writer in re-parser.cpp - confirm intended use */ +} re_parser_ctx_t; + +/* Callback of re_parse_char_class, called with each character range (start..end, inclusive) of the class. */ +typedef void (*re_char_class_callback) (void *re_ctx_p, uint32_t start, uint32_t end); + +ecma_completion_value_t +re_parse_char_class (re_parser_ctx_t *parser_ctx_p, + re_char_class_callback append_char_class, + void *re_ctx_p, re_token_t *out_token_p); + +ecma_completion_value_t +re_parse_next_token (re_parser_ctx_t *parser_ctx_p, re_token_t *out_token_p); + +#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */ +#endif /* RE_PARSER_H */