RegExp refactoring and improvements

Move RegExp bytecode functions to a separate file.
Optimize bytecode lenght on character matching.
Implement a basic RegExp cache to optimize memory
usage on duplicated RegExp in JS files. Also fix
minor style issues and add missing comments. Improve
existing comments.

JerryScript-DCO-1.0-Signed-off-by: László Langó llango.u-szeged@partner.samsung.com
This commit is contained in:
László Langó 2016-02-18 10:23:45 +00:00
parent 3f377692d9
commit 2c72bb1139
16 changed files with 844 additions and 601 deletions

View File

@ -32,6 +32,7 @@
#include "jrt.h"
#include "jrt-libc-includes.h"
#include "jrt-bit-fields.h"
#include "re-compiler.h"
#include "vm-defines.h"
#include "vm-stack.h"
@ -549,6 +550,11 @@ ecma_gc_run (void)
ecma_gc_objects_lists[ECMA_GC_COLOR_BLACK] = NULL;
ecma_gc_visited_flip_flag = !ecma_gc_visited_flip_flag;
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN
/* Free RegExp bytecodes stored in cache */
re_cache_gc_run ();
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
} /* ecma_gc_run */
/**

View File

@ -53,7 +53,9 @@
* See also:
* ECMA-262 v5, B.2.5.1
*
* @return ecma value
* @return undefined - if compiled successfully
* error ecma value - otherwise
*
* Returned value must be freed with ecma_free_value.
*/
static ecma_value_t
@ -240,8 +242,10 @@ ecma_builtin_regexp_prototype_compile (ecma_value_t this_arg, /**< this argument
* See also:
* ECMA-262 v5, 15.10.6.2
*
* @return ecma value
* Returned value must be freed with ecma_free_value.
* @return array object containing the results - if the matched
* null - otherwise
*
* May raise error, so returned value must be freed with ecma_free_value.
*/
static ecma_value_t
ecma_builtin_regexp_prototype_exec (ecma_value_t this_arg, /**< this argument */
@ -314,8 +318,10 @@ ecma_builtin_regexp_prototype_exec (ecma_value_t this_arg, /**< this argument */
* See also:
* ECMA-262 v5, 15.10.6.3
*
* @return ecma value
* Returned value must be freed with ecma_free_value.
* @return true - if match is not null
* false - otherwise
*
* May raise error, so returned value must be freed with ecma_free_value.
*/
static ecma_value_t
ecma_builtin_regexp_prototype_test (ecma_value_t this_arg, /**< this argument */
@ -439,4 +445,4 @@ ecma_builtin_regexp_prototype_to_string (ecma_value_t this_arg) /**< this argume
* @}
*/
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */

View File

@ -46,6 +46,7 @@
* Handle calling [[Call]] of built-in RegExp object
*
* @return ecma value
* Returned value must be freed with ecma_free_value.
*/
ecma_value_t
ecma_builtin_regexp_dispatch_call (const ecma_value_t *arguments_list_p, /**< arguments list */
@ -58,6 +59,7 @@ ecma_builtin_regexp_dispatch_call (const ecma_value_t *arguments_list_p, /**< ar
* Handle calling [[Construct]] of built-in RegExp object
*
* @return ecma value
* Returned value must be freed with ecma_free_value.
*/
ecma_value_t
ecma_builtin_regexp_dispatch_construct (const ecma_value_t *arguments_list_p, /**< arguments list */
@ -152,4 +154,4 @@ ecma_builtin_regexp_dispatch_construct (const ecma_value_t *arguments_list_p, /*
* @}
*/
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */

View File

@ -73,7 +73,7 @@ ROUTINE (LIT_MAGIC_STRING_MATCH, ecma_builtin_string_prototype_object_match, 1,
ROUTINE (LIT_MAGIC_STRING_REPLACE, ecma_builtin_string_prototype_object_replace, 2, 2)
ROUTINE (LIT_MAGIC_STRING_SEARCH, ecma_builtin_string_prototype_object_search, 1, 1)
ROUTINE (LIT_MAGIC_STRING_SPLIT, ecma_builtin_string_prototype_object_split, 2, 2)
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
ROUTINE (LIT_MAGIC_STRING_SUBSTRING, ecma_builtin_string_prototype_object_substring, 2, 2)
ROUTINE (LIT_MAGIC_STRING_TO_LOWER_CASE_UL, ecma_builtin_string_prototype_object_to_lower_case, 0, 0)
@ -84,7 +84,7 @@ ROUTINE (LIT_MAGIC_STRING_TRIM, ecma_builtin_string_prototype_object_trim, 0, 0)
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_ANNEXB_BUILTIN
ROUTINE (LIT_MAGIC_STRING_SUBSTR, ecma_builtin_string_prototype_object_substr, 2, 2)
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_ANNEXB_BUILTIN */
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_ANNEXB_BUILTIN */
#undef OBJECT_ID
#undef SIMPLE_VALUE

View File

@ -958,13 +958,13 @@ ecma_object_get_class_name (ecma_object_t *obj_p) /**< object */
{
return LIT_MAGIC_STRING_DATE_UL;
}
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_DATE_BUILTIN */
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_DATE_BUILTIN */
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN
case ECMA_BUILTIN_ID_REGEXP_PROTOTYPE:
{
return LIT_MAGIC_STRING_REGEXP_UL;
}
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
default:
{
JERRY_ASSERT (ecma_builtin_is (obj_p, ECMA_BUILTIN_ID_GLOBAL));

View File

@ -55,12 +55,19 @@
#define RE_GLOBAL_START_IDX 0
#define RE_GLOBAL_END_IDX 1
/**
* Check if a RegExp opcode is a capture group or not
*/
#define RE_IS_CAPTURE_GROUP(x) (((x) < RE_OP_NON_CAPTURE_GROUP_START) ? 1 : 0)
/**
* Parse RegExp flags (global, ignoreCase, multiline)
*
* See also: ECMA-262 v5, 15.10.4.1
*
* @return ecma value
* @return empty ecma value - if parsed successfully
* error ecma value - otherwise
*
* Returned value must be freed with ecma_free_value
*/
ecma_value_t
@ -123,7 +130,7 @@ re_parse_regexp_flags (ecma_string_t *flags_str_p, /**< Input string with flags
return ret_value;
} /* re_parse_regexp_flags */
/*
/**
* Initializes the source, global, ignoreCase, multiline, and lastIndex properties of RegExp instance.
*/
void
@ -223,11 +230,11 @@ re_initialize_props (ecma_object_t *re_obj_p, /**< RegExp obejct */
*
* See also: ECMA-262 v5, 15.10.4.1
*
* @return ecma value
* @return constructed RegExp object
* Returned value must be freed with ecma_free_value
*/
ecma_value_t
ecma_op_create_regexp_object_from_bytecode (re_compiled_code_t *bytecode_p) /**< input pattern */
ecma_op_create_regexp_object_from_bytecode (re_compiled_code_t *bytecode_p) /**< RegExp bytecode */
{
JERRY_ASSERT (bytecode_p != NULL);
@ -259,7 +266,9 @@ ecma_op_create_regexp_object_from_bytecode (re_compiled_code_t *bytecode_p) /**<
*
* See also: ECMA-262 v5, 15.10.4.1
*
* @return ecma value
* @return constructed RegExp object - if pattern and flags were parsed successfully
* error ecma value - otherwise
*
* Returned value must be freed with ecma_free_value
*/
ecma_value_t
@ -367,8 +376,10 @@ re_canonicalize (ecma_char_t ch, /**< character */
* See also:
* ECMA-262 v5, 15.10.2.1
*
* @return ecma value
* Returned value must be freed with ecma_free_value
* @return true - if matched
* false - otherwise
*
* May raise error, so returned value must be freed with ecma_free_value
*/
static ecma_value_t
re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
@ -400,7 +411,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
}
bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE;
ecma_char_t ch1 = (ecma_char_t) re_get_value (&bc_p); /* Already canonicalized. */
ecma_char_t ch1 = (ecma_char_t) re_get_char (&bc_p); /* Already canonicalized. */
ecma_char_t ch2 = re_canonicalize (lit_utf8_read_next (&str_curr_p), is_ignorecase);
JERRY_DDLOG ("Character matching %d to %d: ", ch1, ch2);
@ -613,8 +624,8 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
while (num_of_ranges)
{
ecma_char_t ch1 = re_canonicalize ((ecma_char_t) re_get_value (&bc_p), is_ignorecase);
ecma_char_t ch2 = re_canonicalize ((ecma_char_t) re_get_value (&bc_p), is_ignorecase);
ecma_char_t ch1 = re_canonicalize (re_get_char (&bc_p), is_ignorecase);
ecma_char_t ch2 = re_canonicalize (re_get_char (&bc_p), is_ignorecase);
JERRY_DDLOG ("num_of_ranges=%d, ch1=%d, ch2=%d, curr_ch=%d; ",
num_of_ranges, ch1, ch2, curr_ch);
@ -698,6 +709,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
uint32_t offset = re_get_value (&bc_p);
lit_utf8_byte_t *sub_str_p = NULL;
ecma_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p);
if (ecma_is_value_true (match_value))
{
*out_str_p = sub_str_p;
@ -707,6 +719,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
{
return match_value;
}
bc_p += offset;
old_bc_p = bc_p;
}
@ -839,6 +852,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
{
offset = re_get_value (&bc_p);
ecma_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p);
if (ecma_is_value_true (match_value))
{
*out_str_p = sub_str_p;
@ -848,6 +862,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
{
return match_value;
}
bc_p += offset;
old_bc_p = bc_p;
}
@ -915,6 +930,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
lit_utf8_byte_t *sub_str_p = NULL;
ecma_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p);
if (ecma_is_value_true (match_value))
{
*out_str_p = sub_str_p;
@ -1225,7 +1241,13 @@ re_set_result_array_properties (ecma_object_t *array_obj_p, /**< result array */
* RegExp helper function to start the recursive matching algorithm
* and create the result Array object
*
* @return ecma value
* See also:
* ECMA-262 v5, 15.10.6.2
*
* @return array object - if matched
* null - otherwise
*
* May raise error.
* Returned value must be freed with ecma_free_value
*/
ecma_value_t
@ -1475,4 +1497,4 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */
* @}
*/
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */

View File

@ -32,48 +32,39 @@
/**
* RegExp flags
*/
#define RE_FLAG_GLOBAL (1u << 1) /* ECMA-262 v5, 15.10.7.2 */
#define RE_FLAG_IGNORE_CASE (1u << 2) /* ECMA-262 v5, 15.10.7.3 */
#define RE_FLAG_MULTILINE (1u << 3) /* ECMA-262 v5, 15.10.7.4 */
typedef enum
{
RE_FLAG_GLOBAL = (1u << 1), /**< ECMA-262 v5, 15.10.7.2 */
RE_FLAG_IGNORE_CASE = (1u << 2), /**< ECMA-262 v5, 15.10.7.3 */
RE_FLAG_MULTILINE = (1u << 3) /**< ECMA-262 v5, 15.10.7.4 */
} re_flags_t;
/**
* RegExp executor context
*/
typedef struct
{
lit_utf8_byte_t **saved_p; /**< saved result string pointers, ECMA 262 v5, 15.10.2.1, State */
lit_utf8_byte_t **saved_p; /**< saved result string pointers, ECMA 262 v5, 15.10.2.1, State */
const lit_utf8_byte_t *input_start_p; /**< start of input pattern string */
const lit_utf8_byte_t *input_end_p; /**< end of input pattern string */
uint32_t num_of_captures; /**< number of capture groups */
uint32_t num_of_non_captures; /**< number of non-capture groups */
uint32_t *num_of_iterations_p; /**< number of iterations */
uint16_t flags; /**< RegExp flags */
const lit_utf8_byte_t *input_end_p; /**< end of input pattern string */
uint32_t num_of_captures; /**< number of capture groups */
uint32_t num_of_non_captures; /**< number of non-capture groups */
uint32_t *num_of_iterations_p; /**< number of iterations */
uint16_t flags; /**< RegExp flags */
} re_matcher_ctx_t;
extern ecma_value_t
ecma_op_create_regexp_object_from_bytecode (re_compiled_code_t *);
extern ecma_value_t
ecma_op_create_regexp_object (ecma_string_t *, ecma_string_t *);
extern ecma_value_t
ecma_regexp_exec_helper (ecma_value_t, ecma_value_t, bool);
extern ecma_char_t
re_canonicalize (ecma_char_t, bool);
extern void
re_set_result_array_properties (ecma_object_t *, ecma_string_t *, uint32_t, int32_t);
extern ecma_value_t
re_parse_regexp_flags (ecma_string_t *, uint16_t *);
extern void
re_initialize_props (ecma_object_t *, ecma_string_t *, uint16_t);
ecma_value_t ecma_op_create_regexp_object_from_bytecode (re_compiled_code_t *);
ecma_value_t ecma_op_create_regexp_object (ecma_string_t *, ecma_string_t *);
ecma_value_t ecma_regexp_exec_helper (ecma_value_t, ecma_value_t, bool);
ecma_char_t re_canonicalize (ecma_char_t, bool);
void re_set_result_array_properties (ecma_object_t *, ecma_string_t *, uint32_t, int32_t);
ecma_value_t re_parse_regexp_flags (ecma_string_t *, uint16_t *);
void re_initialize_props (ecma_object_t *, ecma_string_t *, uint16_t);
/**
* @}
* @}
*/
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
#endif /* !ECMA_REGEXP_OBJECT_H */

View File

@ -1655,9 +1655,9 @@ jerry_cleanup (void)
bool is_show_mem_stats = ((jerry_flags & JERRY_FLAG_MEM_STATS) != 0);
vm_finalize ();
ecma_finalize ();
lit_finalize ();
vm_finalize ();
mem_finalize (is_show_mem_stats);
} /* jerry_cleanup */

View File

@ -1812,7 +1812,6 @@ lexer_construct_regexp_object (parser_context_t *context_p, /**< context */
ecma_value_t completion_value;
ecma_string_t *pattern_str_p = ecma_new_ecma_string_from_utf8 (regex_start_p, length);
// FIXME: check return value of 're_compile_bytecode' and throw an error
completion_value = re_compile_bytecode (&re_bytecode_p,
pattern_str_p,
current_flags);

View File

@ -0,0 +1,445 @@
/* Copyright 2016 Samsung Electronics Co., Ltd.
* Copyright 2016 University of Szeged.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ecma-globals.h"
#include "re-bytecode.h"
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN
/** \addtogroup parser Parser
* @{
*
* \addtogroup regexparser Regular expression
* @{
*
* \addtogroup regexparser_bytecode Bytecode
* @{
*/
/**
* Size of block of RegExp bytecode. Used for allocation
*/
#define REGEXP_BYTECODE_BLOCK_SIZE 256UL
/**
* Realloc the bytecode container
*
* @return current position in RegExp bytecode
*/
static uint8_t *
re_realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
{
JERRY_ASSERT (bc_ctx_p->block_end_p >= bc_ctx_p->block_start_p);
size_t old_size = (size_t) (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p);
/* If one of the members of RegExp bytecode context is NULL, then all member should be NULL
* (it means first allocation), otherwise all of the members should be a non NULL pointer. */
JERRY_ASSERT ((!bc_ctx_p->current_p && !bc_ctx_p->block_end_p && !bc_ctx_p->block_start_p)
|| (bc_ctx_p->current_p && bc_ctx_p->block_end_p && bc_ctx_p->block_start_p));
size_t new_block_size = old_size + REGEXP_BYTECODE_BLOCK_SIZE;
JERRY_ASSERT (bc_ctx_p->current_p >= bc_ctx_p->block_start_p);
size_t current_ptr_offset = (size_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p);
uint8_t *new_block_start_p = (uint8_t *) mem_heap_alloc_block_store_size (new_block_size);
if (bc_ctx_p->current_p)
{
memcpy (new_block_start_p, bc_ctx_p->block_start_p, (size_t) (current_ptr_offset));
mem_heap_free_block_size_stored (bc_ctx_p->block_start_p);
}
bc_ctx_p->block_start_p = new_block_start_p;
bc_ctx_p->block_end_p = new_block_start_p + new_block_size;
bc_ctx_p->current_p = new_block_start_p + current_ptr_offset;
return bc_ctx_p->current_p;
} /* re_realloc_regexp_bytecode_block */
/**
* Append a new bytecode to the and of the bytecode container
*/
static void
re_bytecode_list_append (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint8_t *bytecode_p, /**< input bytecode */
size_t length) /**< length of input */
{
JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE);
uint8_t *current_p = bc_ctx_p->current_p;
if (current_p + length > bc_ctx_p->block_end_p)
{
current_p = re_realloc_regexp_bytecode_block (bc_ctx_p);
}
memcpy (current_p, bytecode_p, length);
bc_ctx_p->current_p += length;
} /* re_bytecode_list_append */
/**
* Insert a new bytecode to the bytecode container
*/
void
re_bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
size_t offset, /**< distance from the start of the container */
uint8_t *bytecode_p, /**< input bytecode */
size_t length) /**< length of input */
{
JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE);
uint8_t *current_p = bc_ctx_p->current_p;
if (current_p + length > bc_ctx_p->block_end_p)
{
re_realloc_regexp_bytecode_block (bc_ctx_p);
}
uint8_t *src_p = bc_ctx_p->block_start_p + offset;
if ((re_get_bytecode_length (bc_ctx_p) - offset) > 0)
{
uint8_t *dest_p = src_p + length;
uint8_t *tmp_block_start_p;
tmp_block_start_p = (uint8_t *) mem_heap_alloc_block_store_size (re_get_bytecode_length (bc_ctx_p) - offset);
memcpy (tmp_block_start_p, src_p, (size_t) (re_get_bytecode_length (bc_ctx_p) - offset));
memcpy (dest_p, tmp_block_start_p, (size_t) (re_get_bytecode_length (bc_ctx_p) - offset));
mem_heap_free_block_size_stored (tmp_block_start_p);
}
memcpy (src_p, bytecode_p, length);
bc_ctx_p->current_p += length;
} /* re_bytecode_list_insert */
/**
* Get a character from the RegExp bytecode and increase the bytecode position
*
* @return ecma character
*/
ecma_char_t __attr_always_inline___
re_get_char (uint8_t **bc_p) /**< pointer to bytecode start */
{
ecma_char_t chr = *((ecma_char_t *) *bc_p);
(*bc_p) += sizeof (ecma_char_t);
return chr;
} /* re_get_char */
/**
* Get a RegExp opcode and increase the bytecode position
*
* @return current RegExp opcode
*/
re_opcode_t __attr_always_inline___
re_get_opcode (uint8_t **bc_p) /**< pointer to bytecode start */
{
uint8_t bytecode = **bc_p;
(*bc_p) += sizeof (uint8_t);
return (re_opcode_t) bytecode;
} /* re_get_opcode */
/**
* Get a parameter of a RegExp opcode and increase the bytecode position
*
* @return opcode parameter
*/
uint32_t __attr_always_inline___
re_get_value (uint8_t **bc_p) /**< pointer to bytecode start */
{
uint32_t value = *((uint32_t *) *bc_p);
(*bc_p) += sizeof (uint32_t);
return value;
} /* re_get_value */
/**
* Get length of bytecode
*
* @return bytecode length (unsigned integer)
*/
uint32_t __attr_pure___ __attr_always_inline___
re_get_bytecode_length (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
{
return ((uint32_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p));
} /* re_get_bytecode_length */
/**
* Append a RegExp opcode
*/
void
re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
re_opcode_t opcode) /**< input opcode */
{
re_bytecode_list_append (bc_ctx_p, (uint8_t *) &opcode, sizeof (uint8_t));
} /* re_append_opcode */
/**
* Append a parameter of a RegExp opcode
*/
void
re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t value) /**< input value */
{
re_bytecode_list_append (bc_ctx_p, (uint8_t *) &value, sizeof (uint32_t));
} /* re_append_u32 */
/**
* Append a character to the RegExp bytecode
*/
void
re_append_char (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
ecma_char_t input_char) /**< input char */
{
re_bytecode_list_append (bc_ctx_p, (uint8_t *) &input_char, sizeof (ecma_char_t));
} /* re_append_char */
/**
* Append a jump offset parameter of a RegExp opcode
*/
void
re_append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t value) /**< input value */
{
value += (uint32_t) (sizeof (uint32_t));
re_append_u32 (bc_ctx_p, value);
} /* re_append_jump_offset */
/**
* Insert a RegExp opcode
*/
void
re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t offset, /**< distance from the start of the container */
re_opcode_t opcode) /**< input opcode */
{
re_bytecode_list_insert (bc_ctx_p, offset, (uint8_t *) &opcode, sizeof (uint8_t));
} /* re_insert_opcode */
/**
* Insert a parameter of a RegExp opcode
*/
void
re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t offset, /**< distance from the start of the container */
uint32_t value) /**< input value */
{
re_bytecode_list_insert (bc_ctx_p, offset, (uint8_t *) &value, sizeof (uint32_t));
} /* re_insert_u32 */
#ifdef JERRY_ENABLE_LOG
/**
* RegExp bytecode dumper
*/
void
re_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
{
re_compiled_code_t *compiled_code_p = bc_ctx_p->block_start_p;
JERRY_DLOG ("%d ", compiled_code_p->flags);
JERRY_DLOG ("%d ", compiled_code_p->num_of_captures);
JERRY_DLOG ("%d | ", compiled_code_p->num_of_non_captures);
uint8_t *bytecode_p = (uint8_t *) (compiled_code_p + 1);
re_opcode_t op;
while ((op = re_get_opcode (&bytecode_p)))
{
switch (op)
{
case RE_OP_MATCH:
{
JERRY_DLOG ("MATCH, ");
break;
}
case RE_OP_CHAR:
{
JERRY_DLOG ("CHAR ");
JERRY_DLOG ("%c, ", (char) re_get_char (&bytecode_p));
break;
}
case RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
{
JERRY_DLOG ("N");
/* FALLTHRU */
}
case RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START:
{
JERRY_DLOG ("GZ_START ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_CAPTURE_GROUP_START:
{
JERRY_DLOG ("START ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_CAPTURE_NON_GREEDY_GROUP_END:
{
JERRY_DLOG ("N");
/* FALLTHRU */
}
case RE_OP_CAPTURE_GREEDY_GROUP_END:
{
JERRY_DLOG ("G_END ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
{
JERRY_DLOG ("N");
/* FALLTHRU */
}
case RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START:
{
JERRY_DLOG ("GZ_NC_START ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_NON_CAPTURE_GROUP_START:
{
JERRY_DLOG ("NC_START ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END:
{
JERRY_DLOG ("N");
/* FALLTHRU */
}
case RE_OP_NON_CAPTURE_GREEDY_GROUP_END:
{
JERRY_DLOG ("G_NC_END ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_SAVE_AT_START:
{
JERRY_DLOG ("RE_START ");
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_SAVE_AND_MATCH:
{
JERRY_DLOG ("RE_END, ");
break;
}
case RE_OP_GREEDY_ITERATOR:
{
JERRY_DLOG ("GREEDY_ITERATOR ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_NON_GREEDY_ITERATOR:
{
JERRY_DLOG ("NON_GREEDY_ITERATOR ");
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_PERIOD:
{
JERRY_DLOG ("PERIOD ");
break;
}
case RE_OP_ALTERNATIVE:
{
JERRY_DLOG ("ALTERNATIVE ");
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_ASSERT_START:
{
JERRY_DLOG ("ASSERT_START ");
break;
}
case RE_OP_ASSERT_END:
{
JERRY_DLOG ("ASSERT_END ");
break;
}
case RE_OP_ASSERT_WORD_BOUNDARY:
{
JERRY_DLOG ("ASSERT_WORD_BOUNDARY ");
break;
}
case RE_OP_ASSERT_NOT_WORD_BOUNDARY:
{
JERRY_DLOG ("ASSERT_NOT_WORD_BOUNDARY ");
break;
}
case RE_OP_LOOKAHEAD_POS:
{
JERRY_DLOG ("LOOKAHEAD_POS ");
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_LOOKAHEAD_NEG:
{
JERRY_DLOG ("LOOKAHEAD_NEG ");
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_BACKREFERENCE:
{
JERRY_DLOG ("BACKREFERENCE ");
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_INV_CHAR_CLASS:
{
JERRY_DLOG ("INV_");
/* FALLTHRU */
}
case RE_OP_CHAR_CLASS:
{
JERRY_DLOG ("CHAR_CLASS ");
uint32_t num_of_class = re_get_value (&bytecode_p);
JERRY_DLOG ("%d", num_of_class);
while (num_of_class)
{
JERRY_DLOG (" %d", re_get_char (&bytecode_p));
JERRY_DLOG ("-%d", re_get_char (&bytecode_p));
num_of_class--;
}
JERRY_DLOG (", ");
break;
}
default:
{
JERRY_DLOG ("UNKNOWN(%d), ", (uint32_t) op);
break;
}
}
}
JERRY_DLOG ("EOF\n");
} /* re_dump_bytecode */
#endif /* JERRY_ENABLE_LOG */
/**
* @}
* @}
* @}
*/
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */

View File

@ -0,0 +1,129 @@
/* Copyright 2016 Samsung Electronics Co., Ltd.
* Copyright 2016 University of Szeged.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef RE_BYTECODE_H
#define RE_BYTECODE_H
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN
#include "ecma-globals.h"
/** \addtogroup parser Parser
* @{
*
* \addtogroup regexparser Regular expression
* @{
*
* \addtogroup regexparser_bytecode Bytecode
* @{
*/
/**
* Size of the RegExp bytecode cache
*/
#define RE_CACHE_SIZE 8u
/**
* RegExp flags mask (first 10 bits are for reference count and the rest for the actual RegExp flags)
*/
#define RE_FLAGS_MASK 0x3F
/**
* RegExp opcodes
*/
typedef enum
{
RE_OP_EOF,
/* Group opcode order is important, because RE_IS_CAPTURE_GROUP is based on it.
* Change it carefully. Capture opcodes should be at first.
*/
RE_OP_CAPTURE_GROUP_START, /**< group start */
RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START, /**< greedy zero group start */
RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START, /**< non-greedy zero group start */
RE_OP_CAPTURE_GREEDY_GROUP_END, /**< greedy group end */
RE_OP_CAPTURE_NON_GREEDY_GROUP_END, /**< non-greedy group end */
RE_OP_NON_CAPTURE_GROUP_START, /**< non-capture group start */
RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START, /**< non-capture greedy zero group start */
RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START, /**< non-capture non-greedy zero group start */
RE_OP_NON_CAPTURE_GREEDY_GROUP_END, /**< non-capture greedy group end */
RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END, /**< non-capture non-greedy group end */
RE_OP_MATCH, /**< match */
RE_OP_CHAR, /**< any character */
RE_OP_SAVE_AT_START, /**< save at start */
RE_OP_SAVE_AND_MATCH, /**< save and match */
RE_OP_PERIOD, /**< "." */
RE_OP_ALTERNATIVE, /**< "|" */
RE_OP_GREEDY_ITERATOR, /**< greedy iterator */
RE_OP_NON_GREEDY_ITERATOR, /**< non-greedy iterator */
RE_OP_ASSERT_START, /**< "^" */
RE_OP_ASSERT_END, /**< "$" */
RE_OP_ASSERT_WORD_BOUNDARY, /**< "\b" */
RE_OP_ASSERT_NOT_WORD_BOUNDARY, /**< "\B" */
RE_OP_LOOKAHEAD_POS, /**< lookahead pos */
RE_OP_LOOKAHEAD_NEG, /**< lookahead neg */
RE_OP_BACKREFERENCE, /**< \[0..9] */
RE_OP_CHAR_CLASS, /**< [ ] */
RE_OP_INV_CHAR_CLASS /**< [^ ] */
} re_opcode_t;
/**
* Compiled byte code data.
*/
typedef struct
{
uint16_t flags; /**< RegExp flags */
mem_cpointer_t pattern_cp; /**< original RegExp pattern */
uint32_t num_of_captures; /**< number of capturing brackets */
uint32_t num_of_non_captures; /**< number of non capturing brackets */
} re_compiled_code_t;
/**
* Context of RegExp bytecode container
*/
typedef struct
{
uint8_t *block_start_p; /**< start of bytecode block */
uint8_t *block_end_p; /**< end of bytecode block */
uint8_t *current_p; /**< current position in bytecode */
} re_bytecode_ctx_t;
re_opcode_t re_get_opcode (uint8_t **);
ecma_char_t re_get_char (uint8_t **);
uint32_t re_get_value (uint8_t **);
uint32_t re_get_bytecode_length (re_bytecode_ctx_t *);
void re_append_opcode (re_bytecode_ctx_t *, re_opcode_t);
void re_append_u32 (re_bytecode_ctx_t *, uint32_t);
void re_append_char (re_bytecode_ctx_t *, ecma_char_t);
void re_append_jump_offset (re_bytecode_ctx_t *, uint32_t);
void re_insert_opcode (re_bytecode_ctx_t *, uint32_t, re_opcode_t);
void re_insert_u32 (re_bytecode_ctx_t *, uint32_t, uint32_t);
void re_bytecode_list_insert (re_bytecode_ctx_t *, size_t, uint8_t *, size_t);
#ifdef JERRY_ENABLE_LOG
void re_dump_bytecode (re_bytecode_ctx_t *bc_ctx);
#endif
/**
* @}
* @}
* @}
*/
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
#endif /* !RE_BYTECODE_H */

View File

@ -20,6 +20,7 @@
#include "ecma-try-catch-macro.h"
#include "jrt-libc-includes.h"
#include "mem-heap.h"
#include "re-bytecode.h"
#include "re-compiler.h"
#include "re-parser.h"
@ -31,201 +32,21 @@
* \addtogroup regexparser Regular expression
* @{
*
* \addtogroup regexparser_bytecode Bytecode
* \addtogroup regexparser_compiler Compiler
* @{
*/
/**
* Size of block of RegExp bytecode. Used for allocation
*/
#define REGEXP_BYTECODE_BLOCK_SIZE 256UL
/**
* Get length of bytecode
*/
static uint32_t
re_get_bytecode_length (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
{
return ((uint32_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p));
} /* re_get_bytecode_length */
void
re_dump_bytecode (re_bytecode_ctx_t *bc_ctx);
/**
* Realloc the bytecode container
*
* @return current position in RegExp bytecode
*/
static uint8_t *
re_realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
{
JERRY_ASSERT (bc_ctx_p->block_end_p >= bc_ctx_p->block_start_p);
size_t old_size = (size_t) (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p);
/* If one of the members of RegExp bytecode context is NULL, then all member should be NULL
* (it means first allocation), otherwise all of the members should be a non NULL pointer. */
JERRY_ASSERT ((!bc_ctx_p->current_p && !bc_ctx_p->block_end_p && !bc_ctx_p->block_start_p)
|| (bc_ctx_p->current_p && bc_ctx_p->block_end_p && bc_ctx_p->block_start_p));
size_t new_block_size = old_size + REGEXP_BYTECODE_BLOCK_SIZE;
JERRY_ASSERT (bc_ctx_p->current_p >= bc_ctx_p->block_start_p);
size_t current_ptr_offset = (size_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p);
uint8_t *new_block_start_p = (uint8_t *) mem_heap_alloc_block_store_size (new_block_size);
if (bc_ctx_p->current_p)
{
memcpy (new_block_start_p, bc_ctx_p->block_start_p, (size_t) (current_ptr_offset));
mem_heap_free_block_size_stored (bc_ctx_p->block_start_p);
}
bc_ctx_p->block_start_p = new_block_start_p;
bc_ctx_p->block_end_p = new_block_start_p + new_block_size;
bc_ctx_p->current_p = new_block_start_p + current_ptr_offset;
return bc_ctx_p->current_p;
} /* re_realloc_regexp_bytecode_block */
/**
* Append a new bytecode to the and of the bytecode container
*/
static void
re_bytecode_list_append (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint8_t *bytecode_p, /**< input bytecode */
size_t length) /**< length of input */
{
JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE);
uint8_t *current_p = bc_ctx_p->current_p;
if (current_p + length > bc_ctx_p->block_end_p)
{
current_p = re_realloc_regexp_bytecode_block (bc_ctx_p);
}
memcpy (current_p, bytecode_p, length);
bc_ctx_p->current_p += length;
} /* re_bytecode_list_append */
/**
* Insert a new bytecode to the bytecode container
*/
static void
re_bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
size_t offset, /**< distance from the start of the container */
uint8_t *bytecode_p, /**< input bytecode */
size_t length) /**< length of input */
{
JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE);
uint8_t *current_p = bc_ctx_p->current_p;
if (current_p + length > bc_ctx_p->block_end_p)
{
re_realloc_regexp_bytecode_block (bc_ctx_p);
}
uint8_t *src_p = bc_ctx_p->block_start_p + offset;
if ((re_get_bytecode_length (bc_ctx_p) - offset) > 0)
{
uint8_t *dest_p = src_p + length;
uint8_t *tmp_block_start_p;
tmp_block_start_p = (uint8_t *) mem_heap_alloc_block_store_size (re_get_bytecode_length (bc_ctx_p) - offset);
memcpy (tmp_block_start_p, src_p, (size_t) (re_get_bytecode_length (bc_ctx_p) - offset));
memcpy (dest_p, tmp_block_start_p, (size_t) (re_get_bytecode_length (bc_ctx_p) - offset));
mem_heap_free_block_size_stored (tmp_block_start_p);
}
memcpy (src_p, bytecode_p, length);
bc_ctx_p->current_p += length;
} /* re_bytecode_list_insert */
/**
* Append a RegExp opcode
*/
static void
re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
re_opcode_t opcode) /**< input opcode */
{
re_bytecode_list_append (bc_ctx_p, (uint8_t *) &opcode, sizeof (uint8_t));
} /* re_append_opcode */
/**
* Append a parameter of a RegExp opcode
*/
static void
re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t value) /**< input value */
{
re_bytecode_list_append (bc_ctx_p, (uint8_t *) &value, sizeof (uint32_t));
} /* re_append_u32 */
/**
* Append a jump offset parameter of a RegExp opcode
*/
static void
re_append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t value) /**< input value */
{
value += (uint32_t) (sizeof (uint32_t));
re_append_u32 (bc_ctx_p, value);
} /* re_append_jump_offset */
/**
* Insert a RegExp opcode
*/
static void
re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t offset, /**< distance from the start of the container */
re_opcode_t opcode) /**< input opcode */
{
re_bytecode_list_insert (bc_ctx_p, offset, (uint8_t *) &opcode, sizeof (uint8_t));
} /* re_insert_opcode */
/**
* Insert a parameter of a RegExp opcode
*/
static void
re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t offset, /**< distance from the start of the container */
uint32_t value) /**< input value */
{
re_bytecode_list_insert (bc_ctx_p, offset, (uint8_t *) &value, sizeof (uint32_t));
} /* re_insert_u32 */
/**
* Get a RegExp opcode
*/
re_opcode_t
re_get_opcode (uint8_t **bc_p) /**< pointer to bytecode start */
{
uint8_t bytecode = **bc_p;
(*bc_p) += sizeof (uint8_t);
return (re_opcode_t) bytecode;
} /* re_get_opcode */
/**
* Get a parameter of a RegExp opcode
*/
uint32_t
re_get_value (uint8_t **bc_p) /**< pointer to bytecode start */
{
uint32_t value = *((uint32_t *) *bc_p);
(*bc_p) += sizeof (uint32_t);
return value;
} /* re_get_value */
/**
* Callback function of character class generation
*/
static void
re_append_char_class (void *re_ctx_p, /**< RegExp compiler context */
uint32_t start, /**< character class range from */
uint32_t end) /**< character class range to */
ecma_char_t start, /**< character class range from */
ecma_char_t end) /**< character class range to */
{
/* FIXME: Handle ignore case flag and add unicode support. */
re_compiler_ctx_t *ctx_p = (re_compiler_ctx_t *) re_ctx_p;
re_append_u32 (ctx_p->bytecode_ctx_p, start);
re_append_u32 (ctx_p->bytecode_ctx_p, end);
re_append_char (ctx_p->bytecode_ctx_p, start);
re_append_char (ctx_p->bytecode_ctx_p, end);
ctx_p->parser_ctx_p->num_of_classes++;
} /* re_append_char_class */
@ -266,6 +87,8 @@ re_insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler con
/**
* Get the type of a group start
*
* @return RegExp opcode
*/
static re_opcode_t
re_get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
@ -301,6 +124,8 @@ re_get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler cont
/**
* Get the type of a group end
*
* @return RegExp opcode
*/
static re_opcode_t
re_get_end_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
@ -378,17 +203,12 @@ re_insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compile
re_insert_into_group (re_ctx_p, group_start_offset, idx, is_capturable);
} /* re_insert_into_group_with_jump */
/**
* @}
*
* \addtogroup regexparser_compiler Compiler
* @{
*/
/**
* Parse alternatives
*
* @return completion value
* @return empty ecma value - if alternative was successfully parsed
* error ecma value - otherwise
*
* Returned value must be freed with ecma_free_value
*/
static ecma_value_t
@ -447,7 +267,7 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context
re_ctx_p->current_token.value, re_ctx_p->current_token.qmin, re_ctx_p->current_token.qmax);
re_append_opcode (bc_ctx_p, RE_OP_CHAR);
re_append_u32 (bc_ctx_p, re_canonicalize ((ecma_char_t) re_ctx_p->current_token.value,
re_append_char (bc_ctx_p, re_canonicalize ((ecma_char_t) re_ctx_p->current_token.value,
re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1))
@ -624,14 +444,79 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context
return ret_value;
} /* re_parse_alternative */
static const re_compiled_code_t *re_cache[RE_CACHE_SIZE];
/**
* Search for the given pattern in the RegExp cache
*
* @return compiled bytecode - if found
* NULL - otherwise
*/
re_compiled_code_t *
re_find_bytecode_in_cache (ecma_string_t *pattern_str_p, /**< pattern string */
uint16_t flags, /**< flags */
uint32_t *idx) /**< [out] index */
{
uint32_t free_idx = RE_CACHE_SIZE;
for (*idx = 0u; *idx < RE_CACHE_SIZE; (*idx)++)
{
re_compiled_code_t *cached_bytecode_p = re_cache[*idx];
if (cached_bytecode_p != NULL)
{
ecma_string_t *cached_pattern_str_p;
cached_pattern_str_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, cached_bytecode_p->pattern_cp);
if ((cached_bytecode_p->flags & RE_FLAGS_MASK) == flags
&& ecma_compare_ecma_strings (cached_pattern_str_p, pattern_str_p))
{
JERRY_DDLOG ("RegExp is found in cache\n");
return re_cache[*idx];
}
}
else
{
/* mark as free, so it can be overridden if the cache is full */
free_idx = *idx;
}
}
JERRY_DDLOG ("RegExp is NOT found in cache\n");
*idx = free_idx;
return NULL;
} /* re_find_bytecode_in_cache */
/**
* Run gerbage collection in RegExp cache
*/
void
re_cache_gc_run ()
{
for (uint32_t i = 0u; i < RE_CACHE_SIZE; i++)
{
re_compiled_code_t *cached_bytecode_p = re_cache[i];
if (cached_bytecode_p != NULL
&& (cached_bytecode_p->flags >> ECMA_BYTECODE_REF_SHIFT) == 1)
{ /* Only the cache has reference for the bytecode */
ecma_bytecode_deref (cached_bytecode_p);
re_cache[i] = NULL;
}
}
} /* re_cache_gc_run */
/**
* Compilation of RegExp bytecode
*
* @return completion value
* @return empty ecma value - if bytecode was compiled successfully
* error ecma value - otherwise
*
* Returned value must be freed with ecma_free_value
*/
ecma_value_t
re_compile_bytecode (re_compiled_code_t **out_bytecode_p, /**< [out]pointer to bytecode */
re_compile_bytecode (re_compiled_code_t **out_bytecode_p, /**< [out] pointer to bytecode */
ecma_string_t *pattern_str_p, /**< pattern */
uint16_t flags) /**< flags */
{
@ -648,64 +533,85 @@ re_compile_bytecode (re_compiled_code_t **out_bytecode_p, /**< [out]pointer to b
re_ctx.bytecode_ctx_p = &bc_ctx;
lit_utf8_size_t pattern_str_size = ecma_string_get_size (pattern_str_p);
MEM_DEFINE_LOCAL_ARRAY (pattern_start_p, pattern_str_size, lit_utf8_byte_t);
uint32_t cache_idx;
*out_bytecode_p = re_find_bytecode_in_cache (pattern_str_p, flags, &cache_idx);
ssize_t sz = ecma_string_to_utf8_string (pattern_str_p, pattern_start_p, (ssize_t) pattern_str_size);
JERRY_ASSERT (sz >= 0);
re_parser_ctx_t parser_ctx;
parser_ctx.input_start_p = pattern_start_p;
parser_ctx.input_curr_p = pattern_start_p;
parser_ctx.input_end_p = pattern_start_p + pattern_str_size;
parser_ctx.num_of_groups = -1;
re_ctx.parser_ctx_p = &parser_ctx;
/* 1. Parse RegExp pattern */
re_ctx.num_of_captures = 1;
re_append_opcode (&bc_ctx, RE_OP_SAVE_AT_START);
ECMA_TRY_CATCH (empty, re_parse_alternative (&re_ctx, true), ret_value);
/* 2. Check for invalid backreference */
if (re_ctx.highest_backref >= re_ctx.num_of_captures)
if (*out_bytecode_p != NULL)
{
ret_value = ecma_raise_syntax_error ("Invalid backreference.\n");
ecma_bytecode_ref ((ecma_compiled_code_t *) *out_bytecode_p);
}
else
{
re_append_opcode (&bc_ctx, RE_OP_SAVE_AND_MATCH);
re_append_opcode (&bc_ctx, RE_OP_EOF);
{ /* not in the RegExp cache, so compile it */
lit_utf8_size_t pattern_str_size = ecma_string_get_size (pattern_str_p);
MEM_DEFINE_LOCAL_ARRAY (pattern_start_p, pattern_str_size, lit_utf8_byte_t);
/* 3. Insert extra informations for bytecode header */
re_compiled_code_t re_compiled_code;
ssize_t sz = ecma_string_to_utf8_string (pattern_str_p, pattern_start_p, (ssize_t) pattern_str_size);
JERRY_ASSERT (sz >= 0);
re_compiled_code.flags = re_ctx.flags | (1u << ECMA_BYTECODE_REF_SHIFT);
ECMA_SET_NON_NULL_POINTER (re_compiled_code.pattern_cp,
ecma_copy_or_ref_ecma_string (pattern_str_p));
re_compiled_code.num_of_captures = re_ctx.num_of_captures * 2;
re_compiled_code.num_of_non_captures = re_ctx.num_of_non_captures;
re_parser_ctx_t parser_ctx;
parser_ctx.input_start_p = pattern_start_p;
parser_ctx.input_curr_p = pattern_start_p;
parser_ctx.input_end_p = pattern_start_p + pattern_str_size;
parser_ctx.num_of_groups = -1;
re_ctx.parser_ctx_p = &parser_ctx;
re_bytecode_list_insert (&bc_ctx,
0,
(uint8_t *) &re_compiled_code,
sizeof (re_compiled_code_t));
}
ECMA_FINALIZE (empty);
/* 1. Parse RegExp pattern */
re_ctx.num_of_captures = 1;
re_append_opcode (&bc_ctx, RE_OP_SAVE_AT_START);
MEM_FINALIZE_LOCAL_ARRAY (pattern_start_p);
ECMA_TRY_CATCH (empty, re_parse_alternative (&re_ctx, true), ret_value);
if (!ecma_is_value_empty (ret_value))
{
/* Compilation failed, free bytecode. */
mem_heap_free_block_size_stored (bc_ctx.block_start_p);
*out_bytecode_p = NULL;
}
else
{
/* The RegExp bytecode contains at least a RE_OP_SAVE_AT_START opdoce, so it cannot be NULL. */
JERRY_ASSERT (bc_ctx.block_start_p != NULL);
*out_bytecode_p = (re_compiled_code_t *) bc_ctx.block_start_p;
/* 2. Check for invalid backreference */
if (re_ctx.highest_backref >= re_ctx.num_of_captures)
{
ret_value = ecma_raise_syntax_error ("Invalid backreference.\n");
}
else
{
re_append_opcode (&bc_ctx, RE_OP_SAVE_AND_MATCH);
re_append_opcode (&bc_ctx, RE_OP_EOF);
/* 3. Insert extra informations for bytecode header */
re_compiled_code_t re_compiled_code;
re_compiled_code.flags = re_ctx.flags | (1u << ECMA_BYTECODE_REF_SHIFT);
ECMA_SET_NON_NULL_POINTER (re_compiled_code.pattern_cp,
ecma_copy_or_ref_ecma_string (pattern_str_p));
re_compiled_code.num_of_captures = re_ctx.num_of_captures * 2;
re_compiled_code.num_of_non_captures = re_ctx.num_of_non_captures;
re_bytecode_list_insert (&bc_ctx,
0,
(uint8_t *) &re_compiled_code,
sizeof (re_compiled_code_t));
}
ECMA_FINALIZE (empty);
MEM_FINALIZE_LOCAL_ARRAY (pattern_start_p);
if (!ecma_is_value_empty (ret_value))
{
/* Compilation failed, free bytecode. */
mem_heap_free_block_size_stored (bc_ctx.block_start_p);
*out_bytecode_p = NULL;
}
else
{
/* The RegExp bytecode contains at least a RE_OP_SAVE_AT_START opdoce, so it cannot be NULL. */
JERRY_ASSERT (bc_ctx.block_start_p != NULL);
*out_bytecode_p = (re_compiled_code_t *) bc_ctx.block_start_p;
if (cache_idx < RE_CACHE_SIZE)
{
ecma_bytecode_ref (*out_bytecode_p);
re_cache[cache_idx] = *out_bytecode_p;
}
else
{
JERRY_DDLOG ("RegExp cache is full! Cannot add new bytecode to it.");
}
}
}
#ifdef JERRY_ENABLE_LOG
@ -715,214 +621,10 @@ re_compile_bytecode (re_compiled_code_t **out_bytecode_p, /**< [out]pointer to b
return ret_value;
} /* re_compile_bytecode */
#ifdef JERRY_ENABLE_LOG
/**
* RegExp bytecode dumper
*/
void
re_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
{
re_compiled_code_t *compiled_code_p = bc_ctx_p->block_start_p;
JERRY_DLOG ("%d ", compiled_code_p->flags);
JERRY_DLOG ("%d ", compiled_code_p->num_of_captures);
JERRY_DLOG ("%d | ", compiled_code_p->num_of_non_captures);
uint8_t *bytecode_p = (uint8_t *) (compiled_code_p + 1);
re_opcode_t op;
while ((op = re_get_opcode (&bytecode_p)))
{
switch (op)
{
case RE_OP_MATCH:
{
JERRY_DLOG ("MATCH, ");
break;
}
case RE_OP_CHAR:
{
JERRY_DLOG ("CHAR ");
JERRY_DLOG ("%c, ", (char) re_get_value (&bytecode_p));
break;
}
case RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
{
JERRY_DLOG ("N");
/* FALLTHRU */
}
case RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START:
{
JERRY_DLOG ("GZ_START ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_CAPTURE_GROUP_START:
{
JERRY_DLOG ("START ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_CAPTURE_NON_GREEDY_GROUP_END:
{
JERRY_DLOG ("N");
/* FALLTHRU */
}
case RE_OP_CAPTURE_GREEDY_GROUP_END:
{
JERRY_DLOG ("G_END ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
{
JERRY_DLOG ("N");
/* FALLTHRU */
}
case RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START:
{
JERRY_DLOG ("GZ_NC_START ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_NON_CAPTURE_GROUP_START:
{
JERRY_DLOG ("NC_START ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END:
{
JERRY_DLOG ("N");
/* FALLTHRU */
}
case RE_OP_NON_CAPTURE_GREEDY_GROUP_END:
{
JERRY_DLOG ("G_NC_END ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_SAVE_AT_START:
{
JERRY_DLOG ("RE_START ");
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_SAVE_AND_MATCH:
{
JERRY_DLOG ("RE_END, ");
break;
}
case RE_OP_GREEDY_ITERATOR:
{
JERRY_DLOG ("GREEDY_ITERATOR ");
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_NON_GREEDY_ITERATOR:
{
JERRY_DLOG ("NON_GREEDY_ITERATOR ");
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_PERIOD:
{
JERRY_DLOG ("PERIOD ");
break;
}
case RE_OP_ALTERNATIVE:
{
JERRY_DLOG ("ALTERNATIVE ");
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_ASSERT_START:
{
JERRY_DLOG ("ASSERT_START ");
break;
}
case RE_OP_ASSERT_END:
{
JERRY_DLOG ("ASSERT_END ");
break;
}
case RE_OP_ASSERT_WORD_BOUNDARY:
{
JERRY_DLOG ("ASSERT_WORD_BOUNDARY ");
break;
}
case RE_OP_ASSERT_NOT_WORD_BOUNDARY:
{
JERRY_DLOG ("ASSERT_NOT_WORD_BOUNDARY ");
break;
}
case RE_OP_LOOKAHEAD_POS:
{
JERRY_DLOG ("LOOKAHEAD_POS ");
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_LOOKAHEAD_NEG:
{
JERRY_DLOG ("LOOKAHEAD_NEG ");
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_BACKREFERENCE:
{
JERRY_DLOG ("BACKREFERENCE ");
JERRY_DLOG ("%d, ", re_get_value (&bytecode_p));
break;
}
case RE_OP_INV_CHAR_CLASS:
{
JERRY_DLOG ("INV_");
/* FALLTHRU */
}
case RE_OP_CHAR_CLASS:
{
JERRY_DLOG ("CHAR_CLASS ");
uint32_t num_of_class = re_get_value (&bytecode_p);
JERRY_DLOG ("%d", num_of_class);
while (num_of_class)
{
JERRY_DLOG (" %d", re_get_value (&bytecode_p));
JERRY_DLOG ("-%d", re_get_value (&bytecode_p));
num_of_class--;
}
JERRY_DLOG (", ");
break;
}
default:
{
JERRY_DLOG ("UNKNOWN(%d), ", (uint32_t) op);
break;
}
}
}
JERRY_DLOG ("EOF\n");
} /* re_dump_bytecode */
#endif /* JERRY_ENABLE_LOG */
/**
* @}
* @}
* @}
*/
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */

View File

@ -20,6 +20,7 @@
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN
#include "ecma-globals.h"
#include "re-bytecode.h"
#include "re-parser.h"
/** \addtogroup parser Parser
@ -32,71 +33,6 @@
* @{
*/
/**
* RegExp opcodes
*/
typedef enum
{
RE_OP_EOF,
/* Group opcode order is important, because RE_IS_CAPTURE_GROUP is based on it.
* Change it carefully. Capture opcodes should be at first.
*/
RE_OP_CAPTURE_GROUP_START, /**< group start */
RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START, /**< greedy zero group start */
RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START, /**< non-greedy zero group start */
RE_OP_CAPTURE_GREEDY_GROUP_END, /**< greedy group end */
RE_OP_CAPTURE_NON_GREEDY_GROUP_END, /**< non-greedy group end */
RE_OP_NON_CAPTURE_GROUP_START, /**< non-capture group start */
RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START, /**< non-capture greedy zero group start */
RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START, /**< non-capture non-greedy zero group start */
RE_OP_NON_CAPTURE_GREEDY_GROUP_END, /**< non-capture greedy group end */
RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END, /**< non-capture non-greedy group end */
RE_OP_MATCH, /**< match */
RE_OP_CHAR, /**< any character */
RE_OP_SAVE_AT_START, /**< save at start */
RE_OP_SAVE_AND_MATCH, /**< save and match */
RE_OP_PERIOD, /**< "." */
RE_OP_ALTERNATIVE, /**< "|" */
RE_OP_GREEDY_ITERATOR, /**< greedy iterator */
RE_OP_NON_GREEDY_ITERATOR, /**< non-greedy iterator */
RE_OP_ASSERT_START, /**< "^" */
RE_OP_ASSERT_END, /**< "$" */
RE_OP_ASSERT_WORD_BOUNDARY, /**< "\b" */
RE_OP_ASSERT_NOT_WORD_BOUNDARY, /**< "\B" */
RE_OP_LOOKAHEAD_POS, /**< lookahead pos */
RE_OP_LOOKAHEAD_NEG, /**< lookahead neg */
RE_OP_BACKREFERENCE, /**< \[0..9] */
RE_OP_CHAR_CLASS, /**< [ ] */
RE_OP_INV_CHAR_CLASS /**< [^ ] */
} re_opcode_t;
/**
* Compiled byte code data.
*/
typedef struct
{
uint16_t flags; /**< RegExp flags */
mem_cpointer_t pattern_cp; /**< original RegExp pattern */
uint32_t num_of_captures; /**< number of capturing brackets */
uint32_t num_of_non_captures; /**< number of non capturing brackets */
} re_compiled_code_t;
/**
* Check if a RegExp opcode is a capture group or not
*/
#define RE_IS_CAPTURE_GROUP(x) (((x) < RE_OP_NON_CAPTURE_GROUP_START) ? 1 : 0)
/**
* Context of RegExp bytecode container
*/
typedef struct
{
uint8_t *block_start_p; /**< start of bytecode block */
uint8_t *block_end_p; /**< end of bytecode block */
uint8_t *current_p; /**< current position in bytecode */
} re_bytecode_ctx_t;
/**
* Context of RegExp compiler
*/
@ -114,11 +50,10 @@ typedef struct
ecma_value_t
re_compile_bytecode (re_compiled_code_t **, ecma_string_t *, uint16_t);
re_opcode_t
re_get_opcode (uint8_t **);
re_compiled_code_t *
re_find_bytecode_in_cache (ecma_string_t *pattern_str_p, uint16_t flags, uint32_t *idx);
uint32_t
re_get_value (uint8_t **);
void re_cache_gc_run ();
/**
* @}

View File

@ -105,7 +105,9 @@ re_parse_octal (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */
/**
* Parse RegExp iterators
*
* @return completion value
* @return empty ecma value - if parsed successfully
* error ecma value - otherwise
*
* Returned value must be freed with ecma_free_value
*/
static ecma_value_t
@ -298,7 +300,9 @@ re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser contex
/**
* Read the input pattern and parse the range of character class
*
* @return completion value
* @return empty ecma value - if parsed successfully
* error ecma value - otherwise
*
* Returned value must be freed with ecma_free_value
*/
ecma_value_t
@ -556,7 +560,9 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
/**
* Read the input pattern and parse the next token for the RegExp compiler
*
* @return completion value
* @return empty ecma value - if parsed successfully
* error ecma value - otherwise
*
* Returned value must be freed with ecma_free_value
*/
ecma_value_t

View File

@ -34,28 +34,28 @@
*/
typedef enum
{
RE_TOK_EOF, /* EOF */
RE_TOK_BACKREFERENCE, /* \[0..9] */
RE_TOK_CHAR, /* any character */
RE_TOK_ALTERNATIVE, /* | */
RE_TOK_ASSERT_START, /* ^ */
RE_TOK_ASSERT_END, /* $ */
RE_TOK_PERIOD, /* . */
RE_TOK_START_CAPTURE_GROUP, /* ( */
RE_TOK_START_NON_CAPTURE_GROUP, /* (?: */
RE_TOK_END_GROUP, /* ')' */
RE_TOK_ASSERT_START_POS_LOOKAHEAD, /* (?= */
RE_TOK_ASSERT_START_NEG_LOOKAHEAD, /* (?! */
RE_TOK_ASSERT_WORD_BOUNDARY, /* \b */
RE_TOK_ASSERT_NOT_WORD_BOUNDARY, /* \B */
RE_TOK_DIGIT, /* \d */
RE_TOK_NOT_DIGIT, /* \D */
RE_TOK_WHITE, /* \s */
RE_TOK_NOT_WHITE, /* \S */
RE_TOK_WORD_CHAR, /* \w */
RE_TOK_NOT_WORD_CHAR, /* \W */
RE_TOK_START_CHAR_CLASS, /* [ ] */
RE_TOK_START_INV_CHAR_CLASS, /* [^ ] */
RE_TOK_EOF, /**< EOF */
RE_TOK_BACKREFERENCE, /**< \[0..9] */
RE_TOK_CHAR, /**< any character */
RE_TOK_ALTERNATIVE, /**< | */
RE_TOK_ASSERT_START, /**< ^ */
RE_TOK_ASSERT_END, /**< $ */
RE_TOK_PERIOD, /**< . */
RE_TOK_START_CAPTURE_GROUP, /**< ( */
RE_TOK_START_NON_CAPTURE_GROUP, /**< (?: */
RE_TOK_END_GROUP, /**< ')' */
RE_TOK_ASSERT_START_POS_LOOKAHEAD, /**< (?= */
RE_TOK_ASSERT_START_NEG_LOOKAHEAD, /**< (?! */
RE_TOK_ASSERT_WORD_BOUNDARY, /**< \b */
RE_TOK_ASSERT_NOT_WORD_BOUNDARY, /**< \B */
RE_TOK_DIGIT, /**< \d */
RE_TOK_NOT_DIGIT, /**< \D */
RE_TOK_WHITE, /**< \s */
RE_TOK_NOT_WHITE, /**< \S */
RE_TOK_WORD_CHAR, /**< \w */
RE_TOK_NOT_WORD_CHAR, /**< \W */
RE_TOK_START_CHAR_CLASS, /**< [ ] */
RE_TOK_START_INV_CHAR_CLASS, /**< [^ ] */
} re_token_type_t;
/**
@ -68,7 +68,7 @@ typedef enum
/**
* RegExp constant of infinite
*/
#define RE_ITERATOR_INFINITE ((uint32_t)-1)
#define RE_ITERATOR_INFINITE ((uint32_t) - 1)
/**
* Maximum number of decimal escape digits

View File

@ -295,9 +295,9 @@ vm_construct_literal_object (vm_frame_ctx_t *frame_ctx_p, /**< frame context */
}
return ret_value;
#else
#else /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
JERRY_UNIMPLEMENTED ("Regular Expressions are not supported in compact profile!");
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
#endif /* !CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */
}
} /* vm_construct_literal_object */