jerryscript/jerry-core/ecma/operations/ecma-regexp-object.c
Dániel Bátyai c3bb516e4a Refactor RegExp builtin (#3136)
JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu
2019-10-02 16:55:16 +02:00

1361 lines
43 KiB
C

/* Copyright JS Foundation and other contributors, http://js.foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "ecma-alloc.h"
#include "ecma-array-object.h"
#include "ecma-builtin-helpers.h"
#include "ecma-exceptions.h"
#include "ecma-gc.h"
#include "ecma-globals.h"
#include "ecma-objects.h"
#include "ecma-regexp-object.h"
#include "ecma-try-catch-macro.h"
#include "jcontext.h"
#include "jrt-libc-includes.h"
#include "lit-char-helpers.h"
#include "re-compiler.h"
#if ENABLED (JERRY_BUILTIN_REGEXP)
#define ECMA_BUILTINS_INTERNAL
#include "ecma-builtins-internal.h"
/** \addtogroup ecma ECMA
* @{
*
* \addtogroup ecmaregexpobject ECMA RegExp object related routines
* @{
*/
/**
* Index of the global capturing group
*/
#define RE_GLOBAL_CAPTURE 0
/**
* Check if a RegExp opcode is a capture group or not
*/
#define RE_IS_CAPTURE_GROUP(x) (((x) < RE_OP_NON_CAPTURE_GROUP_START) ? 1 : 0)
/**
* Parse RegExp flags (global, ignoreCase, multiline)
*
* See also: ECMA-262 v5, 15.10.4.1
*
* @return empty ecma value - if parsed successfully
* error ecma value - otherwise
*
* Returned value must be freed with ecma_free_value
*/
ecma_value_t
ecma_regexp_parse_flags (ecma_string_t *flags_str_p, /**< Input string with flags */
uint16_t *flags_p) /**< [out] parsed flag bits */
{
ecma_value_t ret_value = ECMA_VALUE_EMPTY;
ECMA_STRING_TO_UTF8_STRING (flags_str_p, flags_start_p, flags_start_size);
const lit_utf8_byte_t *flags_str_curr_p = flags_start_p;
const lit_utf8_byte_t *flags_str_end_p = flags_start_p + flags_start_size;
while (flags_str_curr_p < flags_str_end_p
&& ecma_is_value_empty (ret_value))
{
switch (*flags_str_curr_p++)
{
case 'g':
{
if (*flags_p & RE_FLAG_GLOBAL)
{
ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp flags."));
}
*flags_p |= RE_FLAG_GLOBAL;
break;
}
case 'i':
{
if (*flags_p & RE_FLAG_IGNORE_CASE)
{
ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp flags."));
}
*flags_p |= RE_FLAG_IGNORE_CASE;
break;
}
case 'm':
{
if (*flags_p & RE_FLAG_MULTILINE)
{
ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp flags."));
}
*flags_p |= RE_FLAG_MULTILINE;
break;
}
default:
{
ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("Invalid RegExp flags."));
break;
}
}
}
ECMA_FINALIZE_UTF8_STRING (flags_start_p, flags_start_size);
return ret_value;
} /* ecma_regexp_parse_flags */
/*
* Create the properties of a RegExp instance.
*/
static void
ecma_regexp_create_props (ecma_object_t *re_object_p) /**< RegExp object */
{
#if !ENABLED (JERRY_ES2015)
ecma_create_named_data_property (re_object_p,
ecma_get_magic_string (LIT_MAGIC_STRING_SOURCE),
ECMA_PROPERTY_FIXED,
NULL);
ecma_create_named_data_property (re_object_p,
ecma_get_magic_string (LIT_MAGIC_STRING_GLOBAL),
ECMA_PROPERTY_FIXED,
NULL);
ecma_create_named_data_property (re_object_p,
ecma_get_magic_string (LIT_MAGIC_STRING_IGNORECASE_UL),
ECMA_PROPERTY_FIXED,
NULL);
ecma_create_named_data_property (re_object_p,
ecma_get_magic_string (LIT_MAGIC_STRING_MULTILINE),
ECMA_PROPERTY_FIXED,
NULL);
#endif /* !ENABLED (JERRY_ES2015) */
ecma_create_named_data_property (re_object_p,
ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL),
ECMA_PROPERTY_FLAG_WRITABLE,
NULL);
} /* ecma_regexp_create_props */
/*
* Helper function to assign a value to a property
*/
static void
ecma_regexp_helper_assign_prop (ecma_object_t *re_object_p, /**< RegExp object */
lit_magic_string_id_t prop_id, /**< property name ide */
ecma_value_t value) /**< value */
{
ecma_property_ref_t property_ref;
ecma_op_object_get_own_property (re_object_p,
ecma_get_magic_string (prop_id),
&property_ref,
ECMA_PROPERTY_GET_VALUE);
ecma_named_data_property_assign_value (re_object_p,
property_ref.value_p,
value);
} /* ecma_regexp_helper_assign_prop */
/**
* Initializes the properties of a RegExp instance.
*/
void
ecma_regexp_initialize_props (ecma_object_t *re_object_p, /**< RegExp object */
ecma_string_t *source_p, /**< source string */
uint16_t flags) /**< flags */
{
#if !ENABLED (JERRY_ES2015)
ecma_regexp_helper_assign_prop (re_object_p,
LIT_MAGIC_STRING_SOURCE,
ecma_make_string_value (source_p));
ecma_regexp_helper_assign_prop (re_object_p,
LIT_MAGIC_STRING_GLOBAL,
ecma_make_boolean_value (flags & RE_FLAG_GLOBAL));
ecma_regexp_helper_assign_prop (re_object_p,
LIT_MAGIC_STRING_IGNORECASE_UL,
ecma_make_boolean_value (flags & RE_FLAG_IGNORE_CASE));
ecma_regexp_helper_assign_prop (re_object_p,
LIT_MAGIC_STRING_MULTILINE,
ecma_make_boolean_value (flags & RE_FLAG_MULTILINE));
#else /* ENABLED (JERRY_ES2015) */
JERRY_UNUSED (source_p);
JERRY_UNUSED (flags);
#endif /* !ENABLED (JERRY_ES2015) */
ecma_regexp_helper_assign_prop (re_object_p,
LIT_MAGIC_STRING_LASTINDEX_UL,
ecma_make_uint32_value (0));
} /* ecma_regexp_initialize_props */
/**
* RegExp object creation operation.
*
* See also: ECMA-262 v5, 15.10.4.1
*
* @return constructed RegExp object
* Returned value must be freed with ecma_free_value
*/
ecma_value_t
ecma_op_create_regexp_object_from_bytecode (re_compiled_code_t *bytecode_p) /**< RegExp bytecode */
{
JERRY_ASSERT (bytecode_p != NULL);
ecma_object_t *re_prototype_obj_p = ecma_builtin_get (ECMA_BUILTIN_ID_REGEXP_PROTOTYPE);
ecma_object_t *object_p = ecma_create_object (re_prototype_obj_p,
sizeof (ecma_extended_object_t),
ECMA_OBJECT_TYPE_CLASS);
ecma_extended_object_t *ext_object_p = (ecma_extended_object_t *) object_p;
/* Set the internal [[Class]] property */
ext_object_p->u.class_prop.class_id = LIT_MAGIC_STRING_REGEXP_UL;
/* Set bytecode internal property. */
ECMA_SET_INTERNAL_VALUE_POINTER (ext_object_p->u.class_prop.u.value, bytecode_p);
ecma_bytecode_ref ((ecma_compiled_code_t *) bytecode_p);
/* Create and initialize RegExp object properties */
ecma_regexp_create_props (object_p);
ecma_regexp_initialize_props (object_p,
ecma_get_string_from_value (bytecode_p->source),
bytecode_p->header.status_flags);
return ecma_make_object_value (object_p);
} /* ecma_op_create_regexp_object_from_bytecode */
/**
* RegExp object creation operation.
*
* See also: ECMA-262 v5, 15.10.4.1
*
* @return constructed RegExp object - if pattern and flags were parsed successfully
* error ecma value - otherwise
*
* Returned value must be freed with ecma_free_value
*/
ecma_value_t
ecma_op_create_regexp_object (ecma_string_t *pattern_p, /**< input pattern */
uint16_t flags) /**< flags */
{
JERRY_ASSERT (pattern_p != NULL);
ecma_object_t *re_prototype_obj_p = ecma_builtin_get (ECMA_BUILTIN_ID_REGEXP_PROTOTYPE);
ecma_object_t *object_p = ecma_create_object (re_prototype_obj_p,
sizeof (ecma_extended_object_t),
ECMA_OBJECT_TYPE_CLASS);
ecma_extended_object_t *ext_object_p = (ecma_extended_object_t *) object_p;
ext_object_p->u.class_prop.class_id = LIT_MAGIC_STRING_UNDEFINED;
ecma_regexp_create_props (object_p);
ecma_regexp_initialize_props (object_p, pattern_p, flags);
/* Compile bytecode. */
const re_compiled_code_t *bc_p = NULL;
ecma_value_t ret_value = re_compile_bytecode (&bc_p, pattern_p, flags);
if (ECMA_IS_VALUE_ERROR (ret_value))
{
ecma_deref_object (object_p);
return ret_value;
}
JERRY_ASSERT (ecma_is_value_empty (ret_value));
/* Set [[Class]] and bytecode internal properties. */
ext_object_p->u.class_prop.class_id = LIT_MAGIC_STRING_REGEXP_UL;
ECMA_SET_INTERNAL_VALUE_POINTER (ext_object_p->u.class_prop.u.value, bc_p);
return ecma_make_object_value (object_p);
} /* ecma_op_create_regexp_object */
/**
* Canonicalize a character
*
* @return ecma_char_t canonicalized character
*/
ecma_char_t
ecma_regexp_canonicalize_char (ecma_char_t ch) /**< character */
{
if (JERRY_LIKELY (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX))
{
if (ch >= LIT_CHAR_LOWERCASE_A && ch <= LIT_CHAR_LOWERCASE_Z)
{
return (ecma_char_t) (ch - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
}
return ch;
}
ecma_char_t u[LIT_MAXIMUM_OTHER_CASE_LENGTH];
const ecma_length_t size = lit_char_to_upper_case (ch, u, LIT_MAXIMUM_OTHER_CASE_LENGTH);
/* 3. */
if (size != 1)
{
return ch;
}
/* 4. */
const ecma_char_t cu = u[0];
/* 5. */
if (cu >= 128)
{
/* 6. */
return cu;
}
return ch;
} /* ecma_regexp_canonicalize_char */
/**
* RegExp Canonicalize abstract operation
*
* See also: ECMA-262 v5, 15.10.2.8
*
* @return ecma_char_t canonicalized character
*/
inline ecma_char_t JERRY_ATTR_ALWAYS_INLINE
ecma_regexp_canonicalize (ecma_char_t ch, /**< character */
bool is_ignorecase) /**< IgnoreCase flag */
{
if (is_ignorecase)
{
return ecma_regexp_canonicalize_char (ch);
}
return ch;
} /* ecma_regexp_canonicalize */
/**
* Recursive function for RegExp matching.
*
* See also:
* ECMA-262 v5, 15.10.2.1
*
* @return true - if matched
* false - otherwise
*/
static const lit_utf8_byte_t *
ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
const uint8_t *bc_p, /**< pointer to the current RegExp bytecode */
const lit_utf8_byte_t *str_curr_p) /**< input string pointer */
{
#if (JERRY_STACK_LIMIT != 0)
if (JERRY_UNLIKELY (ecma_get_current_stack_usage () > CONFIG_MEM_STACK_LIMIT))
{
return ECMA_RE_OUT_OF_STACK;
}
#endif /* JERRY_STACK_LIMIT != 0 */
while (true)
{
re_opcode_t op = re_get_opcode (&bc_p);
switch (op)
{
case RE_OP_MATCH:
{
JERRY_TRACE_MSG ("Execute RE_OP_MATCH: match\n");
return str_curr_p;
}
case RE_OP_CHAR:
{
if (str_curr_p >= re_ctx_p->input_end_p)
{
return NULL; /* fail */
}
const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE;
ecma_char_t ch1 = (ecma_char_t) re_get_char (&bc_p); /* Already canonicalized. */
ecma_char_t ch2 = ecma_regexp_canonicalize (lit_utf8_read_next (&str_curr_p), is_ignorecase);
JERRY_TRACE_MSG ("Character matching %d to %d: ", ch1, ch2);
if (ch1 != ch2)
{
JERRY_TRACE_MSG ("fail\n");
return NULL; /* fail */
}
JERRY_TRACE_MSG ("match\n");
break; /* tail merge */
}
case RE_OP_PERIOD:
{
if (str_curr_p >= re_ctx_p->input_end_p)
{
return NULL; /* fail */
}
const ecma_char_t ch = lit_utf8_read_next (&str_curr_p);
JERRY_TRACE_MSG ("Period matching '.' to %u: ", (unsigned int) ch);
if (lit_char_is_line_terminator (ch))
{
JERRY_TRACE_MSG ("fail\n");
return NULL; /* fail */
}
JERRY_TRACE_MSG ("match\n");
break; /* tail merge */
}
case RE_OP_ASSERT_START:
{
JERRY_TRACE_MSG ("Execute RE_OP_ASSERT_START: ");
if (str_curr_p <= re_ctx_p->input_start_p)
{
JERRY_TRACE_MSG ("match\n");
break; /* tail merge */
}
if (!(re_ctx_p->flags & RE_FLAG_MULTILINE))
{
JERRY_TRACE_MSG ("fail\n");
return NULL; /* fail */
}
if (lit_char_is_line_terminator (lit_utf8_peek_prev (str_curr_p)))
{
JERRY_TRACE_MSG ("match\n");
break; /* tail merge */
}
JERRY_TRACE_MSG ("fail\n");
return NULL; /* fail */
}
case RE_OP_ASSERT_END:
{
JERRY_TRACE_MSG ("Execute RE_OP_ASSERT_END: ");
if (str_curr_p >= re_ctx_p->input_end_p)
{
JERRY_TRACE_MSG ("match\n");
break; /* tail merge */
}
if (!(re_ctx_p->flags & RE_FLAG_MULTILINE))
{
JERRY_TRACE_MSG ("fail\n");
return NULL; /* fail */
}
if (lit_char_is_line_terminator (lit_utf8_peek_next (str_curr_p)))
{
JERRY_TRACE_MSG ("match\n");
break; /* tail merge */
}
JERRY_TRACE_MSG ("fail\n");
return NULL; /* fail */
}
case RE_OP_ASSERT_WORD_BOUNDARY:
case RE_OP_ASSERT_NOT_WORD_BOUNDARY:
{
const bool is_wordchar_left = ((str_curr_p > re_ctx_p->input_start_p)
&& lit_char_is_word_char (lit_utf8_peek_prev (str_curr_p)));
const bool is_wordchar_right = ((str_curr_p < re_ctx_p->input_end_p)
&& lit_char_is_word_char (lit_utf8_peek_next (str_curr_p)));
if (op == RE_OP_ASSERT_WORD_BOUNDARY)
{
JERRY_TRACE_MSG ("Execute RE_OP_ASSERT_WORD_BOUNDARY: ");
if (is_wordchar_left == is_wordchar_right)
{
JERRY_TRACE_MSG ("fail\n");
return NULL; /* fail */
}
}
else
{
JERRY_ASSERT (op == RE_OP_ASSERT_NOT_WORD_BOUNDARY);
JERRY_TRACE_MSG ("Execute RE_OP_ASSERT_NOT_WORD_BOUNDARY: ");
if (is_wordchar_left != is_wordchar_right)
{
JERRY_TRACE_MSG ("fail\n");
return NULL; /* fail */
}
}
JERRY_TRACE_MSG ("match\n");
break; /* tail merge */
}
case RE_OP_LOOKAHEAD_POS:
case RE_OP_LOOKAHEAD_NEG:
{
const lit_utf8_byte_t *matched_p = NULL;
const size_t captures_size = re_ctx_p->captures_count * sizeof (ecma_regexp_capture_t);
ecma_regexp_capture_t *saved_captures_p = (ecma_regexp_capture_t *) jmem_heap_alloc_block (captures_size);
memcpy (saved_captures_p, re_ctx_p->captures_p, captures_size);
do
{
const uint32_t offset = re_get_value (&bc_p);
if (matched_p == NULL)
{
matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
if (ECMA_RE_STACK_LIMIT_REACHED (matched_p))
{
jmem_heap_free_block (saved_captures_p, captures_size);
return matched_p;
}
}
bc_p += offset;
}
while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE);
JERRY_TRACE_MSG ("Execute RE_OP_LOOKAHEAD_POS/NEG: ");
if ((op == RE_OP_LOOKAHEAD_POS && matched_p != NULL)
|| (op == RE_OP_LOOKAHEAD_NEG && matched_p == NULL))
{
JERRY_TRACE_MSG ("match\n");
matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
}
else
{
JERRY_TRACE_MSG ("fail\n");
matched_p = NULL; /* fail */
}
if (matched_p == NULL)
{
/* restore saved */
memcpy (re_ctx_p->captures_p, saved_captures_p, captures_size);
}
jmem_heap_free_block (saved_captures_p, captures_size);
return matched_p;
}
case RE_OP_CHAR_CLASS:
case RE_OP_INV_CHAR_CLASS:
{
JERRY_TRACE_MSG ("Execute RE_OP_CHAR_CLASS/RE_OP_INV_CHAR_CLASS, ");
if (str_curr_p >= re_ctx_p->input_end_p)
{
JERRY_TRACE_MSG ("fail\n");
return NULL; /* fail */
}
const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE;
const ecma_char_t curr_ch = ecma_regexp_canonicalize (lit_utf8_read_next (&str_curr_p), is_ignorecase);
uint32_t range_count = re_get_value (&bc_p);
bool is_match = false;
while (range_count-- > 0)
{
const ecma_char_t ch1 = re_get_char (&bc_p);
if (curr_ch < ch1)
{
bc_p += sizeof (ecma_char_t);
continue;
}
const ecma_char_t ch2 = re_get_char (&bc_p);
is_match = (curr_ch <= ch2);
if (is_match)
{
/* Skip the remaining ranges in the bytecode. */
bc_p += range_count * 2 * sizeof (ecma_char_t);
break;
}
}
JERRY_ASSERT (op == RE_OP_CHAR_CLASS || op == RE_OP_INV_CHAR_CLASS);
if ((op == RE_OP_CHAR_CLASS) != is_match)
{
JERRY_TRACE_MSG ("fail\n");
return NULL; /* fail */
}
JERRY_TRACE_MSG ("match\n");
break; /* tail merge */
}
case RE_OP_BACKREFERENCE:
{
const uint32_t backref_idx = re_get_value (&bc_p);
JERRY_TRACE_MSG ("Execute RE_OP_BACKREFERENCE (idx: %u): ", (unsigned int) backref_idx);
JERRY_ASSERT (backref_idx >= 1 && backref_idx < re_ctx_p->captures_count);
const ecma_regexp_capture_t capture = re_ctx_p->captures_p[backref_idx];
if (capture.begin_p == NULL || capture.end_p == NULL)
{
JERRY_TRACE_MSG ("match\n");
break; /* capture is 'undefined', always matches! */
}
const lit_utf8_size_t capture_size = (lit_utf8_size_t) (capture.end_p - capture.begin_p);
if (str_curr_p + capture_size > re_ctx_p->input_end_p)
{
JERRY_TRACE_MSG ("fail\n");
return NULL; /* fail */
}
if (memcmp (str_curr_p, capture.begin_p, capture_size))
{
JERRY_TRACE_MSG ("fail\n");
return NULL; /* fail */
}
str_curr_p += capture_size;
JERRY_TRACE_MSG ("match\n");
break; /* tail merge */
}
case RE_OP_SAVE_AT_START:
{
JERRY_TRACE_MSG ("Execute RE_OP_SAVE_AT_START\n");
re_ctx_p->captures_p[RE_GLOBAL_CAPTURE].begin_p = str_curr_p;
do
{
const uint32_t offset = re_get_value (&bc_p);
const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
if (matched_p != NULL)
{
return matched_p; /* match */
}
bc_p += offset;
}
while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE);
bc_p -= sizeof (uint8_t);
return NULL; /* fail */
}
case RE_OP_SAVE_AND_MATCH:
{
JERRY_TRACE_MSG ("End of pattern is reached: match\n");
re_ctx_p->captures_p[RE_GLOBAL_CAPTURE].end_p = str_curr_p;
return str_curr_p; /* match */
}
case RE_OP_ALTERNATIVE:
{
/*
* Alternatives should be jumped over, when an alternative opcode appears.
*/
uint32_t offset = re_get_value (&bc_p);
JERRY_TRACE_MSG ("Execute RE_OP_ALTERNATIVE");
bc_p += offset;
while (*bc_p == RE_OP_ALTERNATIVE)
{
JERRY_TRACE_MSG (", jump: %u", (unsigned int) offset);
bc_p++;
offset = re_get_value (&bc_p);
bc_p += offset;
}
JERRY_TRACE_MSG ("\n");
break; /* tail merge */
}
case RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
case RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START:
{
/*
* On non-greedy iterations we have to execute the bytecode
* after the group first, if zero iteration is allowed.
*/
const lit_utf8_byte_t *old_begin_p = NULL;
const uint8_t *const bc_start_p = bc_p; /* save the bytecode start position of the group start */
const uint32_t start_idx = re_get_value (&bc_p);
const uint32_t offset = re_get_value (&bc_p);
uint32_t *iterator_p;
if (RE_IS_CAPTURE_GROUP (op))
{
JERRY_ASSERT (start_idx < re_ctx_p->captures_count);
re_ctx_p->captures_p[start_idx].begin_p = str_curr_p;
iterator_p = &(re_ctx_p->iterations_p[start_idx - 1]);
}
else
{
JERRY_ASSERT (start_idx < re_ctx_p->non_captures_count);
iterator_p = &(re_ctx_p->iterations_p[start_idx + re_ctx_p->captures_count - 1]);
}
*iterator_p = 0;
/* Jump all over to the end of the END opcode. */
bc_p += offset;
/* Try to match after the close paren if zero is allowed */
const lit_utf8_byte_t *matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
if (matched_p != NULL)
{
return str_curr_p; /* match */
}
if (RE_IS_CAPTURE_GROUP (op))
{
re_ctx_p->captures_p[start_idx].begin_p = old_begin_p;
}
bc_p = bc_start_p;
/* FALLTHRU */
}
case RE_OP_CAPTURE_GROUP_START:
case RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START:
case RE_OP_NON_CAPTURE_GROUP_START:
case RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START:
{
const uint8_t *bc_end_p = NULL;
const uint32_t start_idx = re_get_value (&bc_p);
if (op != RE_OP_CAPTURE_GROUP_START
&& op != RE_OP_NON_CAPTURE_GROUP_START)
{
const uint32_t offset = re_get_value (&bc_p);
bc_end_p = bc_p + offset;
}
const lit_utf8_byte_t **group_begin_p;
uint32_t *iterator_p;
if (RE_IS_CAPTURE_GROUP (op))
{
JERRY_ASSERT (start_idx < re_ctx_p->captures_count);
group_begin_p = &(re_ctx_p->captures_p[start_idx].begin_p);
iterator_p = &(re_ctx_p->iterations_p[start_idx - 1]);
}
else
{
JERRY_ASSERT (start_idx < re_ctx_p->non_captures_count);
group_begin_p = &(re_ctx_p->non_captures_p[start_idx].str_p);
iterator_p = &(re_ctx_p->iterations_p[start_idx + re_ctx_p->captures_count - 1]);
}
const lit_utf8_byte_t *const old_begin_p = *group_begin_p;
const uint32_t old_iter_count = *iterator_p;
*group_begin_p = str_curr_p;
*iterator_p = 0;
do
{
const uint32_t offset = re_get_value (&bc_p);
const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
if (matched_p != NULL)
{
return matched_p; /* match */
}
bc_p += offset;
}
while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE);
bc_p -= sizeof (uint8_t);
*iterator_p = old_iter_count;
/* Try to match after the close paren if zero is allowed. */
if (op == RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START
|| op == RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START)
{
JERRY_ASSERT (bc_end_p);
const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_end_p, str_curr_p);
if (matched_p != NULL)
{
return matched_p; /* match */
}
}
*group_begin_p = old_begin_p;
return NULL; /* fail */
}
case RE_OP_CAPTURE_NON_GREEDY_GROUP_END:
case RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END:
{
/*
* On non-greedy iterations we have to execute the bytecode
* after the group first. Try to iterate only if it fails.
*/
const uint8_t *const bc_start_p = bc_p; /* save the bytecode start position of the group end */
const uint32_t end_idx = re_get_value (&bc_p);
const uint32_t min = re_get_value (&bc_p);
const uint32_t max = re_get_value (&bc_p);
re_get_value (&bc_p); /* start offset */
const lit_utf8_byte_t **group_end_p;
uint32_t *iterator_p;
if (RE_IS_CAPTURE_GROUP (op))
{
JERRY_ASSERT (end_idx < re_ctx_p->captures_count);
group_end_p = &(re_ctx_p->captures_p[end_idx].end_p);
iterator_p = &(re_ctx_p->iterations_p[end_idx - 1]);
}
else
{
JERRY_ASSERT (end_idx < re_ctx_p->non_captures_count);
group_end_p = &(re_ctx_p->non_captures_p[end_idx].str_p);
iterator_p = &(re_ctx_p->iterations_p[end_idx + re_ctx_p->captures_count - 1]);
}
(*iterator_p)++;
if (*iterator_p >= min && *iterator_p <= max)
{
const lit_utf8_byte_t *const old_end_p = *group_end_p;
*group_end_p = str_curr_p;
const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
if (matched_p != NULL)
{
return matched_p; /* match */
}
*group_end_p = old_end_p;
}
(*iterator_p)--;
bc_p = bc_start_p;
/* Non-greedy fails, try to iterate. */
/* FALLTHRU */
}
case RE_OP_CAPTURE_GREEDY_GROUP_END:
case RE_OP_NON_CAPTURE_GREEDY_GROUP_END:
{
const uint32_t end_idx = re_get_value (&bc_p);
const uint32_t min = re_get_value (&bc_p);
const uint32_t max = re_get_value (&bc_p);
uint32_t offset = re_get_value (&bc_p);
const lit_utf8_byte_t **group_begin_p;
const lit_utf8_byte_t **group_end_p;
uint32_t *iterator_p;
if (RE_IS_CAPTURE_GROUP (op))
{
JERRY_ASSERT (end_idx < re_ctx_p->captures_count);
group_begin_p = &(re_ctx_p->captures_p[end_idx].begin_p);
group_end_p = &(re_ctx_p->captures_p[end_idx].end_p);
iterator_p = &(re_ctx_p->iterations_p[end_idx - 1]);
}
else
{
JERRY_ASSERT (end_idx <= re_ctx_p->non_captures_count);
group_begin_p = &(re_ctx_p->non_captures_p[end_idx].str_p);
group_end_p = &(re_ctx_p->non_captures_p[end_idx].str_p);
iterator_p = &(re_ctx_p->iterations_p[end_idx + re_ctx_p->captures_count - 1]);
}
/* Check the empty iteration if the minimum number of iterations is reached. */
if (*iterator_p >= min && str_curr_p == *group_begin_p)
{
return NULL; /* fail */
}
(*iterator_p)++;
const uint8_t *const bc_start_p = bc_p; /* Save the bytecode end position of the END opcodes. */
const lit_utf8_byte_t *const old_end_p = *group_end_p;
*group_end_p = str_curr_p;
if (*iterator_p < max)
{
bc_p -= offset;
offset = re_get_value (&bc_p);
const lit_utf8_byte_t *const old_begin_p = *group_begin_p;
*group_begin_p = str_curr_p;
const lit_utf8_byte_t *matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
if (matched_p != NULL)
{
return matched_p; /* match */
}
/* Try to match alternatives if any. */
bc_p += offset;
while (*bc_p == RE_OP_ALTERNATIVE)
{
bc_p++; /* RE_OP_ALTERNATIVE */
offset = re_get_value (&bc_p);
*group_begin_p = str_curr_p;
matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
if (matched_p != NULL)
{
return matched_p; /* match */
}
bc_p += offset;
}
*group_begin_p = old_begin_p;
}
if (*iterator_p >= min && *iterator_p <= max)
{
/* Try to match the rest of the bytecode. */
const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_start_p, str_curr_p);
if (matched_p != NULL)
{
return matched_p; /* match */
}
}
/* restore if fails */
*group_end_p = old_end_p;
(*iterator_p)--;
return NULL; /* fail */
}
case RE_OP_NON_GREEDY_ITERATOR:
{
const uint32_t min = re_get_value (&bc_p);
const uint32_t max = re_get_value (&bc_p);
const uint32_t offset = re_get_value (&bc_p);
JERRY_TRACE_MSG ("Non-greedy iterator, min=%lu, max=%lu, offset=%ld\n",
(unsigned long) min, (unsigned long) max, (long) offset);
uint32_t iter_count = 0;
while (iter_count <= max)
{
if (iter_count >= min)
{
const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p + offset, str_curr_p);
if (matched_p != NULL)
{
return matched_p; /* match */
}
}
const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
if (ECMA_RE_STACK_LIMIT_REACHED (matched_p))
{
return matched_p;
}
if (matched_p == NULL)
{
break;
}
str_curr_p = matched_p;
iter_count++;
}
return NULL; /* fail */
}
default:
{
JERRY_ASSERT (op == RE_OP_GREEDY_ITERATOR);
const uint32_t min = re_get_value (&bc_p);
const uint32_t max = re_get_value (&bc_p);
const uint32_t offset = re_get_value (&bc_p);
JERRY_TRACE_MSG ("Greedy iterator, min=%lu, max=%lu, offset=%ld\n",
(unsigned long) min, (unsigned long) max, (long) offset);
uint32_t iter_count = 0;
while (iter_count < max)
{
const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p);
if (ECMA_RE_STACK_LIMIT_REACHED (matched_p))
{
return matched_p;
}
if (matched_p == NULL)
{
break;
}
str_curr_p = matched_p;
iter_count++;
}
if (iter_count >= min)
{
while (true)
{
const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p + offset, str_curr_p);
if (matched_p != NULL)
{
return matched_p; /* match */
}
if (iter_count == min)
{
break;
}
lit_utf8_read_prev (&str_curr_p);
iter_count--;
}
}
return NULL; /* fail */
}
}
}
} /* ecma_regexp_match */
static ecma_value_t
ecma_regexp_create_result_object (ecma_regexp_ctx_t *re_ctx_p,
ecma_string_t *input_string_p,
uint32_t index)
{
ecma_value_t result_array = ecma_op_create_array_object (0, 0, false);
ecma_object_t *result_p = ecma_get_object_from_value (result_array);
for (uint32_t i = 0; i < re_ctx_p->captures_count; i++)
{
const ecma_regexp_capture_t capture = re_ctx_p->captures_p[i];
if (capture.begin_p != NULL && capture.end_p >= capture.begin_p)
{
const lit_utf8_size_t capture_size = (lit_utf8_size_t) (capture.end_p - capture.begin_p);
ecma_string_t *const capture_str_p = ecma_new_ecma_string_from_utf8 (capture.begin_p, capture_size);
const ecma_value_t capture_value = ecma_make_string_value (capture_str_p);
ecma_builtin_helper_def_prop_by_index (result_p,
i,
capture_value,
ECMA_PROPERTY_CONFIGURABLE_ENUMERABLE_WRITABLE);
ecma_deref_ecma_string (capture_str_p);
}
else
{
ecma_builtin_helper_def_prop_by_index (result_p,
i,
ECMA_VALUE_UNDEFINED,
ECMA_PROPERTY_CONFIGURABLE_ENUMERABLE_WRITABLE);
}
}
ecma_builtin_helper_def_prop (result_p,
ecma_get_magic_string (LIT_MAGIC_STRING_INDEX),
ecma_make_uint32_value (index),
ECMA_PROPERTY_CONFIGURABLE_ENUMERABLE_WRITABLE);
ecma_builtin_helper_def_prop (result_p,
ecma_get_magic_string (LIT_MAGIC_STRING_INPUT),
ecma_make_string_value (input_string_p),
ECMA_PROPERTY_CONFIGURABLE_ENUMERABLE_WRITABLE);
return result_array;
} /* ecma_regexp_create_result_object */
/**
* RegExp helper function to start the recursive matching algorithm
* and create the result Array object
*
* See also:
* ECMA-262 v5, 15.10.6.2
*
* @return array object - if matched
* null - otherwise
*
* May raise error.
* Returned value must be freed with ecma_free_value
*/
ecma_value_t
ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */
ecma_value_t input_string, /**< input string */
bool ignore_global) /**< ignore global flag */
{
ecma_value_t ret_value = ECMA_VALUE_EMPTY;
JERRY_ASSERT (ecma_is_value_object (regexp_value));
JERRY_ASSERT (ecma_is_value_string (input_string));
ecma_object_t *regexp_object_p = ecma_get_object_from_value (regexp_value);
JERRY_ASSERT (ecma_object_class_is (regexp_object_p, LIT_MAGIC_STRING_REGEXP_UL));
ecma_extended_object_t *ext_object_p = (ecma_extended_object_t *) regexp_object_p;
re_compiled_code_t *bc_p = ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t,
ext_object_p->u.class_prop.u.value);
ecma_regexp_ctx_t re_ctx;
ecma_string_t *input_string_p = ecma_get_string_from_value (input_string);
if (bc_p == NULL)
{
#if ENABLED (JERRY_ES2015)
return ecma_raise_type_error (ECMA_ERR_MSG ("Incompatible type"));
#else /* !ENABLED (JERRY_ES2015) */
/* Missing bytecode means the RegExp object is the RegExp.prototype,
* which will always result in an empty string match. */
re_ctx.captures_count = 1;
re_ctx.captures_p = jmem_heap_alloc_block (sizeof (ecma_regexp_capture_t));
re_ctx.captures_p->begin_p = lit_get_magic_string_utf8 (LIT_MAGIC_STRING__EMPTY);
re_ctx.captures_p->end_p = lit_get_magic_string_utf8 (LIT_MAGIC_STRING__EMPTY);
ret_value = ecma_regexp_create_result_object (&re_ctx, input_string_p, 0);
jmem_heap_free_block (re_ctx.captures_p, sizeof (ecma_regexp_capture_t));
return ret_value;
#endif /* ENABLED (JERRY_ES2015) */
}
re_ctx.flags = bc_p->header.status_flags;
if (ignore_global)
{
re_ctx.flags &= (uint16_t) ~RE_FLAG_GLOBAL;
}
lit_utf8_size_t input_size;
lit_utf8_size_t input_length;
uint8_t input_flags = ECMA_STRING_FLAG_IS_ASCII;
const lit_utf8_byte_t *input_buffer_p = ecma_string_get_chars (input_string_p,
&input_size,
&input_length,
NULL,
&input_flags);
const lit_utf8_byte_t *input_curr_p = input_buffer_p;
uint32_t index = 0;
if (re_ctx.flags & RE_FLAG_GLOBAL)
{
ecma_string_t *lastindex_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL);
ecma_value_t lastindex_value = ecma_op_object_get_own_data_prop (regexp_object_p, lastindex_str_p);
ecma_number_t lastindex_num;
ret_value = ecma_get_number (lastindex_value, &lastindex_num);
ecma_free_value (lastindex_value);
if (ECMA_IS_VALUE_ERROR (ret_value))
{
goto cleanup_string;
}
/* TODO: Replace with ToLength */
if (lastindex_num < 0.0f)
{
#if ENABLED (JERRY_ES2015)
lastindex_num = 0.0f;
#else /* !ENABLED (JERRY_ES2015) */
lastindex_num = input_length + 1;
#endif /* ENABLED (JERRY_ES2015) */
}
index = ecma_number_to_uint32 (lastindex_num);
if (index > input_length)
{
ret_value = ecma_op_object_put (regexp_object_p,
lastindex_str_p,
ecma_make_integer_value (0),
true);
if (!ECMA_IS_VALUE_ERROR (ret_value))
{
JERRY_ASSERT (ecma_is_value_boolean (ret_value));
/* lastIndex is out of bounds, the match should fail. */
ret_value = ECMA_VALUE_NULL;
}
goto cleanup_string;
}
if (index > 0)
{
if (input_flags & ECMA_STRING_FLAG_IS_ASCII)
{
input_curr_p += index;
}
else
{
for (uint32_t i = 0; i < index; i++)
{
lit_utf8_incr (&input_curr_p);
}
}
}
}
re_ctx.input_start_p = input_buffer_p;
const lit_utf8_byte_t *input_end_p = re_ctx.input_start_p + input_size;
re_ctx.input_end_p = input_end_p;
JERRY_TRACE_MSG ("Exec with flags [global: %d, ignoreCase: %d, multiline: %d]\n",
re_ctx.flags & RE_FLAG_GLOBAL,
re_ctx.flags & RE_FLAG_IGNORE_CASE,
re_ctx.flags & RE_FLAG_MULTILINE);
re_ctx.captures_count = bc_p->captures_count;
re_ctx.captures_p = jmem_heap_alloc_block (re_ctx.captures_count * sizeof (ecma_regexp_capture_t));
memset (re_ctx.captures_p, 0, re_ctx.captures_count * sizeof (ecma_regexp_capture_t));
re_ctx.non_captures_count = bc_p->non_captures_count;
re_ctx.non_captures_p = jmem_heap_alloc_block (re_ctx.non_captures_count * sizeof (ecma_regexp_non_capture_t));
memset (re_ctx.non_captures_p, 0, re_ctx.non_captures_count * sizeof (ecma_regexp_non_capture_t));
const uint32_t iters_length = re_ctx.captures_count + re_ctx.non_captures_count - 1;
re_ctx.iterations_p = jmem_heap_alloc_block (iters_length * sizeof (uint32_t));
memset (re_ctx.iterations_p, 0, iters_length * sizeof (uint32_t));
/* 2. Try to match */
uint8_t *bc_start_p = (uint8_t *) (bc_p + 1);
const lit_utf8_byte_t *matched_p = NULL;
JERRY_ASSERT (index <= input_length);
while (true)
{
matched_p = ecma_regexp_match (&re_ctx, bc_start_p, input_curr_p);
if (matched_p != NULL)
{
break;
}
index++;
if (index > input_length)
{
if (re_ctx.flags & RE_FLAG_GLOBAL)
{
ecma_value_t put_result = ecma_op_object_put (regexp_object_p,
ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL),
ecma_make_uint32_value (0),
true);
if (ECMA_IS_VALUE_ERROR (put_result))
{
ret_value = put_result;
goto cleanup_context;
}
JERRY_ASSERT (ecma_is_value_boolean (put_result));
}
/* Failed to match, return 'null'. */
ret_value = ECMA_VALUE_NULL;
goto cleanup_context;
}
JERRY_ASSERT (input_curr_p < input_end_p);
lit_utf8_incr (&input_curr_p);
}
JERRY_ASSERT (matched_p != NULL);
if (ECMA_RE_STACK_LIMIT_REACHED (matched_p))
{
ret_value = ecma_raise_range_error (ECMA_ERR_MSG ("Stack limit exceeded."));
goto cleanup_context;
}
if (re_ctx.flags & RE_FLAG_GLOBAL)
{
JERRY_ASSERT (index <= input_length);
lit_utf8_size_t match_length;
const lit_utf8_byte_t *match_begin_p = re_ctx.captures_p[0].begin_p;
const lit_utf8_byte_t *match_end_p = re_ctx.captures_p[0].end_p;
if (input_flags & ECMA_STRING_FLAG_IS_ASCII)
{
match_length = (lit_utf8_size_t) (match_end_p - match_begin_p);
}
else
{
match_length = lit_utf8_string_length (match_begin_p,
(lit_utf8_size_t) (match_end_p - match_begin_p));
}
ecma_value_t put_result = ecma_op_object_put (regexp_object_p,
ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL),
ecma_make_uint32_value (index + match_length),
true);
if (ECMA_IS_VALUE_ERROR (put_result))
{
ret_value = put_result;
goto cleanup_context;
}
JERRY_ASSERT (ecma_is_value_boolean (put_result));
}
ret_value = ecma_regexp_create_result_object (&re_ctx, input_string_p, index);
cleanup_context:
jmem_heap_free_block (re_ctx.captures_p, re_ctx.captures_count * sizeof (ecma_regexp_capture_t));
if (re_ctx.non_captures_p != NULL)
{
jmem_heap_free_block (re_ctx.non_captures_p, re_ctx.non_captures_count * sizeof (ecma_regexp_non_capture_t));
}
if (re_ctx.iterations_p != NULL)
{
jmem_heap_free_block (re_ctx.iterations_p, iters_length * sizeof (uint32_t));
}
cleanup_string:
if (input_flags & ECMA_STRING_FLAG_MUST_BE_FREED)
{
jmem_heap_free_block ((void *) input_buffer_p, input_size);
}
return ret_value;
} /* ecma_regexp_exec_helper */
/**
* Helper function for converting a RegExp pattern parameter to string.
*
* See also:
* RegExp.compile
* RegExp dispatch call
*
* @return empty value if success, error value otherwise
* Returned value must be freed with ecma_free_value.
*/
ecma_value_t
ecma_regexp_read_pattern_str_helper (ecma_value_t pattern_arg, /**< the RegExp pattern */
ecma_string_t **pattern_string_p) /**< [out] ptr to the pattern string ptr */
{
if (!ecma_is_value_undefined (pattern_arg))
{
ecma_value_t regexp_str_value = ecma_op_to_string (pattern_arg);
if (ECMA_IS_VALUE_ERROR (regexp_str_value))
{
return regexp_str_value;
}
*pattern_string_p = ecma_get_string_from_value (regexp_str_value);
if (!ecma_string_is_empty (*pattern_string_p))
{
ecma_ref_ecma_string (*pattern_string_p);
}
ecma_free_value (regexp_str_value); // must be freed *after* ecma_ref_ecma_string
}
if (!*pattern_string_p || ecma_string_is_empty (*pattern_string_p))
{
*pattern_string_p = ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP);
}
return ECMA_VALUE_EMPTY;
} /* ecma_regexp_read_pattern_str_helper */
/**
* @}
* @}
*/
#endif /* ENABLED (JERRY_BUILTIN_REGEXP) */