diff --git a/jerry-core/api/jerry-snapshot.c b/jerry-core/api/jerry-snapshot.c index 1fe83335d..a343c66df 100644 --- a/jerry-core/api/jerry-snapshot.c +++ b/jerry-core/api/jerry-snapshot.c @@ -179,7 +179,7 @@ snapshot_add_compiled_code (ecma_compiled_code_t *compiled_code_p, /**< compiled globals_p->snapshot_buffer_write_offset += sizeof (ecma_compiled_code_t); - ecma_value_t pattern = ((re_compiled_code_t *) compiled_code_p)->pattern; + ecma_value_t pattern = ((re_compiled_code_t *) compiled_code_p)->source; ecma_string_t *pattern_string_p = ecma_get_string_from_value (pattern); ecma_length_t pattern_size = 0; diff --git a/jerry-core/ecma/base/ecma-helpers.c b/jerry-core/ecma/base/ecma-helpers.c index f7d359670..2d5470738 100644 --- a/jerry-core/ecma/base/ecma-helpers.c +++ b/jerry-core/ecma/base/ecma-helpers.c @@ -1403,7 +1403,7 @@ ecma_bytecode_deref (ecma_compiled_code_t *bytecode_p) /**< byte code pointer */ #if ENABLED (JERRY_BUILTIN_REGEXP) re_compiled_code_t *re_bytecode_p = (re_compiled_code_t *) bytecode_p; - ecma_deref_ecma_string (ecma_get_string_from_value (re_bytecode_p->pattern)); + ecma_deref_ecma_string (ecma_get_string_from_value (re_bytecode_p->source)); #endif /* ENABLED (JERRY_BUILTIN_REGEXP) */ } diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.c b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.c index fee427578..74de58d72 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.c +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.c @@ -45,6 +45,185 @@ * @{ */ +#if ENABLED (JERRY_ES2015) +/** + * Helper function to retrieve the flags associated with a RegExp object + * + * @return ECMA_VALUE_ERROR - if 'this' is not a RegExp object + * ECMA_VALUE_EMPTY - otherwise + */ +static ecma_value_t +ecma_builtin_regexp_prototype_flags_helper (ecma_value_t this, /**< this value */ + uint16_t *flags_p) /**< [out] flags */ +{ + if (!ecma_is_value_object (this) + || 
!ecma_object_class_is (ecma_get_object_from_value (this), LIT_MAGIC_STRING_REGEXP_UL)) + { + return ecma_raise_type_error (ECMA_ERR_MSG ("Incompatible type")); + } + + ecma_extended_object_t *re_obj_p = (ecma_extended_object_t *) ecma_get_object_from_value (this); + re_compiled_code_t *bc_p = ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t, + re_obj_p->u.class_prop.u.value); + + if (bc_p != NULL) + { + *flags_p = bc_p->header.status_flags; + } + + return ECMA_VALUE_EMPTY; +} /* ecma_builtin_regexp_prototype_flags_helper */ + +/** + * The RegExp.prototype object's 'flags' accessor property + * + * See also: + * ECMA-262 v6, 21.2.5.3 + * + * @return ECMA_VALUE_ERROR - if 'this' is not a RegExp object + * string value - otherwise + * + * Returned value must be freed with ecma_free_value. + */ +static ecma_value_t +ecma_builtin_regexp_prototype_get_flags (ecma_value_t this_arg) /**< this argument */ +{ + uint16_t flags = RE_FLAG_EMPTY; + ecma_value_t ret_value = ecma_builtin_regexp_prototype_flags_helper (this_arg, &flags); + if (ECMA_IS_VALUE_ERROR (ret_value)) + { + return ret_value; + } + + ecma_stringbuilder_t result = ecma_stringbuilder_create (); + + if (flags & RE_FLAG_GLOBAL) + { + ecma_stringbuilder_append_byte (&result, LIT_CHAR_LOWERCASE_G); + } + + if (flags & RE_FLAG_IGNORE_CASE) + { + ecma_stringbuilder_append_byte (&result, LIT_CHAR_LOWERCASE_I); + } + + if (flags & RE_FLAG_MULTILINE) + { + ecma_stringbuilder_append_byte (&result, LIT_CHAR_LOWERCASE_M); + } + + return ecma_make_string_value (ecma_stringbuilder_finalize (&result)); +} /* ecma_builtin_regexp_prototype_get_flags */ + +/** + * The RegExp.prototype object's 'source' accessor property + * + * See also: + * ECMA-262 v6, 21.2.5.10 + * + * @return ECMA_VALUE_ERROR - if 'this' is not a RegExp object + * string value - otherwise + * + * Returned value must be freed with ecma_free_value. 
+ */ +static ecma_value_t +ecma_builtin_regexp_prototype_get_source (ecma_value_t this_arg) /**< this argument */ +{ + if (!ecma_is_value_object (this_arg) + || !ecma_object_class_is (ecma_get_object_from_value (this_arg), LIT_MAGIC_STRING_REGEXP_UL)) + { + return ecma_raise_type_error (ECMA_ERR_MSG ("Incompatible type")); + } + + ecma_extended_object_t *re_obj_p = (ecma_extended_object_t *) ecma_get_object_from_value (this_arg); + re_compiled_code_t *bc_p = ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t, + re_obj_p->u.class_prop.u.value); + + if (bc_p != NULL) + { + ecma_ref_ecma_string (ecma_get_string_from_value (bc_p->source)); + return bc_p->source; + } + + return ecma_make_string_value (ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP)); +} /* ecma_builtin_regexp_prototype_get_source */ + +/** + * The RegExp.prototype object's 'global' accessor property + * + * See also: + * ECMA-262 v6, 21.2.5.4 + * + * @return ECMA_VALUE_ERROR - if 'this' is not a RegExp object + * ECMA_VALUE_TRUE - if 'global' flag is set + * ECMA_VALUE_FALSE - otherwise + * + * Returned value must be freed with ecma_free_value. + */ +static ecma_value_t +ecma_builtin_regexp_prototype_get_global (ecma_value_t this_arg) /**< this argument */ +{ + uint16_t flags = RE_FLAG_EMPTY; + ecma_value_t ret_value = ecma_builtin_regexp_prototype_flags_helper (this_arg, &flags); + if (ECMA_IS_VALUE_ERROR (ret_value)) + { + return ret_value; + } + + return ecma_make_boolean_value (flags & RE_FLAG_GLOBAL); +} /* ecma_builtin_regexp_prototype_get_global */ + +/** + * The RegExp.prototype object's 'ignoreCase' accessor property + * + * See also: + * ECMA-262 v6, 21.2.5.5 + * + * @return ECMA_VALUE_ERROR - if 'this' is not a RegExp object + * ECMA_VALUE_TRUE - if 'ignoreCase' flag is set + * ECMA_VALUE_FALSE - otherwise + * + * Returned value must be freed with ecma_free_value. 
+ */ +static ecma_value_t +ecma_builtin_regexp_prototype_get_ignorecase (ecma_value_t this_arg) /**< this argument */ +{ + uint16_t flags = RE_FLAG_EMPTY; + ecma_value_t ret_value = ecma_builtin_regexp_prototype_flags_helper (this_arg, &flags); + if (ECMA_IS_VALUE_ERROR (ret_value)) + { + return ret_value; + } + + return ecma_make_boolean_value (flags & RE_FLAG_IGNORE_CASE); +} /* ecma_builtin_regexp_prototype_get_ignorecase */ + +/** + * The RegExp.prototype object's 'multiline' accessor property + * + * See also: + * ECMA-262 v6, 21.2.5.7 + * + * @return ECMA_VALUE_ERROR - if 'this' is not a RegExp object + * ECMA_VALUE_TRUE - if 'multiline' flag is set + * ECMA_VALUE_FALSE - otherwise + * + * Returned value must be freed with ecma_free_value. + */ +static ecma_value_t +ecma_builtin_regexp_prototype_get_multiline (ecma_value_t this_arg) /**< this argument */ +{ + uint16_t flags = RE_FLAG_EMPTY; + ecma_value_t ret_value = ecma_builtin_regexp_prototype_flags_helper (this_arg, &flags); + if (ECMA_IS_VALUE_ERROR (ret_value)) + { + return ret_value; + } + + return ecma_make_boolean_value (flags & RE_FLAG_MULTILINE); +} /* ecma_builtin_regexp_prototype_get_multiline */ +#endif /* ENABLED (JERRY_ES2015) */ + #if ENABLED (JERRY_BUILTIN_ANNEXB) /** @@ -68,89 +247,40 @@ ecma_builtin_regexp_prototype_compile (ecma_value_t this_arg, /**< this argument /* The builtin RegExp.prototype object does not have [[RegExpMatcher]] internal slot */ || ecma_get_object_from_value (this_arg) == ecma_builtin_get (ECMA_BUILTIN_ID_REGEXP_PROTOTYPE)) { - return ecma_raise_type_error (ECMA_ERR_MSG ("Incomplete RegExp type")); + return ecma_raise_type_error (ECMA_ERR_MSG ("Incompatible type")); } uint16_t flags = 0; if (ecma_is_value_object (pattern_arg) - && ecma_object_class_is (ecma_get_object_from_value (pattern_arg), LIT_MAGIC_STRING_REGEXP_UL)) + && ecma_object_class_is (ecma_get_object_from_value (pattern_arg), LIT_MAGIC_STRING_REGEXP_UL) + && ecma_get_object_from_value (pattern_arg) != 
ecma_builtin_get (ECMA_BUILTIN_ID_REGEXP_PROTOTYPE)) { if (!ecma_is_value_undefined (flags_arg)) { - return ecma_raise_type_error (ECMA_ERR_MSG ("Invalid argument of RegExp compile.")); - } - /* Compile from existing RegExp pbject. */ - ecma_object_t *target_p = ecma_get_object_from_value (pattern_arg); - - /* Get source. */ - ecma_string_t *magic_string_p = ecma_get_magic_string (LIT_MAGIC_STRING_SOURCE); - ecma_value_t source_value = ecma_op_object_get_own_data_prop (target_p, magic_string_p); - ecma_string_t *pattern_string_p = ecma_get_string_from_value (source_value); - - /* Get flags. */ - magic_string_p = ecma_get_magic_string (LIT_MAGIC_STRING_GLOBAL); - ecma_value_t global_value = ecma_op_object_get_own_data_prop (target_p, magic_string_p); - - JERRY_ASSERT (ecma_is_value_boolean (global_value)); - - if (ecma_is_value_true (global_value)) - { - flags |= RE_FLAG_GLOBAL; + return ecma_raise_type_error (ECMA_ERR_MSG ("Invalid argument")); } - magic_string_p = ecma_get_magic_string (LIT_MAGIC_STRING_IGNORECASE_UL); - ecma_value_t ignore_case_value = ecma_op_object_get_own_data_prop (target_p, magic_string_p); + /* Compile from existing RegExp object. 
*/ + ecma_extended_object_t *target_p = (ecma_extended_object_t *) ecma_get_object_from_value (pattern_arg); + re_compiled_code_t *target_bc_p = ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t, + target_p->u.class_prop.u.value); - JERRY_ASSERT (ecma_is_value_boolean (ignore_case_value)); + ecma_object_t *this_object_p = ecma_get_object_from_value (this_arg); + ecma_extended_object_t *current_p = (ecma_extended_object_t *) this_object_p; - if (ecma_is_value_true (ignore_case_value)) - { - flags |= RE_FLAG_IGNORE_CASE; - } + re_compiled_code_t *current_bc_p = ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t, + current_p->u.class_prop.u.value); - magic_string_p = ecma_get_magic_string (LIT_MAGIC_STRING_MULTILINE); - ecma_value_t multiline_value = ecma_op_object_get_own_data_prop (target_p, magic_string_p); - - JERRY_ASSERT (ecma_is_value_boolean (multiline_value)); - - if (ecma_is_value_true (multiline_value)) - { - flags |= RE_FLAG_MULTILINE; - } - - ecma_value_t obj_this = ecma_op_to_object (this_arg); - if (ECMA_IS_VALUE_ERROR (obj_this)) - { - return obj_this; - } - ecma_object_t *this_obj_p = ecma_get_object_from_value (obj_this); - - /* Get bytecode property. */ - ecma_value_t *bc_prop_p = &(((ecma_extended_object_t *) this_obj_p)->u.class_prop.u.value); - - /* TODO: We currently have to re-compile the bytecode, because - * we can't copy it without knowing its length. */ - const re_compiled_code_t *new_bc_p = NULL; - ecma_value_t bc_comp = re_compile_bytecode (&new_bc_p, pattern_string_p, flags); - /* Should always succeed, since we're compiling from a source that has been compiled previously. 
*/ - JERRY_ASSERT (ecma_is_value_empty (bc_comp)); - - ecma_deref_ecma_string (pattern_string_p); - - re_compiled_code_t *old_bc_p = ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t, *bc_prop_p); - - if (old_bc_p != NULL) - { - /* Free the old bytecode */ - ecma_bytecode_deref ((ecma_compiled_code_t *) old_bc_p); - } - - ECMA_SET_INTERNAL_VALUE_POINTER (*bc_prop_p, new_bc_p); - - re_initialize_props (this_obj_p, pattern_string_p, flags); - ecma_free_value (obj_this); + JERRY_ASSERT (current_bc_p != NULL); + ecma_bytecode_deref ((ecma_compiled_code_t *) current_bc_p); + JERRY_ASSERT (target_bc_p != NULL); + ecma_bytecode_ref ((ecma_compiled_code_t *) target_bc_p); + ECMA_SET_INTERNAL_VALUE_POINTER (current_p->u.class_prop.u.value, target_bc_p); + ecma_regexp_initialize_props (this_object_p, + ecma_get_string_from_value (target_bc_p->source), + target_bc_p->header.status_flags); return ecma_copy_value (this_arg); } @@ -175,7 +305,7 @@ ecma_builtin_regexp_prototype_compile (ecma_value_t this_arg, /**< this argument return flags_str_value; } - ecma_value_t parsed_flags_val = re_parse_regexp_flags (ecma_get_string_from_value (flags_str_value), &flags); + ecma_value_t parsed_flags_val = ecma_regexp_parse_flags (ecma_get_string_from_value (flags_str_value), &flags); ecma_free_value (flags_str_value); if (ECMA_IS_VALUE_ERROR (parsed_flags_val)) { @@ -193,26 +323,16 @@ ecma_builtin_regexp_prototype_compile (ecma_value_t this_arg, /**< this argument return bc_val; } - ecma_value_t obj_this = ecma_op_to_object (this_arg); - if (ECMA_IS_VALUE_ERROR (obj_this)) - { - ecma_deref_ecma_string (pattern_string_p); - return obj_this; - } - ecma_object_t *this_obj_p = ecma_get_object_from_value (obj_this); + ecma_object_t *this_obj_p = ecma_get_object_from_value (this_arg); ecma_value_t *bc_prop_p = &(((ecma_extended_object_t *) this_obj_p)->u.class_prop.u.value); re_compiled_code_t *old_bc_p = ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t, *bc_prop_p); - if (old_bc_p 
!= NULL) - { - /* Free the old bytecode */ - ecma_bytecode_deref ((ecma_compiled_code_t *) old_bc_p); - } + JERRY_ASSERT (old_bc_p != NULL); + ecma_bytecode_deref ((ecma_compiled_code_t *) old_bc_p); ECMA_SET_INTERNAL_VALUE_POINTER (*bc_prop_p, new_bc_p); - re_initialize_props (this_obj_p, pattern_string_p, flags); - ecma_free_value (obj_this); + ecma_regexp_initialize_props (this_obj_p, pattern_string_p, flags); ecma_deref_ecma_string (pattern_string_p); return ecma_copy_value (this_arg); @@ -254,26 +374,7 @@ ecma_builtin_regexp_prototype_exec (ecma_value_t this_arg, /**< this argument */ return input_str_value; } - ecma_object_t *obj_p = ecma_get_object_from_value (obj_this); - ecma_value_t *bytecode_prop_p = &(((ecma_extended_object_t *) obj_p)->u.class_prop.u.value); - - void *bytecode_p = ECMA_GET_INTERNAL_VALUE_ANY_POINTER (void, *bytecode_prop_p); - - ecma_value_t ret_value; - if (bytecode_p == NULL) - { - /* Missing bytecode means empty RegExp: '/(?:)/', so always return empty string. 
*/ - ecma_value_t empty_str_val = ecma_make_magic_string_value (LIT_MAGIC_STRING__EMPTY); - ret_value = ecma_op_create_array_object (&empty_str_val, 1, false); - re_set_result_array_properties (ecma_get_object_from_value (ret_value), - ecma_get_string_from_value (input_str_value), - 1, - 0); - } - else - { - ret_value = ecma_regexp_exec_helper (obj_this, input_str_value, false); - } + ecma_value_t ret_value = ecma_regexp_exec_helper (obj_this, input_str_value, false); ecma_free_value (obj_this); ecma_free_value (input_str_value); @@ -296,15 +397,15 @@ static ecma_value_t ecma_builtin_regexp_prototype_test (ecma_value_t this_arg, /**< this argument */ ecma_value_t arg) /**< routine's argument */ { - ecma_value_t ret_value = ECMA_VALUE_EMPTY; + ecma_value_t result = ecma_builtin_regexp_prototype_exec (this_arg, arg); - ECMA_TRY_CATCH (match_value, - ecma_builtin_regexp_prototype_exec (this_arg, arg), - ret_value); + if (ECMA_IS_VALUE_ERROR (result)) + { + return result; + } - ret_value = ecma_make_boolean_value (!ecma_is_value_null (match_value)); - - ECMA_FINALIZE (match_value); + ecma_value_t ret_value = ecma_make_boolean_value (!ecma_is_value_null (result)); + ecma_free_value (result); return ret_value; } /* ecma_builtin_regexp_prototype_test */ @@ -321,77 +422,53 @@ ecma_builtin_regexp_prototype_test (ecma_value_t this_arg, /**< this argument */ static ecma_value_t ecma_builtin_regexp_prototype_to_string (ecma_value_t this_arg) /**< this argument */ { - ecma_value_t ret_value = ECMA_VALUE_EMPTY; - if (!ecma_is_value_object (this_arg) || !ecma_object_class_is (ecma_get_object_from_value (this_arg), LIT_MAGIC_STRING_REGEXP_UL)) { - ret_value = ecma_raise_type_error (ECMA_ERR_MSG ("Incomplete RegExp type")); + return ecma_raise_type_error (ECMA_ERR_MSG ("Incompatible type")); + } + + ecma_object_t *obj_p = ecma_get_object_from_value (this_arg); + ecma_extended_object_t *re_obj_p = (ecma_extended_object_t *) obj_p; + + re_compiled_code_t *bc_p = 
ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t, + re_obj_p->u.class_prop.u.value); + + ecma_string_t *source_p; + uint16_t flags; + + if (bc_p != NULL) + { + source_p = ecma_get_string_from_value (bc_p->source); + flags = bc_p->header.status_flags; } else { - ECMA_TRY_CATCH (obj_this, - ecma_op_to_object (this_arg), - ret_value); - - ecma_object_t *obj_p = ecma_get_object_from_value (obj_this); - - /* Get RegExp source from the source property */ - ecma_string_t *magic_string_p = ecma_get_magic_string (LIT_MAGIC_STRING_SOURCE); - ecma_value_t source_value = ecma_op_object_get_own_data_prop (obj_p, magic_string_p); - - ecma_string_t *output_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_SLASH_CHAR); - ecma_string_t *source_str_p = ecma_get_string_from_value (source_value); - output_str_p = ecma_concat_ecma_strings (output_str_p, source_str_p); - ecma_deref_ecma_string (source_str_p); - - lit_utf8_byte_t flags[4]; - lit_utf8_byte_t *flags_p = flags; - - *flags_p++ = LIT_CHAR_SLASH; - - /* Check the global flag */ - magic_string_p = ecma_get_magic_string (LIT_MAGIC_STRING_GLOBAL); - ecma_value_t global_value = ecma_op_object_get_own_data_prop (obj_p, magic_string_p); - - JERRY_ASSERT (ecma_is_value_boolean (global_value)); - - if (ecma_is_value_true (global_value)) - { - *flags_p++ = LIT_CHAR_LOWERCASE_G; - } - - /* Check the ignoreCase flag */ - magic_string_p = ecma_get_magic_string (LIT_MAGIC_STRING_IGNORECASE_UL); - ecma_value_t ignore_case_value = ecma_op_object_get_own_data_prop (obj_p, magic_string_p); - - JERRY_ASSERT (ecma_is_value_boolean (ignore_case_value)); - - if (ecma_is_value_true (ignore_case_value)) - { - *flags_p++ = LIT_CHAR_LOWERCASE_I; - } - - /* Check the multiline flag */ - magic_string_p = ecma_get_magic_string (LIT_MAGIC_STRING_MULTILINE); - ecma_value_t multiline_value = ecma_op_object_get_own_data_prop (obj_p, magic_string_p); - - JERRY_ASSERT (ecma_is_value_boolean (multiline_value)); - - if (ecma_is_value_true (multiline_value)) 
- { - *flags_p++ = LIT_CHAR_LOWERCASE_M; - } - - lit_utf8_size_t size = (lit_utf8_size_t) (flags_p - flags); - output_str_p = ecma_append_chars_to_string (output_str_p, flags, size, size); - - ret_value = ecma_make_string_value (output_str_p); - - ECMA_FINALIZE (obj_this); + source_p = ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP); + flags = RE_FLAG_EMPTY; } - return ret_value; + ecma_stringbuilder_t result = ecma_stringbuilder_create (); + ecma_stringbuilder_append_byte (&result, LIT_CHAR_SLASH); + ecma_stringbuilder_append (&result, source_p); + ecma_stringbuilder_append_byte (&result, LIT_CHAR_SLASH); + + if (flags & RE_FLAG_GLOBAL) + { + ecma_stringbuilder_append_byte (&result, LIT_CHAR_LOWERCASE_G); + } + + if (flags & RE_FLAG_IGNORE_CASE) + { + ecma_stringbuilder_append_byte (&result, LIT_CHAR_LOWERCASE_I); + } + + if (flags & RE_FLAG_MULTILINE) + { + ecma_stringbuilder_append_byte (&result, LIT_CHAR_LOWERCASE_M); + } + + return ecma_make_string_value (ecma_stringbuilder_finalize (&result)); } /* ecma_builtin_regexp_prototype_to_string */ /** diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.inc.h b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.inc.h index 97c1d42f0..33a1375bf 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.inc.h +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.inc.h @@ -26,6 +26,27 @@ OBJECT_VALUE (LIT_MAGIC_STRING_CONSTRUCTOR, ECMA_BUILTIN_ID_REGEXP, ECMA_PROPERTY_CONFIGURABLE_WRITABLE) +#if ENABLED (JERRY_ES2015) +ACCESSOR_READ_ONLY (LIT_MAGIC_STRING_FLAGS, + ecma_builtin_regexp_prototype_get_flags, + ECMA_PROPERTY_FIXED) + +ACCESSOR_READ_ONLY (LIT_MAGIC_STRING_SOURCE, + ecma_builtin_regexp_prototype_get_source, + ECMA_PROPERTY_FIXED) + +ACCESSOR_READ_ONLY (LIT_MAGIC_STRING_GLOBAL, + ecma_builtin_regexp_prototype_get_global, + ECMA_PROPERTY_FIXED) + +ACCESSOR_READ_ONLY (LIT_MAGIC_STRING_IGNORECASE_UL, + 
ecma_builtin_regexp_prototype_get_ignorecase, + ECMA_PROPERTY_FIXED) + +ACCESSOR_READ_ONLY (LIT_MAGIC_STRING_MULTILINE, + ecma_builtin_regexp_prototype_get_multiline, + ECMA_PROPERTY_FIXED) +#else /* !ENABLED (JERRY_ES2015) */ /* ECMA-262 v5, 15.10.7.1 */ STRING_VALUE (LIT_MAGIC_STRING_SOURCE, LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP, @@ -45,6 +66,7 @@ SIMPLE_VALUE (LIT_MAGIC_STRING_IGNORECASE_UL, SIMPLE_VALUE (LIT_MAGIC_STRING_MULTILINE, ECMA_VALUE_FALSE, ECMA_PROPERTY_FIXED) +#endif /* ENABLED (JERRY_ES2015) */ /* ECMA-262 v5, 15.10.7.5 */ NUMBER_VALUE (LIT_MAGIC_STRING_LASTINDEX_UL, diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp.c b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp.c index 65f8b0ba0..592aec18b 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp.c +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp.c @@ -110,7 +110,7 @@ ecma_builtin_regexp_dispatch_construct (const ecma_value_t *arguments_list_p, /* ecma_string_t *flags_string_p = ecma_get_string_from_value (flags_str_value); JERRY_ASSERT (flags_string_p != NULL); - ret_value = re_parse_regexp_flags (flags_string_p, &flags); + ret_value = ecma_regexp_parse_flags (flags_string_p, &flags); ecma_free_value (flags_str_value); // implicit frees flags_string_p if (ECMA_IS_VALUE_ERROR (ret_value)) diff --git a/jerry-core/ecma/operations/ecma-regexp-object.c b/jerry-core/ecma/operations/ecma-regexp-object.c index 146c105d5..f3bcb432f 100644 --- a/jerry-core/ecma/operations/ecma-regexp-object.c +++ b/jerry-core/ecma/operations/ecma-regexp-object.c @@ -40,24 +40,9 @@ */ /** - * RegExp results are stored in an array of string pointers. If N is the number - * of groups then the length of the array is 2*N, because every group has a start - * and end. We have to handle those pointers. - * - * [0] RE global start - * [1] RE global end - * [2] 1st group start - * [3] 1st group end - * ... 
- * [n] n/2 th group start - * [n+1] n/2 th group end + * Index of the global capturing group */ -#define RE_GLOBAL_START_IDX 0 - -/** - * @copydoc RE_GLOBAL_START_IDX - */ -#define RE_GLOBAL_END_IDX 1 +#define RE_GLOBAL_CAPTURE 0 /** * Check if a RegExp opcode is a capture group or not @@ -75,8 +60,8 @@ * Returned value must be freed with ecma_free_value */ ecma_value_t -re_parse_regexp_flags (ecma_string_t *flags_str_p, /**< Input string with flags */ - uint16_t *flags_p) /**< [out] parsed flag bits */ +ecma_regexp_parse_flags (ecma_string_t *flags_str_p, /**< Input string with flags */ + uint16_t *flags_p) /**< [out] parsed flag bits */ { ecma_value_t ret_value = ECMA_VALUE_EMPTY; @@ -128,77 +113,89 @@ re_parse_regexp_flags (ecma_string_t *flags_str_p, /**< Input string with flags ECMA_FINALIZE_UTF8_STRING (flags_start_p, flags_start_size); return ret_value; -} /* re_parse_regexp_flags */ +} /* ecma_regexp_parse_flags */ -/** - * Set a data property value for a regexp object. +/* + * Create the properties of a RegExp instance. 
*/ static void -re_set_data_property (ecma_object_t *re_object_p, /**< RegExp object */ - ecma_string_t *property_name_p, /**< property name */ - uint8_t prop_attributes, /**< property attributes */ - ecma_value_t value) /**< property value */ +ecma_regexp_create_props (ecma_object_t *re_object_p) /**< RegExp object */ +{ +#if !ENABLED (JERRY_ES2015) + ecma_create_named_data_property (re_object_p, + ecma_get_magic_string (LIT_MAGIC_STRING_SOURCE), + ECMA_PROPERTY_FIXED, + NULL); + ecma_create_named_data_property (re_object_p, + ecma_get_magic_string (LIT_MAGIC_STRING_GLOBAL), + ECMA_PROPERTY_FIXED, + NULL); + ecma_create_named_data_property (re_object_p, + ecma_get_magic_string (LIT_MAGIC_STRING_IGNORECASE_UL), + ECMA_PROPERTY_FIXED, + NULL); + ecma_create_named_data_property (re_object_p, + ecma_get_magic_string (LIT_MAGIC_STRING_MULTILINE), + ECMA_PROPERTY_FIXED, + NULL); +#endif /* !ENABLED (JERRY_ES2015) */ + ecma_create_named_data_property (re_object_p, + ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL), + ECMA_PROPERTY_FLAG_WRITABLE, + NULL); +} /* ecma_regexp_create_props */ + +/* + * Helper function to assign a value to a property + */ +static void +ecma_regexp_helper_assign_prop (ecma_object_t *re_object_p, /**< RegExp object */ + lit_magic_string_id_t prop_id, /**< property name ide */ + ecma_value_t value) /**< value */ { ecma_property_ref_t property_ref; - ecma_property_t property = ecma_op_object_get_own_property (re_object_p, - property_name_p, - &property_ref, - ECMA_PROPERTY_GET_VALUE); - - if (property == ECMA_PROPERTY_TYPE_NOT_FOUND) - { - property_ref.value_p = ecma_create_named_data_property (re_object_p, - property_name_p, - prop_attributes, - NULL); - } - else - { - JERRY_ASSERT (ECMA_PROPERTY_GET_TYPE (property) == ECMA_PROPERTY_TYPE_NAMEDDATA - && !ecma_is_property_configurable (property)); - } - - ecma_named_data_property_assign_value (re_object_p, property_ref.value_p, value); -} /* re_set_data_property */ + 
ecma_op_object_get_own_property (re_object_p, + ecma_get_magic_string (prop_id), + &property_ref, + ECMA_PROPERTY_GET_VALUE); + ecma_named_data_property_assign_value (re_object_p, + property_ref.value_p, + value); +} /* ecma_regexp_helper_assign_prop */ /** - * Initializes the source, global, ignoreCase, multiline, and lastIndex properties of RegExp instance. + * Initializes the properties of a RegExp instance. */ void -re_initialize_props (ecma_object_t *re_obj_p, /**< RegExp object */ - ecma_string_t *source_p, /**< source string */ - uint16_t flags) /**< flags */ +ecma_regexp_initialize_props (ecma_object_t *re_object_p, /**< RegExp object */ + ecma_string_t *source_p, /**< source string */ + uint16_t flags) /**< flags */ { - /* Set source property. ECMA-262 v5, 15.10.7.1 */ - re_set_data_property (re_obj_p, - ecma_get_magic_string (LIT_MAGIC_STRING_SOURCE), - ECMA_PROPERTY_FIXED, - ecma_make_string_value (source_p)); +#if !ENABLED (JERRY_ES2015) + ecma_regexp_helper_assign_prop (re_object_p, + LIT_MAGIC_STRING_SOURCE, + ecma_make_string_value (source_p)); - /* Set global property. ECMA-262 v5, 15.10.7.2 */ - re_set_data_property (re_obj_p, - ecma_get_magic_string (LIT_MAGIC_STRING_GLOBAL), - ECMA_PROPERTY_FIXED, - ecma_make_boolean_value (flags & RE_FLAG_GLOBAL)); + ecma_regexp_helper_assign_prop (re_object_p, + LIT_MAGIC_STRING_GLOBAL, + ecma_make_boolean_value (flags & RE_FLAG_GLOBAL)); - /* Set ignoreCase property. ECMA-262 v5, 15.10.7.3 */ - re_set_data_property (re_obj_p, - ecma_get_magic_string (LIT_MAGIC_STRING_IGNORECASE_UL), - ECMA_PROPERTY_FIXED, - ecma_make_boolean_value (flags & RE_FLAG_IGNORE_CASE)); + ecma_regexp_helper_assign_prop (re_object_p, + LIT_MAGIC_STRING_IGNORECASE_UL, + ecma_make_boolean_value (flags & RE_FLAG_IGNORE_CASE)); - /* Set multiline property. 
ECMA-262 v5, 15.10.7.4 */ - re_set_data_property (re_obj_p, - ecma_get_magic_string (LIT_MAGIC_STRING_MULTILINE), - ECMA_PROPERTY_FIXED, - ecma_make_boolean_value (flags & RE_FLAG_MULTILINE)); + ecma_regexp_helper_assign_prop (re_object_p, + LIT_MAGIC_STRING_MULTILINE, + ecma_make_boolean_value (flags & RE_FLAG_MULTILINE)); +#else /* ENABLED (JERRY_ES2015) */ + JERRY_UNUSED (source_p); + JERRY_UNUSED (flags); +#endif /* !ENABLED (JERRY_ES2015) */ - /* Set lastIndex property. ECMA-262 v5, 15.10.7.5 */ - re_set_data_property (re_obj_p, - ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL), - ECMA_PROPERTY_FLAG_WRITABLE, - ecma_make_integer_value (0)); -} /* re_initialize_props */ + ecma_regexp_helper_assign_prop (re_object_p, + LIT_MAGIC_STRING_LASTINDEX_UL, + ecma_make_uint32_value (0)); +} /* ecma_regexp_initialize_props */ /** * RegExp object creation operation. @@ -228,10 +225,11 @@ ecma_op_create_regexp_object_from_bytecode (re_compiled_code_t *bytecode_p) /**< ECMA_SET_INTERNAL_VALUE_POINTER (ext_object_p->u.class_prop.u.value, bytecode_p); ecma_bytecode_ref ((ecma_compiled_code_t *) bytecode_p); - /* Initialize RegExp object properties */ - re_initialize_props (object_p, - ecma_get_string_from_value (bytecode_p->pattern), - bytecode_p->header.status_flags); + /* Create and initialize RegExp object properties */ + ecma_regexp_create_props (object_p); + ecma_regexp_initialize_props (object_p, + ecma_get_string_from_value (bytecode_p->source), + bytecode_p->header.status_flags); return ecma_make_object_value (object_p); } /* ecma_op_create_regexp_object_from_bytecode */ @@ -260,7 +258,8 @@ ecma_op_create_regexp_object (ecma_string_t *pattern_p, /**< input pattern */ ecma_extended_object_t *ext_object_p = (ecma_extended_object_t *) object_p; ext_object_p->u.class_prop.class_id = LIT_MAGIC_STRING_UNDEFINED; - re_initialize_props (object_p, pattern_p, flags); + ecma_regexp_create_props (object_p); + ecma_regexp_initialize_props (object_p, pattern_p, flags); /* 
Compile bytecode. */ const re_compiled_code_t *bc_p = NULL; @@ -280,6 +279,44 @@ ecma_op_create_regexp_object (ecma_string_t *pattern_p, /**< input pattern */ return ecma_make_object_value (object_p); } /* ecma_op_create_regexp_object */ +/** + * Canonicalize a character + * + * @return ecma_char_t canonicalized character + */ +ecma_char_t +ecma_regexp_canonicalize_char (ecma_char_t ch) /**< character */ +{ + if (JERRY_LIKELY (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)) + { + if (ch >= LIT_CHAR_LOWERCASE_A && ch <= LIT_CHAR_LOWERCASE_Z) + { + return (ecma_char_t) (ch - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A)); + } + + return ch; + } + + ecma_char_t u[LIT_MAXIMUM_OTHER_CASE_LENGTH]; + const ecma_length_t size = lit_char_to_upper_case (ch, u, LIT_MAXIMUM_OTHER_CASE_LENGTH); + + /* 3. */ + if (size != 1) + { + return ch; + } + /* 4. */ + const ecma_char_t cu = u[0]; + /* 5. */ + if (cu >= 128) + { + /* 6. */ + return cu; + } + + return ch; +} /* ecma_regexp_canonicalize_char */ + /** * RegExp Canonicalize abstract operation * @@ -288,65 +325,37 @@ ecma_op_create_regexp_object (ecma_string_t *pattern_p, /**< input pattern */ * @return ecma_char_t canonicalized character */ inline ecma_char_t JERRY_ATTR_ALWAYS_INLINE -re_canonicalize (ecma_char_t ch, /**< character */ - bool is_ignorecase) /**< IgnoreCase flag */ +ecma_regexp_canonicalize (ecma_char_t ch, /**< character */ + bool is_ignorecase) /**< IgnoreCase flag */ { - ecma_char_t ret_value = ch; - if (is_ignorecase) { - if (ch < 128) - { - /* ASCII fast path. */ - if (ch >= LIT_CHAR_LOWERCASE_A && ch <= LIT_CHAR_LOWERCASE_Z) - { - ret_value = (ecma_char_t) (ch - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A)); - } - } - else - { - /* 2. */ - ecma_char_t u[LIT_MAXIMUM_OTHER_CASE_LENGTH]; - ecma_length_t size = lit_char_to_upper_case (ch, u, LIT_MAXIMUM_OTHER_CASE_LENGTH); - - /* 3. */ - if (size == 1) - { - /* 4. */ - ecma_char_t cu = u[0]; - /* 5. */ - if (cu >= 128) - { - /* 6. 
*/ - ret_value = cu; - } - } - } + return ecma_regexp_canonicalize_char (ch); } - return ret_value; -} /* re_canonicalize */ + return ch; +} /* ecma_regexp_canonicalize */ /** - * Recursive function for RegExp matching. Tests for a regular expression - * match and returns a MatchResult value. + * Recursive function for RegExp matching. * * See also: * ECMA-262 v5, 15.10.2.1 * * @return true - if matched * false - otherwise - * - * May raise error, so returned value must be freed with ecma_free_value */ -static ecma_value_t -re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ - uint8_t *bc_p, /**< pointer to the current RegExp bytecode */ - const lit_utf8_byte_t *str_p, /**< input string pointer */ - const lit_utf8_byte_t **out_str_p) /**< [out] matching substring iterator */ +static const lit_utf8_byte_t * +ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ + const uint8_t *bc_p, /**< pointer to the current RegExp bytecode */ + const lit_utf8_byte_t *str_curr_p) /**< input string pointer */ { - ECMA_CHECK_STACK_USAGE (); - const lit_utf8_byte_t *str_curr_p = str_p; +#if (JERRY_STACK_LIMIT != 0) + if (JERRY_UNLIKELY (ecma_get_current_stack_usage () > CONFIG_MEM_STACK_LIMIT)) + { + return ECMA_RE_OUT_OF_STACK; + } +#endif /* JERRY_STACK_LIMIT != 0 */ while (true) { @@ -357,45 +366,43 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_MATCH: { JERRY_TRACE_MSG ("Execute RE_OP_MATCH: match\n"); - *out_str_p = str_curr_p; - return ECMA_VALUE_TRUE; /* match */ + return str_curr_p; } case RE_OP_CHAR: { if (str_curr_p >= re_ctx_p->input_end_p) { - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } - bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE; + const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE; ecma_char_t ch1 = (ecma_char_t) re_get_char (&bc_p); /* Already canonicalized. 
*/ - ecma_char_t ch2 = re_canonicalize (lit_utf8_read_next (&str_curr_p), is_ignorecase); + ecma_char_t ch2 = ecma_regexp_canonicalize (lit_utf8_read_next (&str_curr_p), is_ignorecase); JERRY_TRACE_MSG ("Character matching %d to %d: ", ch1, ch2); if (ch1 != ch2) { JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } JERRY_TRACE_MSG ("match\n"); - break; /* tail merge */ } case RE_OP_PERIOD: { if (str_curr_p >= re_ctx_p->input_end_p) { - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } - ecma_char_t ch = lit_utf8_read_next (&str_curr_p); + const ecma_char_t ch = lit_utf8_read_next (&str_curr_p); JERRY_TRACE_MSG ("Period matching '.' to %u: ", (unsigned int) ch); if (lit_char_is_line_terminator (ch)) { JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } JERRY_TRACE_MSG ("match\n"); @@ -414,7 +421,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ if (!(re_ctx_p->flags & RE_FLAG_MULTILINE)) { JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } if (lit_char_is_line_terminator (lit_utf8_peek_prev (str_curr_p))) @@ -424,7 +431,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } case RE_OP_ASSERT_END: { @@ -439,7 +446,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ if (!(re_ctx_p->flags & RE_FLAG_MULTILINE)) { JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } if (lit_char_is_line_terminator (lit_utf8_peek_next (str_curr_p))) @@ -449,30 +456,16 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } case RE_OP_ASSERT_WORD_BOUNDARY: case RE_OP_ASSERT_NOT_WORD_BOUNDARY: { - bool 
is_wordchar_left, is_wordchar_right; + const bool is_wordchar_left = ((str_curr_p > re_ctx_p->input_start_p) + && lit_char_is_word_char (lit_utf8_peek_prev (str_curr_p))); - if (str_curr_p <= re_ctx_p->input_start_p) - { - is_wordchar_left = false; /* not a wordchar */ - } - else - { - is_wordchar_left = lit_char_is_word_char (lit_utf8_peek_prev (str_curr_p)); - } - - if (str_curr_p >= re_ctx_p->input_end_p) - { - is_wordchar_right = false; /* not a wordchar */ - } - else - { - is_wordchar_right = lit_char_is_word_char (lit_utf8_peek_next (str_curr_p)); - } + const bool is_wordchar_right = ((str_curr_p < re_ctx_p->input_end_p) + && lit_char_is_word_char (lit_utf8_peek_next (str_curr_p))); if (op == RE_OP_ASSERT_WORD_BOUNDARY) { @@ -480,7 +473,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ if (is_wordchar_left == is_wordchar_right) { JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } } else @@ -491,7 +484,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ if (is_wordchar_left != is_wordchar_right) { JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } } @@ -501,203 +494,160 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_LOOKAHEAD_POS: case RE_OP_LOOKAHEAD_NEG: { - ecma_value_t match_value = ECMA_VALUE_EMPTY; - const lit_utf8_byte_t *sub_str_p = NULL; - - uint32_t array_size = re_ctx_p->num_of_captures + re_ctx_p->num_of_non_captures; - JMEM_DEFINE_LOCAL_ARRAY (saved_bck_p, array_size, lit_utf8_byte_t *); - - size_t size = (size_t) (array_size) * sizeof (lit_utf8_byte_t *); - memcpy (saved_bck_p, re_ctx_p->saved_p, size); + const lit_utf8_byte_t *matched_p = NULL; + const size_t captures_size = re_ctx_p->captures_count * sizeof (ecma_regexp_capture_t); + ecma_regexp_capture_t *saved_captures_p = (ecma_regexp_capture_t *) jmem_heap_alloc_block (captures_size); + memcpy 
(saved_captures_p, re_ctx_p->captures_p, captures_size); do { - uint32_t offset = re_get_value (&bc_p); + const uint32_t offset = re_get_value (&bc_p); - if (!sub_str_p) + if (matched_p == NULL) { - match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p); - if (ECMA_IS_VALUE_ERROR (match_value)) + matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); + + if (ECMA_RE_STACK_LIMIT_REACHED (matched_p)) { - break; + jmem_heap_free_block (saved_captures_p, captures_size); + return matched_p; } } bc_p += offset; } while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE); - if (!ECMA_IS_VALUE_ERROR (match_value)) + JERRY_TRACE_MSG ("Execute RE_OP_LOOKAHEAD_POS/NEG: "); + if ((op == RE_OP_LOOKAHEAD_POS && matched_p != NULL) + || (op == RE_OP_LOOKAHEAD_NEG && matched_p == NULL)) { - JERRY_TRACE_MSG ("Execute RE_OP_LOOKAHEAD_POS/NEG: "); - ecma_free_value (match_value); - if ((op == RE_OP_LOOKAHEAD_POS && sub_str_p) - || (op == RE_OP_LOOKAHEAD_NEG && !sub_str_p)) - { - JERRY_TRACE_MSG ("match\n"); - match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p); - } - else - { - JERRY_TRACE_MSG ("fail\n"); - match_value = ECMA_VALUE_FALSE; /* fail */ - } + JERRY_TRACE_MSG ("match\n"); + matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); + } + else + { + JERRY_TRACE_MSG ("fail\n"); + matched_p = NULL; /* fail */ } - if (!ECMA_IS_VALUE_ERROR (match_value)) + if (matched_p == NULL) { - if (ecma_is_value_true (match_value)) - { - *out_str_p = sub_str_p; - } - else - { - JERRY_ASSERT (ecma_is_value_boolean (match_value)); - /* restore saved */ - memcpy (re_ctx_p->saved_p, saved_bck_p, size); - } + /* restore saved */ + memcpy (re_ctx_p->captures_p, saved_captures_p, captures_size); } - JMEM_FINALIZE_LOCAL_ARRAY (saved_bck_p); - return match_value; + jmem_heap_free_block (saved_captures_p, captures_size); + return matched_p; } case RE_OP_CHAR_CLASS: case RE_OP_INV_CHAR_CLASS: { - uint32_t num_of_ranges; - bool is_match; - JERRY_TRACE_MSG ("Execute 
RE_OP_CHAR_CLASS/RE_OP_INV_CHAR_CLASS, "); if (str_curr_p >= re_ctx_p->input_end_p) { JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } - bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE; - ecma_char_t curr_ch = re_canonicalize (lit_utf8_read_next (&str_curr_p), is_ignorecase); + const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE; + const ecma_char_t curr_ch = ecma_regexp_canonicalize (lit_utf8_read_next (&str_curr_p), is_ignorecase); - num_of_ranges = re_get_value (&bc_p); - is_match = false; + uint32_t range_count = re_get_value (&bc_p); + bool is_match = false; - while (num_of_ranges) + while (range_count-- > 0) { - ecma_char_t ch1 = re_canonicalize (re_get_char (&bc_p), is_ignorecase); - ecma_char_t ch2 = re_canonicalize (re_get_char (&bc_p), is_ignorecase); - JERRY_TRACE_MSG ("num_of_ranges=%u, ch1=%u, ch2=%u, curr_ch=%u; ", - (unsigned int) num_of_ranges, (unsigned int) ch1, - (unsigned int) ch2, (unsigned int) curr_ch); - - if (curr_ch >= ch1 && curr_ch <= ch2) + const ecma_char_t ch1 = re_get_char (&bc_p); + if (curr_ch < ch1) { - /* We must read all the ranges from bytecode. */ - is_match = true; + bc_p += sizeof (ecma_char_t); + continue; } - num_of_ranges--; - } - if (op == RE_OP_CHAR_CLASS) - { - if (!is_match) - { - JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ - } - } - else - { - JERRY_ASSERT (op == RE_OP_INV_CHAR_CLASS); + const ecma_char_t ch2 = re_get_char (&bc_p); + is_match = (curr_ch <= ch2); if (is_match) { - JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ + /* Skip the remaining ranges in the bytecode. 
*/ + bc_p += range_count * 2 * sizeof (ecma_char_t); + break; } } + + JERRY_ASSERT (op == RE_OP_CHAR_CLASS || op == RE_OP_INV_CHAR_CLASS); + + if ((op == RE_OP_CHAR_CLASS) != is_match) + { + JERRY_TRACE_MSG ("fail\n"); + return NULL; /* fail */ + } + JERRY_TRACE_MSG ("match\n"); break; /* tail merge */ } case RE_OP_BACKREFERENCE: { - uint32_t backref_idx; - - backref_idx = re_get_value (&bc_p); + const uint32_t backref_idx = re_get_value (&bc_p); JERRY_TRACE_MSG ("Execute RE_OP_BACKREFERENCE (idx: %u): ", (unsigned int) backref_idx); - backref_idx *= 2; /* backref n -> saved indices [n*2, n*2+1] */ - JERRY_ASSERT (backref_idx >= 2 && backref_idx + 1 < re_ctx_p->num_of_captures); + JERRY_ASSERT (backref_idx >= 1 && backref_idx < re_ctx_p->captures_count); + const ecma_regexp_capture_t capture = re_ctx_p->captures_p[backref_idx]; - if (!re_ctx_p->saved_p[backref_idx] || !re_ctx_p->saved_p[backref_idx + 1]) + if (capture.begin_p == NULL || capture.end_p == NULL) { JERRY_TRACE_MSG ("match\n"); break; /* capture is 'undefined', always matches! 
*/ } - const lit_utf8_byte_t *sub_str_p = re_ctx_p->saved_p[backref_idx]; + const lit_utf8_size_t capture_size = (lit_utf8_size_t) (capture.end_p - capture.begin_p); - while (sub_str_p < re_ctx_p->saved_p[backref_idx + 1]) + if (str_curr_p + capture_size > re_ctx_p->input_end_p) { - ecma_char_t ch1, ch2; - - if (str_curr_p >= re_ctx_p->input_end_p) - { - JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ - } - - ch1 = lit_utf8_read_next (&sub_str_p); - ch2 = lit_utf8_read_next (&str_curr_p); - - if (ch1 != ch2) - { - JERRY_TRACE_MSG ("fail\n"); - return ECMA_VALUE_FALSE; /* fail */ - } + JERRY_TRACE_MSG ("fail\n"); + return NULL; /* fail */ } + + if (memcmp (str_curr_p, capture.begin_p, capture_size)) + { + JERRY_TRACE_MSG ("fail\n"); + return NULL; /* fail */ + } + + str_curr_p += capture_size; JERRY_TRACE_MSG ("match\n"); break; /* tail merge */ } case RE_OP_SAVE_AT_START: { - uint8_t *old_bc_p; - JERRY_TRACE_MSG ("Execute RE_OP_SAVE_AT_START\n"); - const lit_utf8_byte_t *old_start_p = re_ctx_p->saved_p[RE_GLOBAL_START_IDX]; - re_ctx_p->saved_p[RE_GLOBAL_START_IDX] = str_curr_p; + re_ctx_p->captures_p[RE_GLOBAL_CAPTURE].begin_p = str_curr_p; do { - uint32_t offset = re_get_value (&bc_p); - const lit_utf8_byte_t *sub_str_p = NULL; - ecma_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p); + const uint32_t offset = re_get_value (&bc_p); + const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - if (ecma_is_value_true (match_value)) + if (matched_p != NULL) { - *out_str_p = sub_str_p; - return match_value; /* match */ - } - else if (ECMA_IS_VALUE_ERROR (match_value)) - { - return match_value; + return matched_p; /* match */ } bc_p += offset; - old_bc_p = bc_p; } while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE); - bc_p = old_bc_p; + bc_p -= sizeof (uint8_t); - re_ctx_p->saved_p[RE_GLOBAL_START_IDX] = old_start_p; - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } case 
RE_OP_SAVE_AND_MATCH: { JERRY_TRACE_MSG ("End of pattern is reached: match\n"); - re_ctx_p->saved_p[RE_GLOBAL_END_IDX] = str_curr_p; - *out_str_p = str_curr_p; - return ECMA_VALUE_TRUE; /* match */ + re_ctx_p->captures_p[RE_GLOBAL_CAPTURE].end_p = str_curr_p; + return str_curr_p; /* match */ } case RE_OP_ALTERNATIVE: { /* - * Alternatives should be jump over, when alternative opcode appears. + * Alternatives should be jumped over, when an alternative opcode appears. */ uint32_t offset = re_get_value (&bc_p); JERRY_TRACE_MSG ("Execute RE_OP_ALTERNATIVE"); @@ -721,53 +671,42 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ * On non-greedy iterations we have to execute the bytecode * after the group first, if zero iteration is allowed. */ - uint32_t start_idx, iter_idx, offset; - const lit_utf8_byte_t *old_start_p = NULL; - const lit_utf8_byte_t *sub_str_p = NULL; - uint8_t *old_bc_p; - - old_bc_p = bc_p; /* save the bytecode start position of the group start */ - start_idx = re_get_value (&bc_p); - offset = re_get_value (&bc_p); + const lit_utf8_byte_t *old_begin_p = NULL; + const uint8_t *const bc_start_p = bc_p; /* save the bytecode start position of the group start */ + const uint32_t start_idx = re_get_value (&bc_p); + const uint32_t offset = re_get_value (&bc_p); + uint32_t *iterator_p; if (RE_IS_CAPTURE_GROUP (op)) { - JERRY_ASSERT (start_idx <= re_ctx_p->num_of_captures / 2); - iter_idx = start_idx - 1; - start_idx *= 2; - - old_start_p = re_ctx_p->saved_p[start_idx]; - re_ctx_p->saved_p[start_idx] = str_curr_p; + JERRY_ASSERT (start_idx < re_ctx_p->captures_count); + re_ctx_p->captures_p[start_idx].begin_p = str_curr_p; + iterator_p = &(re_ctx_p->iterations_p[start_idx - 1]); } else { - JERRY_ASSERT (start_idx < re_ctx_p->num_of_non_captures); - iter_idx = start_idx + (re_ctx_p->num_of_captures / 2) - 1; - start_idx += re_ctx_p->num_of_captures; + JERRY_ASSERT (start_idx < re_ctx_p->non_captures_count); + iterator_p = 
&(re_ctx_p->iterations_p[start_idx + re_ctx_p->captures_count - 1]); } - re_ctx_p->num_of_iterations_p[iter_idx] = 0; + *iterator_p = 0; /* Jump all over to the end of the END opcode. */ bc_p += offset; /* Try to match after the close paren if zero is allowed */ - ecma_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p); + const lit_utf8_byte_t *matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - if (ecma_is_value_true (match_value)) + if (matched_p != NULL) { - *out_str_p = sub_str_p; - return match_value; /* match */ - } - else if (ECMA_IS_VALUE_ERROR (match_value)) - { - return match_value; + return str_curr_p; /* match */ } + if (RE_IS_CAPTURE_GROUP (op)) { - re_ctx_p->saved_p[start_idx] = old_start_p; + re_ctx_p->captures_p[start_idx].begin_p = old_begin_p; } - bc_p = old_bc_p; + bc_p = bc_start_p; /* FALLTHRU */ } case RE_OP_CAPTURE_GROUP_START: @@ -775,200 +714,172 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_NON_CAPTURE_GROUP_START: case RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START: { - uint32_t start_idx, iter_idx, old_iteration_cnt, offset; - const lit_utf8_byte_t *sub_str_p = NULL; - uint8_t *old_bc_p; - uint8_t *end_bc_p = NULL; - start_idx = re_get_value (&bc_p); + const uint8_t *bc_end_p = NULL; + const uint32_t start_idx = re_get_value (&bc_p); if (op != RE_OP_CAPTURE_GROUP_START && op != RE_OP_NON_CAPTURE_GROUP_START) { - offset = re_get_value (&bc_p); - end_bc_p = bc_p + offset; + const uint32_t offset = re_get_value (&bc_p); + bc_end_p = bc_p + offset; } + const lit_utf8_byte_t **group_begin_p; + uint32_t *iterator_p; if (RE_IS_CAPTURE_GROUP (op)) { - JERRY_ASSERT (start_idx <= re_ctx_p->num_of_captures / 2); - iter_idx = start_idx - 1; - start_idx *= 2; + JERRY_ASSERT (start_idx < re_ctx_p->captures_count); + group_begin_p = &(re_ctx_p->captures_p[start_idx].begin_p); + iterator_p = &(re_ctx_p->iterations_p[start_idx - 1]); } else { - JERRY_ASSERT (start_idx < 
re_ctx_p->num_of_non_captures); - iter_idx = start_idx + (re_ctx_p->num_of_captures / 2) - 1; - start_idx += re_ctx_p->num_of_captures; + JERRY_ASSERT (start_idx < re_ctx_p->non_captures_count); + group_begin_p = &(re_ctx_p->non_captures_p[start_idx].str_p); + iterator_p = &(re_ctx_p->iterations_p[start_idx + re_ctx_p->captures_count - 1]); } - const lit_utf8_byte_t *old_start_p = re_ctx_p->saved_p[start_idx]; - old_iteration_cnt = re_ctx_p->num_of_iterations_p[iter_idx]; - re_ctx_p->saved_p[start_idx] = str_curr_p; - re_ctx_p->num_of_iterations_p[iter_idx] = 0; + const lit_utf8_byte_t *const old_begin_p = *group_begin_p; + const uint32_t old_iter_count = *iterator_p; + *group_begin_p = str_curr_p; + *iterator_p = 0; do { - offset = re_get_value (&bc_p); - ecma_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p); + const uint32_t offset = re_get_value (&bc_p); + const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - if (ecma_is_value_true (match_value)) + if (matched_p != NULL) { - *out_str_p = sub_str_p; - return match_value; /* match */ - } - else if (ECMA_IS_VALUE_ERROR (match_value)) - { - return match_value; + return matched_p; /* match */ } bc_p += offset; - old_bc_p = bc_p; } while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE); - bc_p = old_bc_p; - re_ctx_p->num_of_iterations_p[iter_idx] = old_iteration_cnt; + + bc_p -= sizeof (uint8_t); + *iterator_p = old_iter_count; /* Try to match after the close paren if zero is allowed. 
*/ if (op == RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START || op == RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START) { - JERRY_ASSERT (end_bc_p); - ecma_value_t match_value = re_match_regexp (re_ctx_p, end_bc_p, str_curr_p, &sub_str_p); + JERRY_ASSERT (bc_end_p); + const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_end_p, str_curr_p); - if (ecma_is_value_true (match_value)) + if (matched_p != NULL) { - *out_str_p = sub_str_p; - return match_value; /* match */ - } - else if (ECMA_IS_VALUE_ERROR (match_value)) - { - return match_value; + return matched_p; /* match */ } } - re_ctx_p->saved_p[start_idx] = old_start_p; - return ECMA_VALUE_FALSE; /* fail */ + *group_begin_p = old_begin_p; + return NULL; /* fail */ } case RE_OP_CAPTURE_NON_GREEDY_GROUP_END: case RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END: { - uint32_t end_idx, iter_idx, min, max; - uint8_t *old_bc_p; - /* * On non-greedy iterations we have to execute the bytecode * after the group first. Try to iterate only if it fails. */ - old_bc_p = bc_p; /* save the bytecode start position of the group end */ - end_idx = re_get_value (&bc_p); - min = re_get_value (&bc_p); - max = re_get_value (&bc_p); + const uint8_t *const bc_start_p = bc_p; /* save the bytecode start position of the group end */ + const uint32_t end_idx = re_get_value (&bc_p); + const uint32_t min = re_get_value (&bc_p); + const uint32_t max = re_get_value (&bc_p); re_get_value (&bc_p); /* start offset */ + const lit_utf8_byte_t **group_end_p; + uint32_t *iterator_p; if (RE_IS_CAPTURE_GROUP (op)) { - JERRY_ASSERT (end_idx <= re_ctx_p->num_of_captures / 2); - iter_idx = end_idx - 1; - end_idx = (end_idx * 2) + 1; + JERRY_ASSERT (end_idx < re_ctx_p->captures_count); + group_end_p = &(re_ctx_p->captures_p[end_idx].end_p); + iterator_p = &(re_ctx_p->iterations_p[end_idx - 1]); } else { - JERRY_ASSERT (end_idx <= re_ctx_p->num_of_non_captures); - iter_idx = end_idx + (re_ctx_p->num_of_captures / 2) - 1; - end_idx += re_ctx_p->num_of_captures; + 
JERRY_ASSERT (end_idx < re_ctx_p->non_captures_count); + group_end_p = &(re_ctx_p->non_captures_p[end_idx].str_p); + iterator_p = &(re_ctx_p->iterations_p[end_idx + re_ctx_p->captures_count - 1]); } - re_ctx_p->num_of_iterations_p[iter_idx]++; + (*iterator_p)++; - if (re_ctx_p->num_of_iterations_p[iter_idx] >= min - && re_ctx_p->num_of_iterations_p[iter_idx] <= max) + if (*iterator_p >= min && *iterator_p <= max) { - const lit_utf8_byte_t *old_end_p = re_ctx_p->saved_p[end_idx]; - re_ctx_p->saved_p[end_idx] = str_curr_p; + const lit_utf8_byte_t *const old_end_p = *group_end_p; + *group_end_p = str_curr_p; - const lit_utf8_byte_t *sub_str_p = NULL; - ecma_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p); + const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - if (ecma_is_value_true (match_value)) + if (matched_p != NULL) { - *out_str_p = sub_str_p; - return match_value; /* match */ - } - else if (ECMA_IS_VALUE_ERROR (match_value)) - { - return match_value; + return matched_p; /* match */ } - re_ctx_p->saved_p[end_idx] = old_end_p; + *group_end_p = old_end_p; } - re_ctx_p->num_of_iterations_p[iter_idx]--; - bc_p = old_bc_p; + (*iterator_p)--; + bc_p = bc_start_p; - /* If non-greedy fails and try to iterate... */ + /* Non-greedy fails, try to iterate. 
*/ /* FALLTHRU */ } case RE_OP_CAPTURE_GREEDY_GROUP_END: case RE_OP_NON_CAPTURE_GREEDY_GROUP_END: { - uint32_t start_idx, end_idx, iter_idx, min, max, offset; - const lit_utf8_byte_t *old_start_p = NULL; - const lit_utf8_byte_t *old_end_p = NULL; - const lit_utf8_byte_t *sub_str_p = NULL; - uint8_t *old_bc_p; + const uint32_t end_idx = re_get_value (&bc_p); + const uint32_t min = re_get_value (&bc_p); + const uint32_t max = re_get_value (&bc_p); + uint32_t offset = re_get_value (&bc_p); - end_idx = re_get_value (&bc_p); - min = re_get_value (&bc_p); - max = re_get_value (&bc_p); - offset = re_get_value (&bc_p); + const lit_utf8_byte_t **group_begin_p; + const lit_utf8_byte_t **group_end_p; + uint32_t *iterator_p; if (RE_IS_CAPTURE_GROUP (op)) { - JERRY_ASSERT (end_idx <= re_ctx_p->num_of_captures / 2); - iter_idx = end_idx - 1; - start_idx = end_idx * 2; - end_idx = start_idx + 1; + JERRY_ASSERT (end_idx < re_ctx_p->captures_count); + group_begin_p = &(re_ctx_p->captures_p[end_idx].begin_p); + group_end_p = &(re_ctx_p->captures_p[end_idx].end_p); + iterator_p = &(re_ctx_p->iterations_p[end_idx - 1]); } else { - JERRY_ASSERT (end_idx <= re_ctx_p->num_of_non_captures); - iter_idx = end_idx + (re_ctx_p->num_of_captures / 2) - 1; - end_idx += re_ctx_p->num_of_captures; - start_idx = end_idx; + JERRY_ASSERT (end_idx <= re_ctx_p->non_captures_count); + group_begin_p = &(re_ctx_p->non_captures_p[end_idx].str_p); + group_end_p = &(re_ctx_p->non_captures_p[end_idx].str_p); + iterator_p = &(re_ctx_p->iterations_p[end_idx + re_ctx_p->captures_count - 1]); } /* Check the empty iteration if the minimum number of iterations is reached. 
*/ - if (re_ctx_p->num_of_iterations_p[iter_idx] >= min - && str_curr_p== re_ctx_p->saved_p[start_idx]) + if (*iterator_p >= min && str_curr_p == *group_begin_p) { - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } - re_ctx_p->num_of_iterations_p[iter_idx]++; + (*iterator_p)++; - old_bc_p = bc_p; /* Save the bytecode end position of the END opcodes for matching after it. */ - old_end_p = re_ctx_p->saved_p[end_idx]; - re_ctx_p->saved_p[end_idx] = str_curr_p; + const uint8_t *const bc_start_p = bc_p; /* Save the bytecode end position of the END opcodes. */ + const lit_utf8_byte_t *const old_end_p = *group_end_p; + *group_end_p = str_curr_p; - if (re_ctx_p->num_of_iterations_p[iter_idx] < max) + if (*iterator_p < max) { bc_p -= offset; offset = re_get_value (&bc_p); - old_start_p = re_ctx_p->saved_p[start_idx]; - re_ctx_p->saved_p[start_idx] = str_curr_p; - ecma_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p); + const lit_utf8_byte_t *const old_begin_p = *group_begin_p; + *group_begin_p = str_curr_p; - if (ecma_is_value_true (match_value)) - { - *out_str_p = sub_str_p; - return match_value; /* match */ - } - else if (ECMA_IS_VALUE_ERROR (match_value)) - { - return match_value; - } + const lit_utf8_byte_t *matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - re_ctx_p->saved_p[start_idx] = old_start_p; + if (matched_p != NULL) + { + return matched_p; /* match */ + } /* Try to match alternatives if any. 
*/ bc_p += offset; @@ -977,190 +888,178 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ bc_p++; /* RE_OP_ALTERNATIVE */ offset = re_get_value (&bc_p); - old_start_p = re_ctx_p->saved_p[start_idx]; - re_ctx_p->saved_p[start_idx] = str_curr_p; + *group_begin_p = str_curr_p; - match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p); + matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - if (ecma_is_value_true (match_value)) + if (matched_p != NULL) { - *out_str_p = sub_str_p; - return match_value; /* match */ - } - else if (ECMA_IS_VALUE_ERROR (match_value)) - { - return match_value; + return matched_p; /* match */ } - re_ctx_p->saved_p[start_idx] = old_start_p; bc_p += offset; } + + *group_begin_p = old_begin_p; } - if (re_ctx_p->num_of_iterations_p[iter_idx] >= min - && re_ctx_p->num_of_iterations_p[iter_idx] <= max) + if (*iterator_p >= min && *iterator_p <= max) { /* Try to match the rest of the bytecode. */ - ecma_value_t match_value = re_match_regexp (re_ctx_p, old_bc_p, str_curr_p, &sub_str_p); + const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_start_p, str_curr_p); - if (ecma_is_value_true (match_value)) + if (matched_p != NULL) { - *out_str_p = sub_str_p; - return match_value; /* match */ - } - else if (ECMA_IS_VALUE_ERROR (match_value)) - { - return match_value; + return matched_p; /* match */ } } /* restore if fails */ - re_ctx_p->saved_p[end_idx] = old_end_p; - re_ctx_p->num_of_iterations_p[iter_idx]--; - return ECMA_VALUE_FALSE; /* fail */ + *group_end_p = old_end_p; + (*iterator_p)--; + return NULL; /* fail */ } case RE_OP_NON_GREEDY_ITERATOR: { - uint32_t min, max, offset, num_of_iter; - const lit_utf8_byte_t *sub_str_p = NULL; + const uint32_t min = re_get_value (&bc_p); + const uint32_t max = re_get_value (&bc_p); - min = re_get_value (&bc_p); - max = re_get_value (&bc_p); - - offset = re_get_value (&bc_p); + const uint32_t offset = re_get_value (&bc_p); JERRY_TRACE_MSG 
("Non-greedy iterator, min=%lu, max=%lu, offset=%ld\n", (unsigned long) min, (unsigned long) max, (long) offset); - num_of_iter = 0; - while (num_of_iter <= max) + uint32_t iter_count = 0; + while (iter_count <= max) { - if (num_of_iter >= min) + if (iter_count >= min) { - ecma_value_t match_value = re_match_regexp (re_ctx_p, bc_p + offset, str_curr_p, &sub_str_p); + const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p + offset, str_curr_p); - if (ecma_is_value_true (match_value)) + if (matched_p != NULL) { - *out_str_p = sub_str_p; - return match_value; /* match */ - } - else if (ECMA_IS_VALUE_ERROR (match_value)) - { - return match_value; + return matched_p; /* match */ } } - ecma_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p); + const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - if (!ecma_is_value_true (match_value)) + if (ECMA_RE_STACK_LIMIT_REACHED (matched_p)) { - if (ECMA_IS_VALUE_ERROR (match_value)) - { - return match_value; - } + return matched_p; + } + if (matched_p == NULL) + { break; } - str_curr_p = sub_str_p; - num_of_iter++; + str_curr_p = matched_p; + iter_count++; } - return ECMA_VALUE_FALSE; /* fail */ + + return NULL; /* fail */ } default: { JERRY_ASSERT (op == RE_OP_GREEDY_ITERATOR); - uint32_t min, max, offset, num_of_iter; - const lit_utf8_byte_t *sub_str_p = NULL; + const uint32_t min = re_get_value (&bc_p); + const uint32_t max = re_get_value (&bc_p); - min = re_get_value (&bc_p); - max = re_get_value (&bc_p); - - offset = re_get_value (&bc_p); + const uint32_t offset = re_get_value (&bc_p); JERRY_TRACE_MSG ("Greedy iterator, min=%lu, max=%lu, offset=%ld\n", (unsigned long) min, (unsigned long) max, (long) offset); - num_of_iter = 0; - - while (num_of_iter < max) + uint32_t iter_count = 0; + while (iter_count < max) { - ecma_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_curr_p, &sub_str_p); + const lit_utf8_byte_t *const matched_p = 
ecma_regexp_match (re_ctx_p, bc_p, str_curr_p); - if (!ecma_is_value_true (match_value)) + if (ECMA_RE_STACK_LIMIT_REACHED (matched_p)) { - if (ECMA_IS_VALUE_ERROR (match_value)) + return matched_p; + } + + if (matched_p == NULL) + { + break; + } + + str_curr_p = matched_p; + iter_count++; + } + + if (iter_count >= min) + { + while (true) + { + const lit_utf8_byte_t *const matched_p = ecma_regexp_match (re_ctx_p, bc_p + offset, str_curr_p); + + if (matched_p != NULL) { - return match_value; + return matched_p; /* match */ } - break; - } + if (iter_count == min) + { + break; + } - str_curr_p = sub_str_p; - num_of_iter++; + lit_utf8_read_prev (&str_curr_p); + iter_count--; + } } - while (num_of_iter >= min) - { - ecma_value_t match_value = re_match_regexp (re_ctx_p, bc_p + offset, str_curr_p, &sub_str_p); - - if (ecma_is_value_true (match_value)) - { - *out_str_p = sub_str_p; - return match_value; /* match */ - } - else if (ECMA_IS_VALUE_ERROR (match_value)) - { - return match_value; - } - - if (num_of_iter == min) - { - break; - } - - lit_utf8_read_prev (&str_curr_p); - num_of_iter--; - } - return ECMA_VALUE_FALSE; /* fail */ + return NULL; /* fail */ } } } -} /* re_match_regexp */ +} /* ecma_regexp_match */ -/** - * Define the necessary properties for the result array (index, input, length). 
- */ -void -re_set_result_array_properties (ecma_object_t *array_obj_p, /**< result array */ - ecma_string_t *input_str_p, /**< input string */ - uint32_t num_of_elements, /**< Number of array elements */ - int32_t index) /**< index of matching */ +static ecma_value_t +ecma_regexp_create_result_object (ecma_regexp_ctx_t *re_ctx_p, + ecma_string_t *input_string_p, + uint32_t index) { - /* Set index property of the result array */ - ecma_builtin_helper_def_prop (array_obj_p, - ecma_get_magic_string (LIT_MAGIC_STRING_INDEX), - ecma_make_int32_value (index), - ECMA_PROPERTY_CONFIGURABLE_ENUMERABLE_WRITABLE | ECMA_IS_THROW); + ecma_value_t result_array = ecma_op_create_array_object (0, 0, false); + ecma_object_t *result_p = ecma_get_object_from_value (result_array); - /* Set input property of the result array */ - ecma_builtin_helper_def_prop (array_obj_p, - ecma_get_magic_string (LIT_MAGIC_STRING_INPUT), - ecma_make_string_value (input_str_p), - ECMA_PROPERTY_CONFIGURABLE_ENUMERABLE_WRITABLE | ECMA_IS_THROW); - - /* Set length property of the result array */ + for (uint32_t i = 0; i < re_ctx_p->captures_count; i++) { - ecma_property_descriptor_t array_item_prop_desc = ecma_make_empty_property_descriptor (); - array_item_prop_desc.flags |= (ECMA_PROP_IS_VALUE_DEFINED | ECMA_PROP_IS_THROW); + const ecma_regexp_capture_t capture = re_ctx_p->captures_p[i]; - array_item_prop_desc.value = ecma_make_uint32_value (num_of_elements); - - ecma_op_object_define_own_property (array_obj_p, - ecma_get_magic_string (LIT_MAGIC_STRING_LENGTH), - &array_item_prop_desc); + if (capture.begin_p != NULL && capture.end_p >= capture.begin_p) + { + const lit_utf8_size_t capture_size = (lit_utf8_size_t) (capture.end_p - capture.begin_p); + ecma_string_t *const capture_str_p = ecma_new_ecma_string_from_utf8 (capture.begin_p, capture_size); + const ecma_value_t capture_value = ecma_make_string_value (capture_str_p); + ecma_builtin_helper_def_prop_by_index (result_p, + i, + capture_value, + 
ECMA_PROPERTY_CONFIGURABLE_ENUMERABLE_WRITABLE); + ecma_deref_ecma_string (capture_str_p); + } + else + { + ecma_builtin_helper_def_prop_by_index (result_p, + i, + ECMA_VALUE_UNDEFINED, + ECMA_PROPERTY_CONFIGURABLE_ENUMERABLE_WRITABLE); + } } -} /* re_set_result_array_properties */ + + ecma_builtin_helper_def_prop (result_p, + ecma_get_magic_string (LIT_MAGIC_STRING_INDEX), + ecma_make_uint32_value (index), + ECMA_PROPERTY_CONFIGURABLE_ENUMERABLE_WRITABLE); + + ecma_builtin_helper_def_prop (result_p, + ecma_get_magic_string (LIT_MAGIC_STRING_INPUT), + ecma_make_string_value (input_string_p), + ECMA_PROPERTY_CONFIGURABLE_ENUMERABLE_WRITABLE); + + return result_array; +} /* ecma_regexp_create_result_object */ /** * RegExp helper function to start the recursive matching algorithm @@ -1193,22 +1092,29 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */ re_compiled_code_t *bc_p = ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t, ext_object_p->u.class_prop.u.value); + ecma_regexp_ctx_t re_ctx; + ecma_string_t *input_string_p = ecma_get_string_from_value (input_string); + if (bc_p == NULL) { +#if ENABLED (JERRY_ES2015) return ecma_raise_type_error (ECMA_ERR_MSG ("Incompatible type")); +#else /* !ENABLED (JERRY_ES2015) */ + /* Missing bytecode means the RegExp object is the RegExp.prototype, + * which will always result in an empty string match. 
*/ + re_ctx.captures_count = 1; + + re_ctx.captures_p = jmem_heap_alloc_block (sizeof (ecma_regexp_capture_t)); + re_ctx.captures_p->begin_p = lit_get_magic_string_utf8 (LIT_MAGIC_STRING__EMPTY); + re_ctx.captures_p->end_p = lit_get_magic_string_utf8 (LIT_MAGIC_STRING__EMPTY); + + ret_value = ecma_regexp_create_result_object (&re_ctx, input_string_p, 0); + + jmem_heap_free_block (re_ctx.captures_p, sizeof (ecma_regexp_capture_t)); + return ret_value; +#endif /* ENABLED (JERRY_ES2015) */ } - ecma_string_t *input_string_p = ecma_get_string_from_value (input_string); - ECMA_STRING_TO_UTF8_STRING (input_string_p, input_buffer_p, input_buffer_size); - - re_matcher_ctx_t re_ctx; - const lit_utf8_byte_t *input_curr_p = input_buffer_p; - - re_ctx.input_start_p = input_curr_p; - const lit_utf8_byte_t *input_end_p = re_ctx.input_start_p + input_buffer_size; - re_ctx.input_end_p = input_end_p; - - /* 1. Read bytecode header and init regexp matcher context. */ re_ctx.flags = bc_p->header.status_flags; if (ignore_global) @@ -1216,209 +1122,194 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */ re_ctx.flags &= (uint16_t) ~RE_FLAG_GLOBAL; } - JERRY_TRACE_MSG ("Exec with flags [global: %d, ignoreCase: %d, multiline: %d]\n", - re_ctx.flags & RE_FLAG_GLOBAL, - re_ctx.flags & RE_FLAG_IGNORE_CASE, - re_ctx.flags & RE_FLAG_MULTILINE); + lit_utf8_size_t input_size; + lit_utf8_size_t input_length; + uint8_t input_flags = ECMA_STRING_FLAG_IS_ASCII; + const lit_utf8_byte_t *input_buffer_p = ecma_string_get_chars (input_string_p, + &input_size, + &input_length, + NULL, + &input_flags); - re_ctx.num_of_captures = bc_p->num_of_captures; - JERRY_ASSERT (re_ctx.num_of_captures % 2 == 0); - re_ctx.num_of_non_captures = bc_p->num_of_non_captures; - - JMEM_DEFINE_LOCAL_ARRAY (saved_p, re_ctx.num_of_captures + re_ctx.num_of_non_captures, const lit_utf8_byte_t *); - - for (uint32_t i = 0; i < re_ctx.num_of_captures + re_ctx.num_of_non_captures; i++) + const lit_utf8_byte_t 
*input_curr_p = input_buffer_p; + uint32_t index = 0; + if (re_ctx.flags & RE_FLAG_GLOBAL) { - saved_p[i] = NULL; - } - re_ctx.saved_p = saved_p; + ecma_string_t *lastindex_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL); + ecma_value_t lastindex_value = ecma_op_object_get_own_data_prop (regexp_object_p, lastindex_str_p); - uint32_t num_of_iter_length = (re_ctx.num_of_captures / 2) + (re_ctx.num_of_non_captures - 1); - JMEM_DEFINE_LOCAL_ARRAY (num_of_iter_p, num_of_iter_length, uint32_t); + ecma_number_t lastindex_num; + ret_value = ecma_get_number (lastindex_value, &lastindex_num); + ecma_free_value (lastindex_value); - for (uint32_t i = 0; i < num_of_iter_length; i++) - { - num_of_iter_p[i] = 0u; - } - - bool is_match = false; - re_ctx.num_of_iterations_p = num_of_iter_p; - int32_t index = 0; - ecma_length_t input_str_len; - - input_str_len = ecma_string_get_length (input_string_p); - - if (input_buffer_p && (re_ctx.flags & RE_FLAG_GLOBAL)) - { - ecma_string_t *magic_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL); - ecma_value_t lastindex_value = ecma_op_object_get_own_data_prop (regexp_object_p, magic_str_p); - - ECMA_OP_TO_NUMBER_TRY_CATCH (lastindex_num, lastindex_value, ret_value) - - index = ecma_number_to_int32 (lastindex_num); - - if (input_curr_p < input_end_p - && index <= (int32_t) input_str_len - && index > 0) + if (ECMA_IS_VALUE_ERROR (ret_value)) { - if (input_str_len == input_buffer_size) + goto cleanup_string; + } + + /* TODO: Replace with ToLength */ + if (lastindex_num < 0.0f) + { +#if ENABLED (JERRY_ES2015) + lastindex_num = 0.0f; +#else /* !ENABLED (JERRY_ES2015) */ + lastindex_num = input_length + 1; +#endif /* ENABLED (JERRY_ES2015) */ + } + index = ecma_number_to_uint32 (lastindex_num); + + if (index > input_length) + { + ret_value = ecma_op_object_put (regexp_object_p, + lastindex_str_p, + ecma_make_integer_value (0), + true); + + if (!ECMA_IS_VALUE_ERROR (ret_value)) + { + JERRY_ASSERT (ecma_is_value_boolean 
(ret_value)); + /* lastIndex is out of bounds, the match should fail. */ + ret_value = ECMA_VALUE_NULL; + } + + goto cleanup_string; + } + + if (index > 0) + { + if (input_flags & ECMA_STRING_FLAG_IS_ASCII) { input_curr_p += index; } else { - for (int i = 0; i < index; i++) + for (uint32_t i = 0; i < index; i++) { lit_utf8_incr (&input_curr_p); } } } - - ECMA_OP_TO_NUMBER_FINALIZE (lastindex_num); - - ecma_fast_free_value (lastindex_value); } - /* 2. Try to match */ - const lit_utf8_byte_t *sub_str_p = NULL; - uint8_t *bc_start_p = (uint8_t *) (bc_p + 1); + re_ctx.input_start_p = input_buffer_p; + const lit_utf8_byte_t *input_end_p = re_ctx.input_start_p + input_size; + re_ctx.input_end_p = input_end_p; - while (!ECMA_IS_VALUE_ERROR (ret_value)) + JERRY_TRACE_MSG ("Exec with flags [global: %d, ignoreCase: %d, multiline: %d]\n", + re_ctx.flags & RE_FLAG_GLOBAL, + re_ctx.flags & RE_FLAG_IGNORE_CASE, + re_ctx.flags & RE_FLAG_MULTILINE); + + re_ctx.captures_count = bc_p->captures_count; + re_ctx.captures_p = jmem_heap_alloc_block (re_ctx.captures_count * sizeof (ecma_regexp_capture_t)); + memset (re_ctx.captures_p, 0, re_ctx.captures_count * sizeof (ecma_regexp_capture_t)); + + re_ctx.non_captures_count = bc_p->non_captures_count; + re_ctx.non_captures_p = jmem_heap_alloc_block (re_ctx.non_captures_count * sizeof (ecma_regexp_non_capture_t)); + memset (re_ctx.non_captures_p, 0, re_ctx.non_captures_count * sizeof (ecma_regexp_non_capture_t)); + + const uint32_t iters_length = re_ctx.captures_count + re_ctx.non_captures_count - 1; + re_ctx.iterations_p = jmem_heap_alloc_block (iters_length * sizeof (uint32_t)); + memset (re_ctx.iterations_p, 0, iters_length * sizeof (uint32_t)); + + /* 2. 
Try to match */ + uint8_t *bc_start_p = (uint8_t *) (bc_p + 1); + const lit_utf8_byte_t *matched_p = NULL; + + JERRY_ASSERT (index <= input_length); + while (true) { - if (index < 0 || index > (int32_t) input_str_len) + matched_p = ecma_regexp_match (&re_ctx, bc_start_p, input_curr_p); + + if (matched_p != NULL) + { + break; + } + + index++; + if (index > input_length) { if (re_ctx.flags & RE_FLAG_GLOBAL) { ecma_value_t put_result = ecma_op_object_put (regexp_object_p, ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL), - ecma_make_integer_value (0), + ecma_make_uint32_value (0), true); if (ECMA_IS_VALUE_ERROR (put_result)) { - ecma_free_value (ret_value); ret_value = put_result; + goto cleanup_context; } + + JERRY_ASSERT (ecma_is_value_boolean (put_result)); } - is_match = false; - break; + /* Failed to match, return 'null'. */ + ret_value = ECMA_VALUE_NULL; + goto cleanup_context; } - else - { - ret_value = re_match_regexp (&re_ctx, bc_start_p, input_curr_p, &sub_str_p); - if (ECMA_IS_VALUE_ERROR (ret_value)) - { - break; - } - if (ecma_is_value_true (ret_value)) - { - is_match = true; - break; - } - - if (input_curr_p < input_end_p) - { - lit_utf8_incr (&input_curr_p); - } - index++; - } + JERRY_ASSERT (input_curr_p < input_end_p); + lit_utf8_incr (&input_curr_p); } - if (!ECMA_IS_VALUE_ERROR (ret_value) && input_curr_p && (re_ctx.flags & RE_FLAG_GLOBAL)) - { - ecma_number_t lastindex_num; + JERRY_ASSERT (matched_p != NULL); - if (sub_str_p != NULL - && input_buffer_p != NULL) + if (ECMA_RE_STACK_LIMIT_REACHED (matched_p)) + { + ret_value = ecma_raise_range_error (ECMA_ERR_MSG ("Stack limit exceeded.")); + goto cleanup_context; + } + + if (re_ctx.flags & RE_FLAG_GLOBAL) + { + JERRY_ASSERT (index <= input_length); + + lit_utf8_size_t match_length; + const lit_utf8_byte_t *match_begin_p = re_ctx.captures_p[0].begin_p; + const lit_utf8_byte_t *match_end_p = re_ctx.captures_p[0].end_p; + + if (input_flags & ECMA_STRING_FLAG_IS_ASCII) { - if (input_str_len == 
input_buffer_size) - { - lastindex_num = (ecma_number_t) (sub_str_p - input_buffer_p); - } - else - { - lastindex_num = (ecma_number_t) lit_utf8_string_length (input_buffer_p, - (lit_utf8_size_t) (sub_str_p - input_buffer_p)); - } + match_length = (lit_utf8_size_t) (match_end_p - match_begin_p); } else { - lastindex_num = ECMA_NUMBER_ZERO; + match_length = lit_utf8_string_length (match_begin_p, + (lit_utf8_size_t) (match_end_p - match_begin_p)); } ecma_value_t put_result = ecma_op_object_put (regexp_object_p, ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL), - ecma_make_number_value (lastindex_num), + ecma_make_uint32_value (index + match_length), true); - if (ECMA_IS_VALUE_ERROR (put_result)) { - ecma_free_value (ret_value); ret_value = put_result; + goto cleanup_context; } + + JERRY_ASSERT (ecma_is_value_boolean (put_result)); } - /* 3. Fill the result array or return with 'undefiend' */ - if (!ECMA_IS_VALUE_ERROR (ret_value)) + ret_value = ecma_regexp_create_result_object (&re_ctx, input_string_p, index); + +cleanup_context: + jmem_heap_free_block (re_ctx.captures_p, re_ctx.captures_count * sizeof (ecma_regexp_capture_t)); + if (re_ctx.non_captures_p != NULL) { - if (is_match) - { - ecma_value_t result_array = ecma_op_create_array_object (0, 0, false); - ecma_object_t *result_array_obj_p = ecma_get_object_from_value (result_array); - - ecma_string_t *input_str_p = ecma_new_ecma_string_from_utf8 (input_buffer_p, input_buffer_size); - re_set_result_array_properties (result_array_obj_p, input_str_p, re_ctx.num_of_captures / 2, index); - ecma_deref_ecma_string (input_str_p); - - for (uint32_t i = 0; i < re_ctx.num_of_captures; i += 2) - { - ecma_string_t *index_str_p = ecma_new_ecma_string_from_uint32 (i / 2); - ecma_value_t capture_value = ECMA_VALUE_UNDEFINED; - - if (((re_ctx.saved_p[i] && re_ctx.saved_p[i + 1]) - && re_ctx.saved_p[i + 1] >= re_ctx.saved_p[i])) - { - ecma_length_t capture_str_len; - capture_str_len = (ecma_length_t) (re_ctx.saved_p[i + 1] - 
re_ctx.saved_p[i]); - ecma_string_t *capture_str_p; - - if (capture_str_len > 0) - { - capture_str_p = ecma_new_ecma_string_from_utf8 (re_ctx.saved_p[i], capture_str_len); - } - else - { - capture_str_p = ecma_get_magic_string (LIT_MAGIC_STRING__EMPTY); - } - - capture_value = ecma_make_string_value (capture_str_p); - } - - JERRY_ASSERT (!((ecma_extended_object_t *) result_array_obj_p)->u.array.is_fast_mode); - - ecma_property_value_t *prop_value_p; - prop_value_p = ecma_create_named_data_property (result_array_obj_p, - index_str_p, - ECMA_PROPERTY_CONFIGURABLE_ENUMERABLE_WRITABLE, - NULL); - prop_value_p->value = capture_value; - - JERRY_ASSERT (!ecma_is_value_object (capture_value)); - ecma_deref_ecma_string (index_str_p); - } - - ret_value = result_array; - } - else - { - ret_value = ECMA_VALUE_NULL; - } + jmem_heap_free_block (re_ctx.non_captures_p, re_ctx.non_captures_count * sizeof (ecma_regexp_non_capture_t)); + } + if (re_ctx.iterations_p != NULL) + { + jmem_heap_free_block (re_ctx.iterations_p, iters_length * sizeof (uint32_t)); } - JMEM_FINALIZE_LOCAL_ARRAY (num_of_iter_p); - JMEM_FINALIZE_LOCAL_ARRAY (saved_p); - ECMA_FINALIZE_UTF8_STRING (input_buffer_p, input_buffer_size); +cleanup_string: + if (input_flags & ECMA_STRING_FLAG_MUST_BE_FREED) + { + jmem_heap_free_block ((void *) input_buffer_p, input_size); + } return ret_value; } /* ecma_regexp_exec_helper */ diff --git a/jerry-core/ecma/operations/ecma-regexp-object.h b/jerry-core/ecma/operations/ecma-regexp-object.h index 4a56b039a..1b1c6e547 100644 --- a/jerry-core/ecma/operations/ecma-regexp-object.h +++ b/jerry-core/ecma/operations/ecma-regexp-object.h @@ -35,34 +35,66 @@ */ typedef enum { + RE_FLAG_EMPTY = 0u, /**< Empty RegExp flags */ RE_FLAG_GLOBAL = (1u << 1), /**< ECMA-262 v5, 15.10.7.2 */ RE_FLAG_IGNORE_CASE = (1u << 2), /**< ECMA-262 v5, 15.10.7.3 */ RE_FLAG_MULTILINE = (1u << 3) /**< ECMA-262 v5, 15.10.7.4 */ -} re_flags_t; +} ecma_regexp_flags_t; + +/** + * Structure for storing capturing
group results + */ +typedef struct +{ + const lit_utf8_byte_t *begin_p; /**< substring start pointer */ + const lit_utf8_byte_t *end_p; /**< substring end pointer */ +} ecma_regexp_capture_t; + +/** + * Structure for storing non-capturing group results + */ +typedef struct +{ + const lit_utf8_byte_t *str_p; /**< string pointer */ +} ecma_regexp_non_capture_t; + +#if (JERRY_STACK_LIMIT != 0) +/** + * Value used as result when stack limit is reached + */ +#define ECMA_RE_OUT_OF_STACK ((const lit_utf8_byte_t *) UINTPTR_MAX) + +/** + * Checks if the stack limit has been reached during regexp matching + */ +#define ECMA_RE_STACK_LIMIT_REACHED(p) (JERRY_UNLIKELY ((p) == ECMA_RE_OUT_OF_STACK)) +#else /* JERRY_STACK_LIMIT == 0 */ +#define ECMA_RE_STACK_LIMIT_REACHED(p) (false) +#endif /* JERRY_STACK_LIMIT != 0 */ /** * RegExp executor context */ typedef struct { - const lit_utf8_byte_t **saved_p; /**< saved result string pointers, ECMA 262 v5, 15.10.2.1, State */ - const lit_utf8_byte_t *input_start_p; /**< start of input pattern string */ - const lit_utf8_byte_t *input_end_p; /**< end of input pattern string */ - uint32_t num_of_captures; /**< number of capture groups */ - uint32_t num_of_non_captures; /**< number of non-capture groups */ - uint32_t *num_of_iterations_p; /**< number of iterations */ - uint16_t flags; /**< RegExp flags */ -} re_matcher_ctx_t; + const lit_utf8_byte_t *input_end_p; /**< end of input string */ + const lit_utf8_byte_t *input_start_p; /**< start of input string */ + uint32_t captures_count; /**< number of capture groups */ + ecma_regexp_capture_t *captures_p; /**< capturing groups */ + uint32_t non_captures_count; /**< number of non-capture groups */ + ecma_regexp_non_capture_t *non_captures_p; /**< non-capturing groups */ + uint32_t *iterations_p; /**< number of iterations */ + uint16_t flags; /**< RegExp flags */ +} ecma_regexp_ctx_t; ecma_value_t ecma_op_create_regexp_object_from_bytecode (re_compiled_code_t *bytecode_p); ecma_value_t
ecma_op_create_regexp_object (ecma_string_t *pattern_p, uint16_t flags); ecma_value_t ecma_regexp_exec_helper (ecma_value_t regexp_value, ecma_value_t input_string, bool ignore_global); ecma_value_t ecma_regexp_read_pattern_str_helper (ecma_value_t pattern_arg, ecma_string_t **pattern_string_p); -ecma_char_t re_canonicalize (ecma_char_t ch, bool is_ignorecase); -void re_set_result_array_properties (ecma_object_t *array_obj_p, ecma_string_t *input_str_p, uint32_t num_of_elements, - int32_t index); -ecma_value_t re_parse_regexp_flags (ecma_string_t *flags_str_p, uint16_t *flags_p); -void re_initialize_props (ecma_object_t *re_obj_p, ecma_string_t *source_p, uint16_t flags); +ecma_char_t ecma_regexp_canonicalize (ecma_char_t ch, bool is_ignorecase); +ecma_char_t ecma_regexp_canonicalize_char (ecma_char_t ch); +ecma_value_t ecma_regexp_parse_flags (ecma_string_t *flags_str_p, uint16_t *flags_p); +void ecma_regexp_initialize_props (ecma_object_t *re_obj_p, ecma_string_t *source_p, uint16_t flags); /** * @} diff --git a/jerry-core/lit/lit-magic-strings.inc.h b/jerry-core/lit/lit-magic-strings.inc.h index 64c341de8..be61e7810 100644 --- a/jerry-core/lit/lit-magic-strings.inc.h +++ b/jerry-core/lit/lit-magic-strings.inc.h @@ -26,9 +26,6 @@ LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_RIGHT_PAREN, ")") LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_ASTERIX_CHAR, "*") #endif LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_COMMA_CHAR, ",") -#if ENABLED (JERRY_BUILTIN_REGEXP) -LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_SLASH_CHAR, "/") -#endif LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_COLON_CHAR, ":") #if ENABLED (JERRY_BUILTIN_MATH) LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_E_U, "E") @@ -207,6 +204,9 @@ LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_CLEAR, "clear") LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_EVERY, "every") #endif LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_FALSE, "false") +#if ENABLED (JERRY_BUILTIN_REGEXP) && ENABLED (JERRY_ES2015) +LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_FLAGS, "flags") +#endif #if ENABLED 
(JERRY_BUILTIN_MATH) LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_FLOOR, "floor") #endif @@ -290,7 +290,10 @@ LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_FREEZE, "freeze") #if ENABLED (JERRY_BUILTIN_DATE) LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_GET_DAY_UL, "getDay") #endif -#if ENABLED (JERRY_BUILTIN_REGEXP) +#if !ENABLED (JERRY_ES2015) && ENABLED (JERRY_BUILTIN_REGEXP) \ +|| ENABLED (JERRY_BUILTIN_REGEXP) && ENABLED (JERRY_BUILTIN_STRING) \ +|| ENABLED (JERRY_BUILTIN_REGEXP) && ENABLED (JERRY_ES2015) \ +|| ENABLED (JERRY_BUILTIN_REGEXP) && !( ENABLED (JERRY_ES2015)) LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_GLOBAL, "global") #endif #if ENABLED (JERRY_ES2015_BUILTIN_TYPEDARRAY) @@ -319,7 +322,9 @@ LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_REPEAT, "repeat") || ENABLED (JERRY_ES2015_BUILTIN_SYMBOL) LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_SEARCH, "search") #endif -#if ENABLED (JERRY_BUILTIN_REGEXP) +#if !ENABLED (JERRY_ES2015) && ENABLED (JERRY_BUILTIN_REGEXP) \ +|| ENABLED (JERRY_BUILTIN_REGEXP) && ENABLED (JERRY_ES2015) \ +|| ENABLED (JERRY_BUILTIN_REGEXP) && !( ENABLED (JERRY_ES2015)) LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_SOURCE, "source") #endif #if ENABLED (JERRY_BUILTIN_ARRAY) @@ -518,6 +523,10 @@ LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_IS_INTEGER, "isInteger") #endif #if ENABLED (JERRY_BUILTIN_REGEXP) LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_LASTINDEX_UL, "lastIndex") +#endif +#if !ENABLED (JERRY_ES2015) && ENABLED (JERRY_BUILTIN_REGEXP) \ +|| ENABLED (JERRY_BUILTIN_REGEXP) && ENABLED (JERRY_ES2015) \ +|| ENABLED (JERRY_BUILTIN_REGEXP) && !( ENABLED (JERRY_ES2015)) LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_MULTILINE, "multiline") #endif LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_PROTOTYPE, "prototype") @@ -566,7 +575,9 @@ LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_GET_MINUTES_UL, "getMinutes") LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_GET_SECONDS_UL, "getSeconds") LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_GET_UTC_DATE_UL, "getUTCDate") #endif -#if ENABLED (JERRY_BUILTIN_REGEXP) +#if !ENABLED 
(JERRY_ES2015) && ENABLED (JERRY_BUILTIN_REGEXP) \ +|| ENABLED (JERRY_BUILTIN_REGEXP) && ENABLED (JERRY_ES2015) \ +|| ENABLED (JERRY_BUILTIN_REGEXP) && !( ENABLED (JERRY_ES2015)) LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_IGNORECASE_UL, "ignoreCase") #endif LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_PARSE_FLOAT, "parseFloat") diff --git a/jerry-core/lit/lit-magic-strings.ini b/jerry-core/lit/lit-magic-strings.ini index ec1f5e739..6662ffcf5 100644 --- a/jerry-core/lit/lit-magic-strings.ini +++ b/jerry-core/lit/lit-magic-strings.ini @@ -27,7 +27,6 @@ LIT_MAGIC_STRING_ASTERIX_CHAR = "*" LIT_MAGIC_STRING_SPACE_CHAR = " " LIT_MAGIC_STRING_RIGHT_PAREN = ")" LIT_MAGIC_STRING_COMMA_CHAR = "," -LIT_MAGIC_STRING_SLASH_CHAR = "/" LIT_MAGIC_STRING_COLON_CHAR = ":" LIT_MAGIC_STRING_E_U = "E" LIT_MAGIC_STRING_LEFT_SQUARE_CHAR = "[" @@ -102,6 +101,7 @@ LIT_MAGIC_STRING_CATCH = "catch" LIT_MAGIC_STRING_CLEAR = "clear" LIT_MAGIC_STRING_EVERY = "every" LIT_MAGIC_STRING_FALSE = "false" +LIT_MAGIC_STRING_FLAGS = "flags" LIT_MAGIC_STRING_FLOOR = "floor" LIT_MAGIC_STRING_INDEX = "index" LIT_MAGIC_STRING_INPUT = "input" diff --git a/jerry-core/parser/regexp/re-bytecode.c b/jerry-core/parser/regexp/re-bytecode.c index 2b85039ad..7d7028aaf 100644 --- a/jerry-core/parser/regexp/re-bytecode.c +++ b/jerry-core/parser/regexp/re-bytecode.c @@ -30,8 +30,18 @@ /** * Size of block of RegExp bytecode. 
Used for allocation + * + * The first allocated block also reserves room for the re_compiled_code_t header */ -#define REGEXP_BYTECODE_BLOCK_SIZE 256UL +#define REGEXP_BYTECODE_BLOCK_SIZE 64UL + +void +re_initialize_regexp_bytecode (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */ +{ + bc_ctx_p->block_start_p = jmem_heap_alloc_block (REGEXP_BYTECODE_BLOCK_SIZE); + bc_ctx_p->block_end_p = bc_ctx_p->block_start_p + REGEXP_BYTECODE_BLOCK_SIZE; + bc_ctx_p->current_p = bc_ctx_p->block_start_p + sizeof (re_compiled_code_t); +} /* re_initialize_regexp_bytecode */ /** * Realloc the bytecode container @@ -42,26 +52,22 @@ static uint8_t * re_realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */ { JERRY_ASSERT (bc_ctx_p->block_end_p >= bc_ctx_p->block_start_p); - size_t old_size = (size_t) (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p); + const size_t old_size = (size_t) (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p); /* If one of the members of RegExp bytecode context is NULL, then all member should be NULL * (it means first allocation), otherwise all of the members should be a non NULL pointer.
*/ JERRY_ASSERT ((!bc_ctx_p->current_p && !bc_ctx_p->block_end_p && !bc_ctx_p->block_start_p) || (bc_ctx_p->current_p && bc_ctx_p->block_end_p && bc_ctx_p->block_start_p)); - size_t new_block_size = old_size + REGEXP_BYTECODE_BLOCK_SIZE; + const size_t new_size = old_size + REGEXP_BYTECODE_BLOCK_SIZE; JERRY_ASSERT (bc_ctx_p->current_p >= bc_ctx_p->block_start_p); - size_t current_ptr_offset = (size_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p); + const size_t current_ptr_offset = (size_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p); - uint8_t *new_block_start_p = (uint8_t *) jmem_heap_alloc_block (new_block_size); - if (bc_ctx_p->current_p) - { - memcpy (new_block_start_p, bc_ctx_p->block_start_p, (size_t) (current_ptr_offset)); - jmem_heap_free_block (bc_ctx_p->block_start_p, old_size); - } - bc_ctx_p->block_start_p = new_block_start_p; - bc_ctx_p->block_end_p = new_block_start_p + new_block_size; - bc_ctx_p->current_p = new_block_start_p + current_ptr_offset; + bc_ctx_p->block_start_p = jmem_heap_realloc_block (bc_ctx_p->block_start_p, + old_size, + new_size); + bc_ctx_p->block_end_p = bc_ctx_p->block_start_p + new_size; + bc_ctx_p->current_p = bc_ctx_p->block_start_p + current_ptr_offset; return bc_ctx_p->current_p; } /* re_realloc_regexp_bytecode_block */ @@ -69,54 +75,71 @@ re_realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytec /** * Append a new bytecode to the and of the bytecode container */ -static void -re_bytecode_list_append (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - uint8_t *bytecode_p, /**< input bytecode */ - size_t length) /**< length of input */ +static uint8_t * +re_bytecode_reserve (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + const size_t size) /**< size */ { - JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE); + JERRY_ASSERT (size <= REGEXP_BYTECODE_BLOCK_SIZE); uint8_t *current_p = bc_ctx_p->current_p; - if (current_p + length > bc_ctx_p->block_end_p) + if (current_p + 
size > bc_ctx_p->block_end_p) { current_p = re_realloc_regexp_bytecode_block (bc_ctx_p); } - memcpy (current_p, bytecode_p, length); - bc_ctx_p->current_p += length; -} /* re_bytecode_list_append */ + bc_ctx_p->current_p += size; + return current_p; +} /* re_bytecode_reserve */ /** * Insert a new bytecode to the bytecode container */ -void -re_bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - size_t offset, /**< distance from the start of the container */ - uint8_t *bytecode_p, /**< input bytecode */ - size_t length) /**< length of input */ +static void +re_bytecode_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + const size_t offset, /**< distance from the start of the container */ + const size_t size) /**< size */ { - JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE); + JERRY_ASSERT (size <= REGEXP_BYTECODE_BLOCK_SIZE); uint8_t *current_p = bc_ctx_p->current_p; - if (current_p + length > bc_ctx_p->block_end_p) + if (current_p + size > bc_ctx_p->block_end_p) { re_realloc_regexp_bytecode_block (bc_ctx_p); } - uint8_t *src_p = bc_ctx_p->block_start_p + offset; - if ((re_get_bytecode_length (bc_ctx_p) - offset) > 0) + uint8_t *dest_p = bc_ctx_p->block_start_p + offset; + const size_t bytecode_length = re_get_bytecode_length (bc_ctx_p); + if (bytecode_length - offset > 0) { - uint8_t *dest_p = src_p + length; - uint8_t *tmp_block_start_p; - tmp_block_start_p = (uint8_t *) jmem_heap_alloc_block (re_get_bytecode_length (bc_ctx_p) - offset); - memcpy (tmp_block_start_p, src_p, (size_t) (re_get_bytecode_length (bc_ctx_p) - offset)); - memcpy (dest_p, tmp_block_start_p, (size_t) (re_get_bytecode_length (bc_ctx_p) - offset)); - jmem_heap_free_block (tmp_block_start_p, re_get_bytecode_length (bc_ctx_p) - offset); + memmove (dest_p + size, dest_p, bytecode_length - offset); } - memcpy (src_p, bytecode_p, length); - bc_ctx_p->current_p += length; -} /* re_bytecode_list_insert */ + bc_ctx_p->current_p += size; +} /* 
re_bytecode_insert */ + +/** + * Encode ecma_char_t into bytecode + */ +static void +re_encode_char (uint8_t *dest_p, /**< destination */ + const ecma_char_t c) /**< character */ +{ + *dest_p++ = (uint8_t) ((c >> 8) & 0xFF); + *dest_p = (uint8_t) (c & 0xFF); +} /* re_encode_char */ + +/** + * Encode uint32_t into bytecode + */ +static void +re_encode_u32 (uint8_t *dest_p, /**< destination */ + const uint32_t u) /**< uint32 value */ +{ + *dest_p++ = (uint8_t) ((u >> 24) & 0xFF); + *dest_p++ = (uint8_t) ((u >> 16) & 0xFF); + *dest_p++ = (uint8_t) ((u >> 8) & 0xFF); + *dest_p = (uint8_t) (u & 0xFF); +} /* re_encode_u32 */ /** * Get a character from the RegExp bytecode and increase the bytecode position @@ -124,10 +147,12 @@ re_bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode conte * @return ecma character */ inline ecma_char_t JERRY_ATTR_ALWAYS_INLINE -re_get_char (uint8_t **bc_p) /**< pointer to bytecode start */ +re_get_char (const uint8_t **bc_p) /**< pointer to bytecode start */ { - ecma_char_t chr; - memcpy (&chr, *bc_p, sizeof (ecma_char_t)); + const uint8_t *src_p = *bc_p; + ecma_char_t chr = (ecma_char_t) *src_p++; + chr = (ecma_char_t) (chr << 8); + chr = (ecma_char_t) (chr | *src_p); (*bc_p) += sizeof (ecma_char_t); return chr; } /* re_get_char */ @@ -138,11 +163,9 @@ re_get_char (uint8_t **bc_p) /**< pointer to bytecode start */ * @return current RegExp opcode */ inline re_opcode_t JERRY_ATTR_ALWAYS_INLINE -re_get_opcode (uint8_t **bc_p) /**< pointer to bytecode start */ +re_get_opcode (const uint8_t **bc_p) /**< pointer to bytecode start */ { - uint8_t bytecode = **bc_p; - (*bc_p) += sizeof (uint8_t); - return (re_opcode_t) bytecode; + return (re_opcode_t) *((*bc_p)++); } /* re_get_opcode */ /** @@ -151,10 +174,17 @@ re_get_opcode (uint8_t **bc_p) /**< pointer to bytecode start */ * @return opcode parameter */ inline uint32_t JERRY_ATTR_ALWAYS_INLINE -re_get_value (uint8_t **bc_p) /**< pointer to bytecode start */ +re_get_value 
(const uint8_t **bc_p) /**< pointer to bytecode start */ { - uint32_t value; - memcpy (&value, *bc_p, sizeof (uint32_t)); + const uint8_t *src_p = *bc_p; + uint32_t value = (uint32_t) (*src_p++); + value <<= 8; + value |= ((uint32_t) (*src_p++)); + value <<= 8; + value |= ((uint32_t) (*src_p++)); + value <<= 8; + value |= ((uint32_t) (*src_p++)); + (*bc_p) += sizeof (uint32_t); return value; } /* re_get_value */ @@ -175,9 +205,10 @@ re_get_bytecode_length (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode contex */ void re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - re_opcode_t opcode) /**< input opcode */ + const re_opcode_t opcode) /**< input opcode */ { - re_bytecode_list_append (bc_ctx_p, (uint8_t *) &opcode, sizeof (uint8_t)); + uint8_t *dest_p = re_bytecode_reserve (bc_ctx_p, sizeof (uint8_t)); + *dest_p = (uint8_t) opcode; } /* re_append_opcode */ /** @@ -185,9 +216,10 @@ re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ */ void re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - uint32_t value) /**< input value */ + const uint32_t value) /**< input value */ { - re_bytecode_list_append (bc_ctx_p, (uint8_t *) &value, sizeof (uint32_t)); + uint8_t *dest_p = re_bytecode_reserve (bc_ctx_p, sizeof (uint32_t)); + re_encode_u32 (dest_p, value); } /* re_append_u32 */ /** @@ -195,9 +227,10 @@ re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ */ void re_append_char (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - ecma_char_t input_char) /**< input char */ + const ecma_char_t input_char) /**< input char */ { - re_bytecode_list_append (bc_ctx_p, (uint8_t *) &input_char, sizeof (ecma_char_t)); + uint8_t *dest_p = re_bytecode_reserve (bc_ctx_p, sizeof (ecma_char_t)); + re_encode_char (dest_p, input_char); } /* re_append_char */ /** @@ -216,10 +249,11 @@ re_append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ void 
re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - uint32_t offset, /**< distance from the start of the container */ - re_opcode_t opcode) /**< input opcode */ + const uint32_t offset, /**< distance from the start of the container */ + const re_opcode_t opcode) /**< input opcode */ { - re_bytecode_list_insert (bc_ctx_p, offset, (uint8_t *) &opcode, sizeof (uint8_t)); + re_bytecode_insert (bc_ctx_p, offset, sizeof (uint8_t)); + *(bc_ctx_p->block_start_p + offset) = (uint8_t) opcode; } /* re_insert_opcode */ /** @@ -230,7 +264,8 @@ re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ uint32_t offset, /**< distance from the start of the container */ uint32_t value) /**< input value */ { - re_bytecode_list_insert (bc_ctx_p, offset, (uint8_t *) &value, sizeof (uint32_t)); + re_bytecode_insert (bc_ctx_p, offset, sizeof (uint32_t)); + re_encode_u32 (bc_ctx_p->block_start_p + offset, value); } /* re_insert_u32 */ #if ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) @@ -242,10 +277,10 @@ re_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */ { re_compiled_code_t *compiled_code_p = (re_compiled_code_t *) bc_ctx_p->block_start_p; JERRY_DEBUG_MSG ("%d ", compiled_code_p->header.status_flags); - JERRY_DEBUG_MSG ("%d ", compiled_code_p->num_of_captures); - JERRY_DEBUG_MSG ("%d | ", compiled_code_p->num_of_non_captures); + JERRY_DEBUG_MSG ("%d ", compiled_code_p->captures_count); + JERRY_DEBUG_MSG ("%d | ", compiled_code_p->non_captures_count); - uint8_t *bytecode_p = (uint8_t *) (compiled_code_p + 1); + const uint8_t *bytecode_p = (const uint8_t *) (compiled_code_p + 1); re_opcode_t op; while ((op = re_get_opcode (&bytecode_p))) diff --git a/jerry-core/parser/regexp/re-bytecode.h b/jerry-core/parser/regexp/re-bytecode.h index 4789bc259..715170bb1 100644 --- a/jerry-core/parser/regexp/re-bytecode.h +++ b/jerry-core/parser/regexp/re-bytecode.h @@ -85,9 +85,9 @@ typedef enum typedef struct { ecma_compiled_code_t 
header; /**< compiled code header */ - ecma_value_t pattern; /**< original RegExp pattern */ - uint32_t num_of_captures; /**< number of capturing brackets */ - uint32_t num_of_non_captures; /**< number of non capturing brackets */ + ecma_value_t source; /**< original RegExp pattern */ + uint32_t captures_count; /**< number of capturing brackets */ + uint32_t non_captures_count; /**< number of non capturing brackets */ } re_compiled_code_t; /** @@ -100,19 +100,20 @@ typedef struct uint8_t *current_p; /**< current position in bytecode */ } re_bytecode_ctx_t; -re_opcode_t re_get_opcode (uint8_t **bc_p); -ecma_char_t re_get_char (uint8_t **bc_p); -uint32_t re_get_value (uint8_t **bc_p); +re_opcode_t re_get_opcode (const uint8_t **bc_p); +ecma_char_t re_get_char (const uint8_t **bc_p); +uint32_t re_get_value (const uint8_t **bc_p); uint32_t JERRY_ATTR_PURE re_get_bytecode_length (re_bytecode_ctx_t *bc_ctx_p); -void re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, re_opcode_t opcode); -void re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, uint32_t value); -void re_append_char (re_bytecode_ctx_t *bc_ctx_p, ecma_char_t input_char); +void re_initialize_regexp_bytecode (re_bytecode_ctx_t *bc_ctx_p); + +void re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, const re_opcode_t opcode); +void re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, const uint32_t value); +void re_append_char (re_bytecode_ctx_t *bc_ctx_p, const ecma_char_t input_char); void re_append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, uint32_t value); -void re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, uint32_t offset, re_opcode_t opcode); -void re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, uint32_t offset, uint32_t value); -void re_bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, size_t offset, uint8_t *bytecode_p, size_t length); +void re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, const uint32_t offset, const re_opcode_t opcode); +void re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, const uint32_t offset, const uint32_t value);
#if ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) void re_dump_bytecode (re_bytecode_ctx_t *bc_ctx); diff --git a/jerry-core/parser/regexp/re-compiler.c b/jerry-core/parser/regexp/re-compiler.c index d6b8071fa..971da567c 100644 --- a/jerry-core/parser/regexp/re-compiler.c +++ b/jerry-core/parser/regexp/re-compiler.c @@ -17,6 +17,7 @@ #include "ecma-helpers.h" #include "ecma-regexp-object.h" #include "ecma-try-catch-macro.h" +#include "lit-char-helpers.h" #include "jcontext.h" #include "jrt-libc-includes.h" #include "jmem.h" @@ -36,20 +37,6 @@ * @{ */ -/** - * Callback function of character class generation - */ -static void -re_append_char_class (void *re_ctx_p, /**< RegExp compiler context */ - ecma_char_t start, /**< character class range from */ - ecma_char_t end) /**< character class range to */ -{ - re_compiler_ctx_t *ctx_p = (re_compiler_ctx_t *) re_ctx_p; - re_append_char (ctx_p->bytecode_ctx_p, start); - re_append_char (ctx_p->bytecode_ctx_p, end); - ctx_p->parser_ctx_p->num_of_classes++; -} /* re_append_char_class */ - /** * Insert simple atom iterator * @@ -234,6 +221,276 @@ re_insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compile return re_insert_into_group (re_ctx_p, group_start_offset, idx, is_capturable); } /* re_insert_into_group_with_jump */ +/** + * Append a character class range to the bytecode + */ +static void +re_append_char_class (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + ecma_char_t start, /**< character class range from */ + ecma_char_t end) /**< character class range to */ +{ + re_append_char (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (start, re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); + re_append_char (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (end, re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); + re_ctx_p->parser_ctx_p->classes_count++; +} /*
re_append_char_class */ + +/** + * Read the input pattern and parse the range of character class + * + * @return empty ecma value - if parsed successfully + * error ecma value - otherwise + * + * Returned value must be freed with ecma_free_value + */ +static ecma_value_t +re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */ + re_token_t *out_token_p) /**< [out] output token */ +{ + re_parser_ctx_t *const parser_ctx_p = re_ctx_p->parser_ctx_p; + out_token_p->qmax = out_token_p->qmin = 1; + parser_ctx_p->classes_count = 0; + + ecma_char_t start = LIT_CHAR_UNDEF; + bool is_range = false; + const bool is_char_class = (re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS + || re_ctx_p->current_token.type == RE_TOK_START_INV_CHAR_CLASS); + + const ecma_char_t prev_char = lit_utf8_peek_prev (parser_ctx_p->input_curr_p); + if (prev_char != LIT_CHAR_LEFT_SQUARE && prev_char != LIT_CHAR_CIRCUMFLEX) + { + lit_utf8_decr (&parser_ctx_p->input_curr_p); + lit_utf8_decr (&parser_ctx_p->input_curr_p); + } + + do + { + if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string")); + } + + ecma_char_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p); + + if (ch == LIT_CHAR_RIGHT_SQUARE) + { + if (start != LIT_CHAR_UNDEF) + { + re_append_char_class (re_ctx_p, start, start); + } + break; + } + else if (ch == LIT_CHAR_MINUS) + { + if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '-'")); + } + + if (start != LIT_CHAR_UNDEF + && !is_range + && *parser_ctx_p->input_curr_p != LIT_CHAR_RIGHT_SQUARE) + { + is_range = true; + continue; + } + } + else if (ch == LIT_CHAR_BACKSLASH) + { + if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\'")); + } + + ch = 
lit_utf8_read_next (&parser_ctx_p->input_curr_p); + + if (ch == LIT_CHAR_LOWERCASE_B) + { + ch = LIT_CHAR_BS; + } + else if (ch == LIT_CHAR_LOWERCASE_F) + { + ch = LIT_CHAR_FF; + } + else if (ch == LIT_CHAR_LOWERCASE_N) + { + ch = LIT_CHAR_LF; + } + else if (ch == LIT_CHAR_LOWERCASE_T) + { + ch = LIT_CHAR_TAB; + } + else if (ch == LIT_CHAR_LOWERCASE_R) + { + ch = LIT_CHAR_CR; + } + else if (ch == LIT_CHAR_LOWERCASE_V) + { + ch = LIT_CHAR_VTAB; + } + else if (ch == LIT_CHAR_LOWERCASE_C) + { + if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p) + { + ch = *parser_ctx_p->input_curr_p; + + if ((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END) + || (ch >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END) + || (ch >= LIT_CHAR_0 && ch <= LIT_CHAR_9)) + { + /* See ECMA-262 v5, 15.10.2.10 (Point 3) */ + ch = (ch % 32); + parser_ctx_p->input_curr_p++; + } + else + { + ch = LIT_CHAR_LOWERCASE_C; + } + } + } + else if (ch == LIT_CHAR_LOWERCASE_X && re_hex_lookup (parser_ctx_p, 2)) + { + ecma_char_t code_unit; + + if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 2, &code_unit)) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\x'")); + } + + parser_ctx_p->input_curr_p += 2; + if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p + && is_range == false + && lit_utf8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS) + { + start = code_unit; + continue; + } + + ch = code_unit; + } + else if (ch == LIT_CHAR_LOWERCASE_U && re_hex_lookup (parser_ctx_p, 4)) + { + ecma_char_t code_unit; + + if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 4, &code_unit)) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\u'")); + } + + parser_ctx_p->input_curr_p += 4; + if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p + && is_range == false + && lit_utf8_peek_next 
(parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS) + { + start = code_unit; + continue; + } + + ch = code_unit; + } + else if (ch == LIT_CHAR_LOWERCASE_D) + { + /* See ECMA-262 v5, 15.10.2.12 */ + re_append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_BEGIN, LIT_CHAR_ASCII_DIGITS_END); + ch = LIT_CHAR_UNDEF; + } + else if (ch == LIT_CHAR_UPPERCASE_D) + { + /* See ECMA-262 v5, 15.10.2.12 */ + re_append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_ASCII_DIGITS_BEGIN - 1); + re_append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_END + 1, LIT_UTF16_CODE_UNIT_MAX); + ch = LIT_CHAR_UNDEF; + } + else if (ch == LIT_CHAR_LOWERCASE_S) + { + /* See ECMA-262 v5, 15.10.2.12 */ + re_append_char_class (re_ctx_p, LIT_CHAR_TAB, LIT_CHAR_CR); + re_append_char_class (re_ctx_p, LIT_CHAR_SP, LIT_CHAR_SP); + re_append_char_class (re_ctx_p, LIT_CHAR_NBSP, LIT_CHAR_NBSP); + re_append_char_class (re_ctx_p, 0x1680UL, 0x1680UL); /* Ogham Space Mark */ + re_append_char_class (re_ctx_p, 0x180EUL, 0x180EUL); /* Mongolian Vowel Separator */ + re_append_char_class (re_ctx_p, 0x2000UL, 0x200AUL); /* En Quad - Hair Space */ + re_append_char_class (re_ctx_p, LIT_CHAR_LS, LIT_CHAR_PS); + re_append_char_class (re_ctx_p, 0x202FUL, 0x202FUL); /* Narrow No-Break Space */ + re_append_char_class (re_ctx_p, 0x205FUL, 0x205FUL); /* Medium Mathematical Space */ + re_append_char_class (re_ctx_p, 0x3000UL, 0x3000UL); /* Ideographic Space */ + re_append_char_class (re_ctx_p, LIT_CHAR_BOM, LIT_CHAR_BOM); + ch = LIT_CHAR_UNDEF; + } + else if (ch == LIT_CHAR_UPPERCASE_S) + { + /* See ECMA-262 v5, 15.10.2.12 */ + re_append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_TAB - 1); + re_append_char_class (re_ctx_p, LIT_CHAR_CR + 1, LIT_CHAR_SP - 1); + re_append_char_class (re_ctx_p, LIT_CHAR_SP + 1, LIT_CHAR_NBSP - 1); + re_append_char_class (re_ctx_p, LIT_CHAR_NBSP + 1, 0x167FUL); + re_append_char_class (re_ctx_p, 0x1681UL, 0x180DUL); + re_append_char_class (re_ctx_p, 0x180FUL, 0x1FFFUL); + re_append_char_class (re_ctx_p, 
0x200BUL, LIT_CHAR_LS - 1); + re_append_char_class (re_ctx_p, LIT_CHAR_PS + 1, 0x202EUL); + re_append_char_class (re_ctx_p, 0x2030UL, 0x205EUL); + re_append_char_class (re_ctx_p, 0x2060UL, 0x2FFFUL); + re_append_char_class (re_ctx_p, 0x3001UL, LIT_CHAR_BOM - 1); + re_append_char_class (re_ctx_p, LIT_CHAR_BOM + 1, LIT_UTF16_CODE_UNIT_MAX); + ch = LIT_CHAR_UNDEF; + } + else if (ch == LIT_CHAR_LOWERCASE_W) + { + /* See ECMA-262 v5, 15.10.2.12 */ + re_append_char_class (re_ctx_p, LIT_CHAR_0, LIT_CHAR_9); + re_append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_A, LIT_CHAR_UPPERCASE_Z); + re_append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE, LIT_CHAR_UNDERSCORE); + re_append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_A, LIT_CHAR_LOWERCASE_Z); + ch = LIT_CHAR_UNDEF; + } + else if (ch == LIT_CHAR_UPPERCASE_W) + { + /* See ECMA-262 v5, 15.10.2.12 */ + re_append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_0 - 1); + re_append_char_class (re_ctx_p, LIT_CHAR_9 + 1, LIT_CHAR_UPPERCASE_A - 1); + re_append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_Z + 1, LIT_CHAR_UNDERSCORE - 1); + re_append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE + 1, LIT_CHAR_LOWERCASE_A - 1); + re_append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_Z + 1, LIT_UTF16_CODE_UNIT_MAX); + ch = LIT_CHAR_UNDEF; + } + else if (lit_char_is_octal_digit ((ecma_char_t) ch) + && ch != LIT_CHAR_0) + { + lit_utf8_decr (&parser_ctx_p->input_curr_p); + ch = (ecma_char_t) re_parse_octal (parser_ctx_p); + } + } /* ch == LIT_CHAR_BACKSLASH */ + + if (start != LIT_CHAR_UNDEF) + { + if (is_range) + { + if (start > ch) + { + return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, wrong order")); + } + else + { + re_append_char_class (re_ctx_p, start, ch); + start = LIT_CHAR_UNDEF; + is_range = false; + } + } + else + { + re_append_char_class (re_ctx_p, start, start); + start = ch; + } + } + else + { + start = ch; + } + } + while (is_char_class); + + return re_parse_iterator (parser_ctx_p, out_token_p); +} /* 
re_parse_char_class */ + /** * Parse alternatives * @@ -251,18 +508,17 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context re_bytecode_ctx_t *bc_ctx_p = re_ctx_p->bytecode_ctx_p; ecma_value_t ret_value = ECMA_VALUE_EMPTY; - uint32_t alterantive_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); - bool should_loop = true; + uint32_t alternative_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); - while (ecma_is_value_empty (ret_value) && should_loop) + while (ecma_is_value_empty (ret_value)) { ecma_value_t next_token_result = re_parse_next_token (re_ctx_p->parser_ctx_p, &(re_ctx_p->current_token)); if (ECMA_IS_VALUE_ERROR (next_token_result)) { - ret_value = next_token_result; - break; + return next_token_result; } + JERRY_ASSERT (ecma_is_value_empty (next_token_result)); uint32_t new_atom_start_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); @@ -271,7 +527,7 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context { case RE_TOK_START_CAPTURE_GROUP: { - idx = re_ctx_p->num_of_captures++; + idx = re_ctx_p->captures_count++; JERRY_TRACE_MSG ("Compile a capture group start (idx: %u)\n", (unsigned int) idx); ret_value = re_parse_alternative (re_ctx_p, false); @@ -285,7 +541,7 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context } case RE_TOK_START_NON_CAPTURE_GROUP: { - idx = re_ctx_p->num_of_non_captures++; + idx = re_ctx_p->non_captures_count++; JERRY_TRACE_MSG ("Compile a non-capture group start (idx: %u)\n", (unsigned int) idx); ret_value = re_parse_alternative (re_ctx_p, false); @@ -304,8 +560,8 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context (unsigned int) re_ctx_p->current_token.qmax); re_append_opcode (bc_ctx_p, RE_OP_CHAR); - re_append_char (bc_ctx_p, re_canonicalize ((ecma_char_t) re_ctx_p->current_token.value, - re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); + re_append_char (bc_ctx_p, ecma_regexp_canonicalize 
((ecma_char_t) re_ctx_p->current_token.value, + re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); ret_value = re_insert_simple_iterator (re_ctx_p, new_atom_start_offset); break; @@ -321,9 +577,9 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context case RE_TOK_ALTERNATIVE: { JERRY_TRACE_MSG ("Compile an alternative\n"); - re_insert_u32 (bc_ctx_p, alterantive_offset, re_get_bytecode_length (bc_ctx_p) - alterantive_offset); + re_insert_u32 (bc_ctx_p, alternative_offset, re_get_bytecode_length (bc_ctx_p) - alternative_offset); re_append_opcode (bc_ctx_p, RE_OP_ALTERNATIVE); - alterantive_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); + alternative_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); break; } case RE_TOK_ASSERT_START: @@ -353,7 +609,7 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context case RE_TOK_ASSERT_START_POS_LOOKAHEAD: { JERRY_TRACE_MSG ("Compile a positive lookahead assertion\n"); - idx = re_ctx_p->num_of_non_captures++; + idx = re_ctx_p->non_captures_count++; re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_POS); ret_value = re_parse_alternative (re_ctx_p, false); @@ -370,7 +626,7 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context case RE_TOK_ASSERT_START_NEG_LOOKAHEAD: { JERRY_TRACE_MSG ("Compile a negative lookahead assertion\n"); - idx = re_ctx_p->num_of_non_captures++; + idx = re_ctx_p->non_captures_count++; re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_NEG); ret_value = re_parse_alternative (re_ctx_p, false); @@ -387,7 +643,7 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context case RE_TOK_BACKREFERENCE: { uint32_t backref = (uint32_t) re_ctx_p->current_token.value; - idx = re_ctx_p->num_of_non_captures++; + idx = re_ctx_p->non_captures_count++; if (backref > re_ctx_p->highest_backref) { @@ -417,14 +673,12 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context : RE_OP_CHAR_CLASS); 
uint32_t offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); - ret_value = re_parse_char_class (re_ctx_p->parser_ctx_p, - re_append_char_class, - re_ctx_p, + ret_value = re_parse_char_class (re_ctx_p, &(re_ctx_p->current_token)); if (!ECMA_IS_VALUE_ERROR (ret_value)) { - re_insert_u32 (bc_ctx_p, offset, re_ctx_p->parser_ctx_p->num_of_classes); + re_insert_u32 (bc_ctx_p, offset, re_ctx_p->parser_ctx_p->classes_count); ret_value = re_insert_simple_iterator (re_ctx_p, new_atom_start_offset); } @@ -436,33 +690,25 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context if (expect_eof) { - ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected end of paren.")); + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected end of paren.")); } - else - { - re_insert_u32 (bc_ctx_p, alterantive_offset, re_get_bytecode_length (bc_ctx_p) - alterantive_offset); - should_loop = false; - } - break; + + re_insert_u32 (bc_ctx_p, alternative_offset, re_get_bytecode_length (bc_ctx_p) - alternative_offset); + return ECMA_VALUE_EMPTY; } case RE_TOK_EOF: { if (!expect_eof) { - ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected end of pattern.")); - } - else - { - re_insert_u32 (bc_ctx_p, alterantive_offset, re_get_bytecode_length (bc_ctx_p) - alterantive_offset); - should_loop = false; + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected end of pattern.")); } - break; + re_insert_u32 (bc_ctx_p, alternative_offset, re_get_bytecode_length (bc_ctx_p) - alternative_offset); + return ECMA_VALUE_EMPTY; } default: { - ret_value = ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected RegExp token.")); - break; + return ecma_raise_syntax_error (ECMA_ERR_MSG ("Unexpected RegExp token.")); } } } @@ -488,7 +734,7 @@ re_find_bytecode_in_cache (ecma_string_t *pattern_str_p, /**< pattern string */ if (cached_bytecode_p != NULL) { - ecma_string_t *cached_pattern_str_p = ecma_get_string_from_value (cached_bytecode_p->pattern); + ecma_string_t 
*cached_pattern_str_p = ecma_get_string_from_value (cached_bytecode_p->source); if ((cached_bytecode_p->header.status_flags & RE_FLAGS_MASK) == flags && ecma_compare_ecma_strings (cached_pattern_str_p, pattern_str_p)) @@ -559,14 +805,11 @@ re_compile_bytecode (const re_compiled_code_t **out_bytecode_p, /**< [out] point re_compiler_ctx_t re_ctx; re_ctx.flags = flags; re_ctx.highest_backref = 0; - re_ctx.num_of_non_captures = 0; + re_ctx.non_captures_count = 0; re_bytecode_ctx_t bc_ctx; - bc_ctx.block_start_p = NULL; - bc_ctx.block_end_p = NULL; - bc_ctx.current_p = NULL; - re_ctx.bytecode_ctx_p = &bc_ctx; + re_initialize_regexp_bytecode (&bc_ctx); ECMA_STRING_TO_UTF8_STRING (pattern_str_p, pattern_start_p, pattern_start_size); @@ -574,23 +817,23 @@ re_compile_bytecode (const re_compiled_code_t **out_bytecode_p, /**< [out] point parser_ctx.input_start_p = pattern_start_p; parser_ctx.input_curr_p = (lit_utf8_byte_t *) pattern_start_p; parser_ctx.input_end_p = pattern_start_p + pattern_start_size; - parser_ctx.num_of_groups = -1; + parser_ctx.groups_count = -1; re_ctx.parser_ctx_p = &parser_ctx; - /* 1. Parse RegExp pattern */ - re_ctx.num_of_captures = 1; + /* Parse RegExp pattern */ + re_ctx.captures_count = 1; re_append_opcode (&bc_ctx, RE_OP_SAVE_AT_START); - ecma_value_t parse_alt_result = re_parse_alternative (&re_ctx, true); + ecma_value_t result = re_parse_alternative (&re_ctx, true); ECMA_FINALIZE_UTF8_STRING (pattern_start_p, pattern_start_size); - if (ECMA_IS_VALUE_ERROR (parse_alt_result)) + if (ECMA_IS_VALUE_ERROR (result)) { - ret_value = parse_alt_result; + ret_value = result; } - /* 2. 
Check for invalid backreference */ - else if (re_ctx.highest_backref >= re_ctx.num_of_captures) + /* Check for invalid backreference */ + else if (re_ctx.highest_backref >= re_ctx.captures_count) { ret_value = ecma_raise_syntax_error ("Invalid backreference.\n"); } @@ -599,20 +842,14 @@ re_compile_bytecode (const re_compiled_code_t **out_bytecode_p, /**< [out] point re_append_opcode (&bc_ctx, RE_OP_SAVE_AND_MATCH); re_append_opcode (&bc_ctx, RE_OP_EOF); - /* 3. Insert extra informations for bytecode header */ - re_compiled_code_t re_compiled_code; - - re_compiled_code.header.refs = 1; - re_compiled_code.header.status_flags = re_ctx.flags; + /* Initialize bytecode header */ + re_compiled_code_t *re_compiled_code_p = (re_compiled_code_t *) bc_ctx.block_start_p; + re_compiled_code_p->header.refs = 1; + re_compiled_code_p->header.status_flags = re_ctx.flags; ecma_ref_ecma_string (pattern_str_p); - re_compiled_code.pattern = ecma_make_string_value (pattern_str_p); - re_compiled_code.num_of_captures = re_ctx.num_of_captures * 2; - re_compiled_code.num_of_non_captures = re_ctx.num_of_non_captures; - - re_bytecode_list_insert (&bc_ctx, - 0, - (uint8_t *) &re_compiled_code, - sizeof (re_compiled_code_t)); + re_compiled_code_p->source = ecma_make_string_value (pattern_str_p); + re_compiled_code_p->captures_count = re_ctx.captures_count; + re_compiled_code_p->non_captures_count = re_ctx.non_captures_count; } size_t byte_code_size = (size_t) (bc_ctx.block_end_p - bc_ctx.block_start_p); @@ -633,10 +870,7 @@ re_compile_bytecode (const re_compiled_code_t **out_bytecode_p, /**< [out] point } #endif /* ENABLED (JERRY_REGEXP_DUMP_BYTE_CODE) */ - /* The RegExp bytecode contains at least a RE_OP_SAVE_AT_START opdoce, so it cannot be NULL. 
*/ - JERRY_ASSERT (bc_ctx.block_start_p != NULL); *out_bytecode_p = (re_compiled_code_t *) bc_ctx.block_start_p; - ((re_compiled_code_t *) bc_ctx.block_start_p)->header.size = (uint16_t) (byte_code_size >> JMEM_ALIGNMENT_LOG); if (cache_idx == RE_CACHE_SIZE) diff --git a/jerry-core/parser/regexp/re-compiler.h b/jerry-core/parser/regexp/re-compiler.h index df8194edc..8dd2a72e1 100644 --- a/jerry-core/parser/regexp/re-compiler.h +++ b/jerry-core/parser/regexp/re-compiler.h @@ -38,8 +38,8 @@ typedef struct { uint16_t flags; /**< RegExp flags */ - uint32_t num_of_captures; /**< number of capture groups */ - uint32_t num_of_non_captures; /**< number of non-capture groups */ + uint32_t captures_count; /**< number of capture groups */ + uint32_t non_captures_count; /**< number of non-capture groups */ uint32_t highest_backref; /**< highest backreference */ re_bytecode_ctx_t *bytecode_ctx_p; /**< pointer of RegExp bytecode context */ re_token_t current_token; /**< current token */ diff --git a/jerry-core/parser/regexp/re-parser.c b/jerry-core/parser/regexp/re-parser.c index dad0b2190..c766fd450 100644 --- a/jerry-core/parser/regexp/re-parser.c +++ b/jerry-core/parser/regexp/re-parser.c @@ -40,26 +40,26 @@ * @return true - if lookup number of characters ahead are hex digits * false - otherwise */ -static bool +bool re_hex_lookup (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */ uint32_t lookup) /**< size of lookup */ { - bool is_digit = true; const lit_utf8_byte_t *curr_p = parser_ctx_p->input_curr_p; - for (uint32_t i = 0; is_digit && i < lookup; i++) + if (JERRY_UNLIKELY (curr_p + lookup > parser_ctx_p->input_end_p)) { - if (curr_p < parser_ctx_p->input_end_p) - { - is_digit = lit_char_is_hex_digit (*curr_p++); - } - else + return false; + } + + for (uint32_t i = 0; i < lookup; i++) + { + if (!lit_char_is_hex_digit (*curr_p++)) { return false; } } - return is_digit; + return true; } /* re_hex_lookup */ /** @@ -86,7 +86,7 @@ re_parse_non_greedy_char 
(re_parser_ctx_t *parser_ctx_p) /**< RegExp parser cont * * @return uint32_t - parsed octal number */ -static uint32_t +uint32_t re_parse_octal (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */ { uint32_t number = 0; @@ -110,7 +110,7 @@ re_parse_octal (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */ * * Returned value must be freed with ecma_free_value */ -static ecma_value_t +ecma_value_t re_parse_iterator (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context */ re_token_t *re_token_p) /**< [out] output token */ { @@ -253,7 +253,7 @@ static void re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */ { int char_class_in = 0; - parser_ctx_p->num_of_groups = 0; + parser_ctx_p->groups_count = 0; const lit_utf8_byte_t *curr_p = parser_ctx_p->input_start_p; while (curr_p < parser_ctx_p->input_end_p) @@ -287,7 +287,7 @@ re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser contex && *curr_p != LIT_CHAR_QUESTION && !char_class_in) { - parser_ctx_p->num_of_groups++; + parser_ctx_p->groups_count++; } break; } @@ -295,264 +295,6 @@ re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser contex } } /* re_count_num_of_groups */ -/** - * Read the input pattern and parse the range of character class - * - * @return empty ecma value - if parsed successfully - * error ecma value - otherwise - * - * Returned value must be freed with ecma_free_value - */ -ecma_value_t -re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */ - re_char_class_callback append_char_class, /**< callback function, - * which adds the char-ranges - * to the bytecode */ - void *re_ctx_p, /**< regexp compiler context */ - re_token_t *out_token_p) /**< [out] output token */ -{ - re_token_type_t token_type = ((re_compiler_ctx_t *) re_ctx_p)->current_token.type; - out_token_p->qmax = out_token_p->qmin = 1; - ecma_char_t start = LIT_CHAR_UNDEF; - bool is_range = false; - 
parser_ctx_p->num_of_classes = 0; - - const ecma_char_t prev_char = lit_utf8_peek_prev (parser_ctx_p->input_curr_p); - if (prev_char != LIT_CHAR_LEFT_SQUARE && prev_char != LIT_CHAR_CIRCUMFLEX) - { - lit_utf8_decr (&parser_ctx_p->input_curr_p); - lit_utf8_decr (&parser_ctx_p->input_curr_p); - } - - do - { - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string")); - } - - ecma_char_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p); - - if (ch == LIT_CHAR_RIGHT_SQUARE) - { - if (start != LIT_CHAR_UNDEF) - { - append_char_class (re_ctx_p, start, start); - } - break; - } - else if (ch == LIT_CHAR_MINUS) - { - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '-'")); - } - - if (start != LIT_CHAR_UNDEF - && !is_range - && *parser_ctx_p->input_curr_p != LIT_CHAR_RIGHT_SQUARE) - { - is_range = true; - continue; - } - } - else if (ch == LIT_CHAR_BACKSLASH) - { - if (parser_ctx_p->input_curr_p >= parser_ctx_p->input_end_p) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\'")); - } - - ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p); - - if (ch == LIT_CHAR_LOWERCASE_B) - { - ch = LIT_CHAR_BS; - } - else if (ch == LIT_CHAR_LOWERCASE_F) - { - ch = LIT_CHAR_FF; - } - else if (ch == LIT_CHAR_LOWERCASE_N) - { - ch = LIT_CHAR_LF; - } - else if (ch == LIT_CHAR_LOWERCASE_T) - { - ch = LIT_CHAR_TAB; - } - else if (ch == LIT_CHAR_LOWERCASE_R) - { - ch = LIT_CHAR_CR; - } - else if (ch == LIT_CHAR_LOWERCASE_V) - { - ch = LIT_CHAR_VTAB; - } - else if (ch == LIT_CHAR_LOWERCASE_C) - { - if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p) - { - ch = *parser_ctx_p->input_curr_p; - - if ((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END) - || (ch >= 
LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END) - || (ch >= LIT_CHAR_0 && ch <= LIT_CHAR_9)) - { - /* See ECMA-262 v5, 15.10.2.10 (Point 3) */ - ch = (ch % 32); - parser_ctx_p->input_curr_p++; - } - else - { - ch = LIT_CHAR_LOWERCASE_C; - } - } - } - else if (ch == LIT_CHAR_LOWERCASE_X && re_hex_lookup (parser_ctx_p, 2)) - { - ecma_char_t code_unit; - - if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 2, &code_unit)) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\x'")); - } - - parser_ctx_p->input_curr_p += 2; - if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p - && is_range == false - && lit_utf8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS) - { - start = code_unit; - continue; - } - - ch = code_unit; - } - else if (ch == LIT_CHAR_LOWERCASE_U && re_hex_lookup (parser_ctx_p, 4)) - { - ecma_char_t code_unit; - - if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 4, &code_unit)) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\u'")); - } - - parser_ctx_p->input_curr_p += 4; - if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p - && is_range == false - && lit_utf8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS) - { - start = code_unit; - continue; - } - - ch = code_unit; - } - else if (ch == LIT_CHAR_LOWERCASE_D) - { - /* See ECMA-262 v5, 15.10.2.12 */ - append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_BEGIN, LIT_CHAR_ASCII_DIGITS_END); - ch = LIT_CHAR_UNDEF; - } - else if (ch == LIT_CHAR_UPPERCASE_D) - { - /* See ECMA-262 v5, 15.10.2.12 */ - append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_ASCII_DIGITS_BEGIN - 1); - append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_END + 1, LIT_UTF16_CODE_UNIT_MAX); - ch = LIT_CHAR_UNDEF; - } - else if (ch == LIT_CHAR_LOWERCASE_S) - { - /* See ECMA-262 v5, 15.10.2.12 */ - append_char_class (re_ctx_p, LIT_CHAR_TAB, 
LIT_CHAR_CR); - append_char_class (re_ctx_p, LIT_CHAR_SP, LIT_CHAR_SP); - append_char_class (re_ctx_p, LIT_CHAR_NBSP, LIT_CHAR_NBSP); - append_char_class (re_ctx_p, 0x1680UL, 0x1680UL); /* Ogham Space Mark */ - append_char_class (re_ctx_p, 0x180EUL, 0x180EUL); /* Mongolian Vowel Separator */ - append_char_class (re_ctx_p, 0x2000UL, 0x200AUL); /* En Quad - Hair Space */ - append_char_class (re_ctx_p, LIT_CHAR_LS, LIT_CHAR_PS); - append_char_class (re_ctx_p, 0x202FUL, 0x202FUL); /* Narrow No-Break Space */ - append_char_class (re_ctx_p, 0x205FUL, 0x205FUL); /* Medium Mathematical Space */ - append_char_class (re_ctx_p, 0x3000UL, 0x3000UL); /* Ideographic Space */ - append_char_class (re_ctx_p, LIT_CHAR_BOM, LIT_CHAR_BOM); - ch = LIT_CHAR_UNDEF; - } - else if (ch == LIT_CHAR_UPPERCASE_S) - { - /* See ECMA-262 v5, 15.10.2.12 */ - append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_TAB - 1); - append_char_class (re_ctx_p, LIT_CHAR_CR + 1, LIT_CHAR_SP - 1); - append_char_class (re_ctx_p, LIT_CHAR_SP + 1, LIT_CHAR_NBSP - 1); - append_char_class (re_ctx_p, LIT_CHAR_NBSP + 1, 0x167FUL); - append_char_class (re_ctx_p, 0x1681UL, 0x180DUL); - append_char_class (re_ctx_p, 0x180FUL, 0x1FFFUL); - append_char_class (re_ctx_p, 0x200BUL, LIT_CHAR_LS - 1); - append_char_class (re_ctx_p, LIT_CHAR_PS + 1, 0x202EUL); - append_char_class (re_ctx_p, 0x2030UL, 0x205EUL); - append_char_class (re_ctx_p, 0x2060UL, 0x2FFFUL); - append_char_class (re_ctx_p, 0x3001UL, LIT_CHAR_BOM - 1); - append_char_class (re_ctx_p, LIT_CHAR_BOM + 1, LIT_UTF16_CODE_UNIT_MAX); - ch = LIT_CHAR_UNDEF; - } - else if (ch == LIT_CHAR_LOWERCASE_W) - { - /* See ECMA-262 v5, 15.10.2.12 */ - append_char_class (re_ctx_p, LIT_CHAR_0, LIT_CHAR_9); - append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_A, LIT_CHAR_UPPERCASE_Z); - append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE, LIT_CHAR_UNDERSCORE); - append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_A, LIT_CHAR_LOWERCASE_Z); - ch = LIT_CHAR_UNDEF; - } - else if (ch == 
LIT_CHAR_UPPERCASE_W) - { - /* See ECMA-262 v5, 15.10.2.12 */ - append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_0 - 1); - append_char_class (re_ctx_p, LIT_CHAR_9 + 1, LIT_CHAR_UPPERCASE_A - 1); - append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_Z + 1, LIT_CHAR_UNDERSCORE - 1); - append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE + 1, LIT_CHAR_LOWERCASE_A - 1); - append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_Z + 1, LIT_UTF16_CODE_UNIT_MAX); - ch = LIT_CHAR_UNDEF; - } - else if (lit_char_is_octal_digit ((ecma_char_t) ch) - && ch != LIT_CHAR_0) - { - lit_utf8_decr (&parser_ctx_p->input_curr_p); - ch = (ecma_char_t) re_parse_octal (parser_ctx_p); - } - } /* ch == LIT_CHAR_BACKSLASH */ - - if (start != LIT_CHAR_UNDEF) - { - if (is_range) - { - if (start > ch) - { - return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, wrong order")); - } - else - { - append_char_class (re_ctx_p, start, ch); - start = LIT_CHAR_UNDEF; - is_range = false; - } - } - else - { - append_char_class (re_ctx_p, start, start); - start = ch; - } - } - else - { - start = ch; - } - } - while (token_type == RE_TOK_START_CHAR_CLASS || token_type == RE_TOK_START_INV_CHAR_CLASS); - - return re_parse_iterator (parser_ctx_p, out_token_p); -} /* re_parse_char_class */ - /** * Read the input pattern and parse the next token for the RegExp compiler * @@ -730,12 +472,12 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * } else { - if (parser_ctx_p->num_of_groups == -1) + if (parser_ctx_p->groups_count == -1) { re_count_num_of_groups (parser_ctx_p); } - if (parser_ctx_p->num_of_groups) + if (parser_ctx_p->groups_count) { parser_ctx_p->input_curr_p--; uint32_t number = 0; @@ -765,7 +507,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * } while (true); - if ((int) number <= parser_ctx_p->num_of_groups) + if ((int) number <= parser_ctx_p->groups_count) { out_token_p->type = RE_TOK_BACKREFERENCE; } diff --git 
a/jerry-core/parser/regexp/re-parser.h b/jerry-core/parser/regexp/re-parser.h index 2a5c54eef..7e3c2e2c6 100644 --- a/jerry-core/parser/regexp/re-parser.h +++ b/jerry-core/parser/regexp/re-parser.h @@ -94,18 +94,14 @@ typedef struct const lit_utf8_byte_t *input_start_p; /**< start of input pattern */ const lit_utf8_byte_t *input_curr_p; /**< current position in input pattern */ const lit_utf8_byte_t *input_end_p; /**< end of input pattern */ - int num_of_groups; /**< number of groups */ - uint32_t num_of_classes; /**< number of character classes */ + int groups_count; /**< number of groups */ + uint32_t classes_count; /**< number of character classes */ } re_parser_ctx_t; -typedef void (*re_char_class_callback) (void *re_ctx_p, ecma_char_t start, ecma_char_t end); - -ecma_value_t -re_parse_char_class (re_parser_ctx_t *parser_ctx_p, re_char_class_callback append_char_class, void *re_ctx_p, - re_token_t *out_token_p); - -ecma_value_t -re_parse_next_token (re_parser_ctx_t *parser_ctx_p, re_token_t *out_token_p); +bool re_hex_lookup (re_parser_ctx_t *parser_ctx_p, uint32_t lookup); +uint32_t re_parse_octal (re_parser_ctx_t *parser_ctx_p); +ecma_value_t re_parse_iterator (re_parser_ctx_t *parser_ctx_p, re_token_t *re_token_p); +ecma_value_t re_parse_next_token (re_parser_ctx_t *parser_ctx_p, re_token_t *out_token_p); /** * @} diff --git a/tests/jerry/es2015/regexp-lastindex.js b/tests/jerry/es2015/regexp-lastindex.js new file mode 100644 index 000000000..94af81d00 --- /dev/null +++ b/tests/jerry/es2015/regexp-lastindex.js @@ -0,0 +1,20 @@ +// Copyright JS Foundation and other contributors, http://js.foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +var t = new RegExp ("abc","g"); +t.lastIndex = -12; +result = t.exec("abc abc"); +assert(result[0] === "abc"); +assert(result.index === 0); +assert(t.lastIndex === 3); diff --git a/tests/jerry/regression-test-issue-782.js b/tests/jerry/es2015/regression-test-issue-782.js similarity index 100% rename from tests/jerry/regression-test-issue-782.js rename to tests/jerry/es2015/regression-test-issue-782.js diff --git a/tests/jerry/es5.1/regexp-lastindex.js b/tests/jerry/es5.1/regexp-lastindex.js new file mode 100644 index 000000000..b5a758cd8 --- /dev/null +++ b/tests/jerry/es5.1/regexp-lastindex.js @@ -0,0 +1,19 @@ +// Copyright JS Foundation and other contributors, http://js.foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +var t = new RegExp ("abc","g"); +t.lastIndex = -12; +result = t.exec("abc abc"); +assert(!result); +assert(t.lastIndex === 0); diff --git a/tests/jerry/regression-test-issue-312.js b/tests/jerry/es5.1/regression-test-issue-312.js similarity index 100% rename from tests/jerry/regression-test-issue-312.js rename to tests/jerry/es5.1/regression-test-issue-312.js diff --git a/tests/jerry/regexp-assertions.js b/tests/jerry/regexp-assertions.js index dd3a642f6..dcf4e0017 100644 --- a/tests/jerry/regexp-assertions.js +++ b/tests/jerry/regexp-assertions.js @@ -167,9 +167,3 @@ t.lastIndex = "2" var result = t.exec("abc abc"); assert(result[0] === "abc"); assert(result.index === 6); - -t = new RegExp ("abc","g"); -t.lastIndex = -12; -result = t.exec("abc abc"); -assert(!result); -assert(t.lastIndex === 0);