diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-helpers.h b/jerry-core/ecma/builtin-objects/ecma-builtin-helpers.h index 7086d4cae..ea355e8ef 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-helpers.h +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-helpers.h @@ -76,6 +76,7 @@ typedef struct const lit_utf8_byte_t *matched_p; /**< matched string */ lit_utf8_size_t matched_size; /**< matcehd string size */ lit_utf8_size_t match_byte_pos; /**< byte position of the match in the source string */ + ecma_length_t index; /**< current match index */ /** * Capture results diff --git a/jerry-core/ecma/operations/ecma-regexp-object.c b/jerry-core/ecma/operations/ecma-regexp-object.c index f283f98b0..1d27b061e 100644 --- a/jerry-core/ecma/operations/ecma-regexp-object.c +++ b/jerry-core/ecma/operations/ecma-regexp-object.c @@ -207,6 +207,36 @@ ecma_regexp_initialize_props (ecma_object_t *re_object_p, /**< RegExp object */ ecma_make_uint32_value (0)); } /* ecma_regexp_initialize_props */ +#if ENABLED (JERRY_ES2015) +/** + * Helper function to get current code point and advance the string pointer. + * + * @return lit_code_point_t current code point + */ +static lit_code_point_t +ecma_regexp_unicode_advance (const lit_utf8_byte_t **str_p, /**< reference to string pointer */ + const lit_utf8_byte_t *end_p) /**< string end pointer */ +{ + JERRY_ASSERT (str_p != NULL); + const lit_utf8_byte_t *current_p = *str_p; + + lit_code_point_t ch = lit_utf8_read_next (&current_p); + if (lit_is_code_point_utf16_high_surrogate ((ecma_char_t) ch) + && current_p < end_p) + { + const ecma_char_t next_ch = lit_utf8_peek_next (current_p); + if (lit_is_code_point_utf16_low_surrogate (next_ch)) + { + lit_utf8_incr (&current_p); + ch = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch, next_ch); + } + } + + *str_p = current_p; + return ch; +} /* ecma_regexp_unicode_advance */ +#endif /* ENABLED (JERRY_ES2015) */ + /** * RegExp object creation operation. 
* @@ -294,8 +324,8 @@ ecma_op_create_regexp_object (ecma_string_t *pattern_p, /**< input pattern */ * * @return ecma_char_t canonicalized character */ -ecma_char_t -ecma_regexp_canonicalize_char (ecma_char_t ch) /**< character */ +lit_code_point_t +ecma_regexp_canonicalize_char (lit_code_point_t ch) /**< character */ { if (JERRY_LIKELY (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)) { @@ -307,8 +337,16 @@ ecma_regexp_canonicalize_char (ecma_char_t ch) /**< character */ return ch; } +#if ENABLED (JERRY_ES2015) + /* TODO: Implement case folding for code points in the upper planes. */ + if (JERRY_UNLIKELY (ch > LIT_UTF16_CODE_UNIT_MAX)) + { + return ch; + } +#endif /* ENABLED (JERRY_ES2015) */ + ecma_char_t u[LIT_MAXIMUM_OTHER_CASE_LENGTH]; - const ecma_length_t size = lit_char_to_upper_case (ch, u, LIT_MAXIMUM_OTHER_CASE_LENGTH); + const ecma_length_t size = lit_char_to_upper_case ((ecma_char_t) ch, u, LIT_MAXIMUM_OTHER_CASE_LENGTH); /* 3. */ if (size != 1) @@ -334,8 +372,8 @@ ecma_regexp_canonicalize_char (ecma_char_t ch) /**< character */ * * @return ecma_char_t canonicalized character */ -inline ecma_char_t JERRY_ATTR_ALWAYS_INLINE -ecma_regexp_canonicalize (ecma_char_t ch, /**< character */ +inline lit_code_point_t JERRY_ATTR_ALWAYS_INLINE +ecma_regexp_canonicalize (lit_code_point_t ch, /**< character */ bool is_ignorecase) /**< IgnoreCase flag */ { if (is_ignorecase) @@ -386,8 +424,24 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ } const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE; - ecma_char_t ch1 = (ecma_char_t) re_get_char (&bc_p); /* Already canonicalized. */ - ecma_char_t ch2 = ecma_regexp_canonicalize (lit_utf8_read_next (&str_curr_p), is_ignorecase); + lit_code_point_t ch1 = re_get_char (&bc_p); /* Already canonicalized. 
*/ + lit_code_point_t ch2 = lit_utf8_read_next (&str_curr_p); + +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE + && lit_is_code_point_utf16_high_surrogate (ch2) + && str_curr_p < re_ctx_p->input_end_p) + { + const ecma_char_t next_ch = lit_utf8_peek_next (str_curr_p); + if (lit_is_code_point_utf16_low_surrogate (next_ch)) + { + lit_utf8_incr (&str_curr_p); + ch2 = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch2, next_ch); + } + } +#endif /* ENABLED (JERRY_ES2015) */ + + ch2 = ecma_regexp_canonicalize (ch2, is_ignorecase); JERRY_TRACE_MSG ("Character matching %d to %d: ", ch1, ch2); if (ch1 != ch2) @@ -415,6 +469,19 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ return NULL; /* fail */ } +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE + && lit_is_code_point_utf16_high_surrogate (ch) + && str_curr_p < re_ctx_p->input_end_p) + { + const ecma_char_t next_ch = lit_utf8_peek_next (str_curr_p); + if (lit_is_code_point_utf16_low_surrogate (next_ch)) + { + lit_utf8_incr (&str_curr_p); + } + } +#endif /* ENABLED (JERRY_ES2015) */ + JERRY_TRACE_MSG ("match\n"); break; /* tail merge */ } @@ -559,30 +626,63 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ return NULL; /* fail */ } - const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE; - const ecma_char_t curr_ch = ecma_regexp_canonicalize (lit_utf8_read_next (&str_curr_p), is_ignorecase); - uint32_t range_count = re_get_value (&bc_p); + const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE; bool is_match = false; - while (range_count-- > 0) +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE) { - const ecma_char_t ch1 = re_get_char (&bc_p); - if (curr_ch < ch1) - { - bc_p += sizeof (ecma_char_t); - continue; - } + lit_code_point_t curr_ch = ecma_regexp_unicode_advance (&str_curr_p, + re_ctx_p->input_end_p); + curr_ch = ecma_regexp_canonicalize (curr_ch, is_ignorecase); 
- const ecma_char_t ch2 = re_get_char (&bc_p); - is_match = (curr_ch <= ch2); - if (is_match) + while (range_count-- > 0) { - /* Skip the remaining ranges in the bytecode. */ - bc_p += range_count * 2 * sizeof (ecma_char_t); - break; + const lit_code_point_t ch1 = re_get_value (&bc_p); + if (curr_ch < ch1) + { + bc_p += sizeof (uint32_t); + continue; + } + + const lit_code_point_t ch2 = re_get_value (&bc_p); + is_match = (curr_ch <= ch2); + if (is_match) + { + /* Skip the remaining ranges in the bytecode. */ + bc_p += range_count * 2 * sizeof (uint32_t); + break; + } } } + else + { +#endif /* ENABLED (JERRY_ES2015) */ + const ecma_char_t curr_ch = (ecma_char_t) ecma_regexp_canonicalize (lit_utf8_read_next (&str_curr_p), + is_ignorecase); + + while (range_count-- > 0) + { + const ecma_char_t ch1 = re_get_char (&bc_p); + if (curr_ch < ch1) + { + bc_p += sizeof (ecma_char_t); + continue; + } + + const ecma_char_t ch2 = re_get_char (&bc_p); + is_match = (curr_ch <= ch2); + if (is_match) + { + /* Skip the remaining ranges in the bytecode. 
*/ + bc_p += range_count * 2 * sizeof (ecma_char_t); + break; + } + } +#if ENABLED (JERRY_ES2015) + } +#endif /* ENABLED (JERRY_ES2015) */ JERRY_ASSERT (op == RE_OP_CHAR_CLASS || op == RE_OP_INV_CHAR_CLASS); @@ -1202,7 +1302,7 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */ const lit_utf8_byte_t *input_curr_p = input_buffer_p; uint32_t index = 0; - if (re_ctx.flags & RE_FLAG_GLOBAL) + if (re_ctx.flags & (RE_FLAG_GLOBAL | RE_FLAG_STICKY)) { ecma_string_t *lastindex_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL); ecma_value_t lastindex_value = ecma_op_object_get_own_data_prop (regexp_object_p, lastindex_str_p); @@ -1270,10 +1370,7 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */ uint8_t *bc_start_p = (uint8_t *) (bc_p + 1); const lit_utf8_byte_t *matched_p = NULL; - JERRY_TRACE_MSG ("Exec with flags [global: %d, ignoreCase: %d, multiline: %d]\n", - re_ctx.flags & RE_FLAG_GLOBAL, - re_ctx.flags & RE_FLAG_IGNORE_CASE, - re_ctx.flags & RE_FLAG_MULTILINE); + JERRY_TRACE_MSG ("Exec with flags [%x]\n", re_ctx.flags); JERRY_ASSERT (index <= input_length); while (true) @@ -1285,8 +1382,26 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */ break; } - index++; - if (index > input_length) +#if ENABLED (JERRY_ES2015) + if (re_ctx.flags & RE_FLAG_STICKY) + { + ecma_value_t put_result = ecma_op_object_put (regexp_object_p, + ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL), + ecma_make_uint32_value (0), + true); + if (ECMA_IS_VALUE_ERROR (put_result)) + { + ret_value = put_result; + goto cleanup_context; + } + + JERRY_ASSERT (ecma_is_value_boolean (put_result)); + ret_value = ECMA_VALUE_NULL; + goto cleanup_context; + } +#endif /* ENABLED (JERRY_ES2015) */ + + if (input_curr_p >= input_end_p) { if (re_ctx.flags & RE_FLAG_GLOBAL) { @@ -1309,6 +1424,24 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */ } JERRY_ASSERT (input_curr_p < input_end_p); + +#if 
ENABLED (JERRY_ES2015) + if (re_ctx.flags & RE_FLAG_UNICODE) + { + index++; + const lit_code_point_t cp = ecma_regexp_unicode_advance (&input_curr_p, + input_end_p); + + if (cp > LIT_UTF16_CODE_UNIT_MAX) + { + index++; + } + + continue; + } +#endif /* ENABLED (JERRY_ES2015) */ + + index++; lit_utf8_incr (&input_curr_p); } @@ -1320,7 +1453,7 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */ goto cleanup_context; } - if (re_ctx.flags & RE_FLAG_GLOBAL) + if (re_ctx.flags & (RE_FLAG_GLOBAL | RE_FLAG_STICKY)) { JERRY_ASSERT (index <= input_length); @@ -1417,7 +1550,40 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**header.status_flags; + uint8_t string_flags = ECMA_STRING_FLAG_IS_ASCII; + lit_utf8_size_t string_length; + ctx_p->string_p = ecma_string_get_chars (string_p, + &(ctx_p->string_size), + &string_length, + NULL, + &string_flags); + const lit_utf8_byte_t *const string_end_p = ctx_p->string_p + ctx_p->string_size; + const uint8_t *const bc_start_p = (const uint8_t *) (bc_p + 1); + const lit_utf8_byte_t *matched_p = NULL; + const lit_utf8_byte_t *current_p = ctx_p->string_p; + const lit_utf8_byte_t *last_append_p = current_p; + JERRY_ASSERT (ctx_p->index <= string_length); + +#if ENABLED (JERRY_ES2015) + /* Global matches always start at index 0, but Sticky matches may have a non-zero lastIndex. 
*/ + if (ctx_p->index > 0) + { + if (string_flags & ECMA_STRING_FLAG_IS_ASCII) + { + current_p += ctx_p->index; + } + else + { + ecma_length_t index = ctx_p->index; + while (index--) + { + lit_utf8_incr (&current_p); + } + } + } +#endif /* ENABLED (JERRY_ES2015) */ + ecma_regexp_initialize_context (&re_ctx, bc_p, ctx_p->string_p, @@ -1427,12 +1593,6 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**capture_count = re_ctx.captures_count; ctx_p->u.captures_p = re_ctx.captures_p; - const uint8_t *const bc_start_p = (const uint8_t *) (bc_p + 1); - const lit_utf8_byte_t *matched_p = NULL; - const lit_utf8_byte_t *current_p = ctx_p->string_p; - const lit_utf8_byte_t *last_append_p = current_p; - uint32_t index = 0; - while (true) { memset (re_ctx.captures_p, 0, re_ctx.captures_count); @@ -1443,7 +1603,7 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**index)); ecma_ref_ecma_string (string_p); ecma_collection_push_back (arguments_p, ecma_make_string_value (string_p)); ecma_object_t *function_p = ecma_get_object_from_value (replace_arg); @@ -1508,9 +1668,9 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**end_p - global_capture_p->begin_p); - if (matched_size > 1) + if (matched_size > 0) { - index += lit_utf8_string_length (current_p, matched_size); + ctx_p->index += lit_utf8_string_length (current_p, matched_size); current_p = last_append_p; continue; } @@ -1521,7 +1681,23 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**index++; + const lit_code_point_t cp = ecma_regexp_unicode_advance (&current_p, + string_end_p); + + if (cp > LIT_UTF16_CODE_UNIT_MAX) + { + ctx_p->index++; + } + + continue; + } +#endif /* ENABLED (JERRY_ES2015) */ + + ctx_p->index++; lit_utf8_incr (&current_p); } @@ -1537,6 +1713,11 @@ cleanup_builder: cleanup_context: ecma_regexp_cleanup_context (&re_ctx); + if (string_flags & ECMA_STRING_FLAG_MUST_BE_FREED) + { + jmem_heap_free_block ((void *) ctx_p->string_p, ctx_p->string_size); + } + 
return result; } /* ecma_regexp_replace_helper_fast */ @@ -1564,6 +1745,7 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ ecma_object_t *this_obj_p = ecma_get_object_from_value (this_arg); ecma_replace_context_t replace_ctx; + replace_ctx.index = 0; /* 3. */ ecma_string_t *string_p = ecma_op_to_string (string_arg); @@ -1572,14 +1754,6 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ return ECMA_VALUE_ERROR; } - lit_utf8_size_t string_length; - uint8_t string_flags = ECMA_STRING_FLAG_IS_ASCII; - replace_ctx.string_p = ecma_string_get_chars (string_p, - &(replace_ctx.string_size), - &string_length, - NULL, - &string_flags); - ecma_value_t result = ECMA_VALUE_ERROR; /* 6. */ @@ -1604,9 +1778,25 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ const bool global = ecma_op_to_boolean (result); ecma_free_value (result); +#if ENABLED (JERRY_ES2015) + const lit_utf8_size_t string_length = ecma_string_get_length (string_p); + bool unicode = false; +#endif /* ENABLED (JERRY_ES2015) */ + /* 10. 
*/ if (global) { +#if ENABLED (JERRY_ES2015) + result = ecma_op_object_get_by_magic_id (this_obj_p, LIT_MAGIC_STRING_UNICODE); + if (ECMA_IS_VALUE_ERROR (result)) + { + goto cleanup_replace; + } + + unicode = ecma_op_to_boolean (result); + ecma_free_value (result); +#endif /* ENABLED (JERRY_ES2015) */ + result = ecma_op_object_put (this_obj_p, ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL), ecma_make_uint32_value (0), @@ -1661,6 +1851,44 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ && !ecma_builtin_is (this_obj_p, ECMA_BUILTIN_ID_REGEXP_PROTOTYPE) && ecma_builtin_is_regexp_exec (function_p)) { + result = ecma_op_object_get_by_magic_id (this_obj_p, LIT_MAGIC_STRING_STICKY); + if (ECMA_IS_VALUE_ERROR (result)) + { + goto cleanup_replace; + } + + const bool sticky = ecma_op_to_boolean (result); + ecma_free_value (result); + + if (sticky && !global) + { + ecma_string_t *lastindex_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL); + ecma_value_t lastindex_value = ecma_op_object_get_own_data_prop (this_obj_p, lastindex_str_p); + + result = ecma_op_to_length (lastindex_value, &replace_ctx.index); + ecma_free_value (lastindex_value); + + if (ECMA_IS_VALUE_ERROR (result)) + { + goto cleanup_replace; + } + + if (replace_ctx.index > string_length) + { + ecma_deref_object ((ecma_object_t *) function_p); + + result = ecma_op_object_put (this_obj_p, + ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL), + ecma_make_uint32_value (0), + true); + JERRY_ASSERT (ecma_is_value_true (result)); + + ecma_ref_ecma_string (string_p); + result = ecma_make_string_value (string_p); + goto cleanup_replace; + } + } + ecma_extended_object_t *re_obj_p = (ecma_extended_object_t *) this_obj_p; const re_compiled_code_t *bc_p = ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t, re_obj_p->u.class_prop.u.value); @@ -1756,8 +1984,8 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ goto cleanup_results; } - uint32_t 
length; - if (ECMA_IS_VALUE_ERROR (ecma_op_to_length (result, &length))) + uint32_t index; + if (ECMA_IS_VALUE_ERROR (ecma_op_to_length (result, &index))) { ecma_free_value (result); result = ECMA_VALUE_ERROR; @@ -1766,10 +1994,12 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ ecma_free_value (result); + index = ecma_op_advance_string_index (string_p, index, unicode); + /* 10.d.iii.3.c */ result = ecma_op_object_put (this_obj_p, ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL), - ecma_make_uint32_value (length + 1), + ecma_make_uint32_value (index), true); if (ECMA_IS_VALUE_ERROR (result)) @@ -1788,6 +2018,13 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ } } + uint8_t string_flags = ECMA_STRING_FLAG_IS_ASCII; + replace_ctx.string_p = ecma_string_get_chars (string_p, + &(replace_ctx.string_size), + NULL, + NULL, + &string_flags); + /* 14. */ replace_ctx.builder = ecma_stringbuilder_create (); replace_ctx.matched_p = NULL; @@ -1795,8 +2032,6 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ /* 15. */ const lit_utf8_byte_t *source_position_p = replace_ctx.string_p; - lit_utf8_size_t source_index = 0; - const lit_utf8_byte_t *const string_end_p = replace_ctx.string_p + replace_ctx.string_size; /* 16. 
*/ @@ -1893,7 +2128,7 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ n++; } - const bool should_replace = (position >= source_index); + const bool should_replace = (position >= replace_ctx.index); /* 16.p */ if (should_replace) { @@ -1908,7 +2143,7 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ else { match_position_p = source_position_p; - lit_utf8_size_t distance = position - source_index; + lit_utf8_size_t distance = position - replace_ctx.index; while (distance--) { lit_utf8_incr (&match_position_p); @@ -1921,7 +2156,7 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ replace_ctx.match_byte_pos = (lit_utf8_size_t) (match_position_p - replace_ctx.string_p); source_position_p = JERRY_MIN (match_position_p + matched_str_size, string_end_p); - source_index = JERRY_MIN (position + matched_str_length, string_length); + replace_ctx.index = JERRY_MIN (position + matched_str_length, string_length); } /* 16.m */ @@ -1979,7 +2214,7 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ } /* 18. 
*/ - JERRY_ASSERT (source_index <= string_length); + JERRY_ASSERT (replace_ctx.index <= string_length); ecma_stringbuilder_append_raw (&(replace_ctx.builder), source_position_p, (lit_utf8_size_t) (string_end_p - source_position_p)); @@ -1990,6 +2225,11 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */ cleanup_builder: ecma_stringbuilder_destroy (&replace_ctx.builder); + if (string_flags & ECMA_STRING_FLAG_MUST_BE_FREED) + { + jmem_heap_free_block ((void *) replace_ctx.string_p, replace_ctx.string_size); + } + cleanup_results: ecma_collection_free (results_p); #endif /* !ENABLED (JERRY_ES2015) */ @@ -2001,11 +2241,6 @@ cleanup_replace: } cleanup_string: - if (string_flags & ECMA_STRING_FLAG_MUST_BE_FREED) - { - jmem_heap_free_block ((void *) replace_ctx.string_p, replace_ctx.string_size); - } - ecma_deref_ecma_string (string_p); return result; diff --git a/jerry-core/ecma/operations/ecma-regexp-object.h b/jerry-core/ecma/operations/ecma-regexp-object.h index 4edce41fd..1b5c023aa 100644 --- a/jerry-core/ecma/operations/ecma-regexp-object.h +++ b/jerry-core/ecma/operations/ecma-regexp-object.h @@ -101,8 +101,8 @@ ecma_value_t ecma_op_create_regexp_object_from_bytecode (re_compiled_code_t *byt ecma_value_t ecma_op_create_regexp_object (ecma_string_t *pattern_p, uint16_t flags); ecma_value_t ecma_regexp_exec_helper (ecma_value_t regexp_value, ecma_value_t input_string, bool ignore_global); ecma_string_t *ecma_regexp_read_pattern_str_helper (ecma_value_t pattern_arg); -ecma_char_t ecma_regexp_canonicalize (ecma_char_t ch, bool is_ignorecase); -ecma_char_t ecma_regexp_canonicalize_char (ecma_char_t ch); +lit_code_point_t ecma_regexp_canonicalize (lit_code_point_t ch, bool is_ignorecase); +lit_code_point_t ecma_regexp_canonicalize_char (lit_code_point_t ch); ecma_value_t ecma_regexp_parse_flags (ecma_string_t *flags_str_p, uint16_t *flags_p); void ecma_regexp_initialize_props (ecma_object_t *re_obj_p, ecma_string_t *source_p, uint16_t flags); diff 
--git a/jerry-core/lit/lit-magic-strings.inc.h b/jerry-core/lit/lit-magic-strings.inc.h index 520331cfe..4e18cfb2e 100644 --- a/jerry-core/lit/lit-magic-strings.inc.h +++ b/jerry-core/lit/lit-magic-strings.inc.h @@ -338,7 +338,8 @@ LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_SOURCE, "source") #if ENABLED (JERRY_BUILTIN_ARRAY) LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_SPLICE, "splice") #endif -#if ENABLED (JERRY_BUILTIN_REGEXP) && ENABLED (JERRY_ES2015) +#if ENABLED (JERRY_BUILTIN_REGEXP) && ENABLED (JERRY_ES2015) \ +|| ENABLED (JERRY_BUILTIN_REGEXP) && !( !ENABLED (JERRY_ES2015)) LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_STICKY, "sticky") #endif LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_STRING, "string") diff --git a/jerry-core/parser/js/js-lexer.c b/jerry-core/parser/js/js-lexer.c index d57a9224f..e3b09ac69 100644 --- a/jerry-core/parser/js/js-lexer.c +++ b/jerry-core/parser/js/js-lexer.c @@ -2317,6 +2317,14 @@ lexer_construct_regexp_object (parser_context_t *context_p, /**< context */ { flag = RE_FLAG_MULTILINE; } + else if (source_p[0] == LIT_CHAR_LOWERCASE_U) + { + flag = RE_FLAG_UNICODE; + } + else if (source_p[0] == LIT_CHAR_LOWERCASE_Y) + { + flag = RE_FLAG_STICKY; + } if (flag == 0) { diff --git a/jerry-core/parser/regexp/re-bytecode.c b/jerry-core/parser/regexp/re-bytecode.c index 9b547c66c..1722f0c2b 100644 --- a/jerry-core/parser/regexp/re-bytecode.c +++ b/jerry-core/parser/regexp/re-bytecode.c @@ -15,6 +15,7 @@ #include "ecma-globals.h" #include "re-bytecode.h" +#include "ecma-regexp-object.h" #if ENABLED (JERRY_BUILTIN_REGEXP) @@ -455,8 +456,16 @@ re_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */ JERRY_DEBUG_MSG ("%d", num_of_class); while (num_of_class) { - JERRY_DEBUG_MSG (" %d", re_get_char (&bytecode_p)); - JERRY_DEBUG_MSG ("-%d", re_get_char (&bytecode_p)); + if ((compiled_code_p->header.status_flags & RE_FLAG_UNICODE) != 0) + { + JERRY_DEBUG_MSG (" %u", re_get_value (&bytecode_p)); + JERRY_DEBUG_MSG ("-%u", re_get_value 
(&bytecode_p)); + } + else + { + JERRY_DEBUG_MSG (" %u", re_get_char (&bytecode_p)); + JERRY_DEBUG_MSG ("-%u", re_get_char (&bytecode_p)); + } num_of_class--; } JERRY_DEBUG_MSG (", "); diff --git a/jerry-core/parser/regexp/re-compiler.c b/jerry-core/parser/regexp/re-compiler.c index 45f89c519..b2134ac54 100644 --- a/jerry-core/parser/regexp/re-compiler.c +++ b/jerry-core/parser/regexp/re-compiler.c @@ -226,12 +226,29 @@ re_insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compile */ static void re_append_char_class (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - ecma_char_t start, /**< character class range from */ - ecma_char_t end) /**< character class range to */ + lit_code_point_t start, /**< character class range from */ + lit_code_point_t end) /**< character class range to */ { - re_append_char (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (start, re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); - re_append_char (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (end, re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); re_ctx_p->parser_ctx_p->classes_count++; + +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE) + { + re_append_u32 (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (start, re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); + re_append_u32 (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (end, re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); + return; + } +#endif /* ENABLED (JERRY_ES2015) */ + + JERRY_ASSERT (start <= LIT_UTF16_CODE_UNIT_MAX); + JERRY_ASSERT (end <= LIT_UTF16_CODE_UNIT_MAX); + + re_append_char (re_ctx_p->bytecode_ctx_p, + (ecma_char_t) ecma_regexp_canonicalize (start, + re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); + re_append_char (re_ctx_p->bytecode_ctx_p, + (ecma_char_t) ecma_regexp_canonicalize (end, + re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); } /* re_append_char_class */ /** @@ -250,7 +267,7 @@ re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */ out_token_p->qmax = 
out_token_p->qmin = 1; parser_ctx_p->classes_count = 0; - ecma_char_t start = LIT_CHAR_UNDEF; + lit_code_point_t start = LIT_CHAR_UNDEF; bool is_range = false; const bool is_char_class = (re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS || re_ctx_p->current_token.type == RE_TOK_START_INV_CHAR_CLASS); @@ -269,7 +286,7 @@ re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */ return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string")); } - ecma_char_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p); + lit_code_point_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p); if (ch == LIT_CHAR_RIGHT_SQUARE) { @@ -459,6 +476,20 @@ re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */ } } /* ch == LIT_CHAR_BACKSLASH */ +#if ENABLED (JERRY_ES2015) + if (re_ctx_p->flags & RE_FLAG_UNICODE + && lit_is_code_point_utf16_high_surrogate (ch) + && parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p) + { + const ecma_char_t next_ch = lit_utf8_peek_next (parser_ctx_p->input_curr_p); + if (lit_is_code_point_utf16_low_surrogate (next_ch)) + { + ch = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch, next_ch); + lit_utf8_incr (&parser_ctx_p->input_curr_p); + } + } +#endif /* ENABLED (JERRY_ES2015) */ + if (start != LIT_CHAR_UNDEF) { if (is_range) @@ -559,8 +590,8 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context (unsigned int) re_ctx_p->current_token.qmax); re_append_opcode (bc_ctx_p, RE_OP_CHAR); - re_append_char (bc_ctx_p, ecma_regexp_canonicalize ((ecma_char_t) re_ctx_p->current_token.value, - re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); + re_append_char (bc_ctx_p, (ecma_char_t) ecma_regexp_canonicalize ((ecma_char_t) re_ctx_p->current_token.value, + re_ctx_p->flags & RE_FLAG_IGNORE_CASE)); ret_value = re_insert_simple_iterator (re_ctx_p, new_atom_start_offset); break; diff --git a/tests/jerry/es2015/regexp-flags.js b/tests/jerry/es2015/regexp-flags.js new 
file mode 100644 index 000000000..9a2eb90a6 --- /dev/null +++ b/tests/jerry/es2015/regexp-flags.js @@ -0,0 +1,62 @@ +// Copyright JS Foundation and other contributors, http://js.foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +var r = /[𐲡-𐲱𐲟]/u; + +var m = r.exec("𐲬"); +assert(m !== null); +assert(m[0] === "𐲬"); + +r = /[𐲡E]/ug; +assert (r.exec("E𐲡E")[0] === 'E'); +assert (r.exec("E𐲡E")[0] === '𐲡'); +assert (r.exec("E𐲡E")[0] === 'E'); + +try { + eval("/[𐲡-𐲱𐲟]/"); + assert (false); +} catch (e) { + assert (e instanceof SyntaxError); +} + +assert (/\udc96/.exec("\ud803\udc96")[0] === '\udc96'); +assert (/\udc96/u.exec("\ud803\udc96") === null); +assert (/\udc96/u.exec("\udc96")[0] === '\udc96'); + +assert (/\ud803/.exec("\ud803\udc96")[0] === '\ud803'); +assert (/\ud803/u.exec("\ud803\udc96") === null); +assert (/\ud803/u.exec("\ud803")[0] === '\ud803'); + +assert (/./u.exec("\ud803\udc96")[0] === '𐲖'); +assert (/./.exec("\ud803\udc96")[0] === '\ud803'); +assert (/./u.exec("\ud803\ud803")[0] === '\ud803'); +assert (/./u.exec("\udc96\udc96")[0] === '\udc96'); +assert (/./u.exec("\ud803")[0] === '\ud803'); + +var r = /abc/y; +m = r.exec ("strabcstr"); +assert (m === null); + +r.lastIndex = 3; +m = r.exec ("strabcstr"); +assert (m[0] === "abc"); +assert (r.lastIndex === 6); + +m = r.exec ("strabcstr"); +assert (m === null); +assert (r.lastIndex === 0); + +var r = /abc/yg; +m = r.exec ("strabcstr"); +assert (m === null); diff --git 
a/tests/jerry/es2015/regexp-routines.js b/tests/jerry/es2015/regexp-routines.js index 268ec5526..7a750ba1f 100644 --- a/tests/jerry/es2015/regexp-routines.js +++ b/tests/jerry/es2015/regexp-routines.js @@ -84,3 +84,63 @@ try { } catch (e) { assert (e === "abrupt flags toString"); } + +var o = { + global: true, + source: "str" +} + +Object.defineProperty(o, 'unicode', { 'get': function () {throw "abrupt unicode get"; }}); +try { + RegExp.prototype[Symbol.match].call(o, "str"); + assert (false); +} catch (e) { + assert (e === "abrupt unicode get"); +} + +assert ("str𐲡fgh".replace(/(?:)/gu, "x") === 'xsxtxrx𐲡xfxgxhx'); +assert ("str𐲡fgh".replace(/(?:)/g, "x") === 'xsxtxrx\ud803x\udca1xfxgxhx'); + +r = /(?:)/gu; +/* Disable fast path. */ +r.exec = function (s) { return RegExp.prototype.exec.call(this, s); }; + +assert ("str𐲡fgh".replace(r, "x") === 'xsxtxrx𐲡xfxgxhx'); +Object.defineProperty(r, 'unicode', {value: false}); +assert ("str𐲡fgh".replace(r, "x") === 'xsxtxrx\ud803x\udca1xfxgxhx'); + +r = /(?:)/gu; +assert (RegExp.prototype[Symbol.match].call(r, "str𐲡fgh").length === 8); +Object.defineProperty(r, 'unicode', {value: false}); +assert (RegExp.prototype[Symbol.match].call(r, "str𐲡fgh").length === 9); + +r = /(?:)/gy; +r.lastIndex = 2; +assert ("asd".replace(r, "x") === "xaxsxdx"); +assert (r.lastIndex === 0); + +r.lastIndex = 5; +assert ("asd".replace(r, "x") === "xaxsxdx"); +assert (r.lastIndex === 0); + +r = /(?:)/y; +r.lastIndex = 2; +assert ("asd".replace(r, "x") === "asxd"); +assert (r.lastIndex === 2); + +r.lastIndex = 5; +assert ("asd".replace(r, "x") === "asd"); +assert (r.lastIndex === 0); + +r.lastIndex = 2; +/* Disable fast path. 
*/ +r.exec = function (s) { return RegExp.prototype.exec.call(this, s); }; +assert ("asd".replace(r, "x") === "asxd"); +assert (r.lastIndex === 2); + +r.lastIndex = 5; +assert ("asd".replace(r, "x") === "asd"); +assert (r.lastIndex === 0); + +assert (RegExp.prototype[Symbol.match].call(/a/y, "aaa").length === 1); +assert (RegExp.prototype[Symbol.match].call(/a/gy, "aaa").length === 3);