Implement RegExp unicode and sticky flags (#3379)

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai dbatyai@inf.u-szeged.hu
This commit is contained in:
Dániel Bátyai 2019-11-29 14:08:30 +01:00 committed by GitHub
parent 8956eff2bd
commit 35c0a6e299
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 484 additions and 77 deletions

View File

@ -76,6 +76,7 @@ typedef struct
const lit_utf8_byte_t *matched_p; /**< matched string */
lit_utf8_size_t matched_size; /**< matcehd string size */
lit_utf8_size_t match_byte_pos; /**< byte position of the match in the source string */
ecma_length_t index; /**< current match index */
/**
* Capture results

View File

@ -207,6 +207,36 @@ ecma_regexp_initialize_props (ecma_object_t *re_object_p, /**< RegExp object */
ecma_make_uint32_value (0));
} /* ecma_regexp_initialize_props */
#if ENABLED (JERRY_ES2015)
/**
* Helper function to get current code point and advance the string pointer.
*
* @return lit_code_point_t current code point
*/
static lit_code_point_t
ecma_regexp_unicode_advance (const lit_utf8_byte_t **str_p, /**< reference to string pointer */
const lit_utf8_byte_t *end_p) /**< string end pointer */
{
JERRY_ASSERT (str_p != NULL);
const lit_utf8_byte_t *current_p = *str_p;
lit_code_point_t ch = lit_utf8_read_next (&current_p);
if (lit_is_code_point_utf16_high_surrogate ((ecma_char_t) ch)
&& current_p < end_p)
{
const ecma_char_t next_ch = lit_utf8_peek_next (current_p);
if (lit_is_code_point_utf16_low_surrogate (next_ch))
{
lit_utf8_incr (&current_p);
ch = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch, next_ch);
}
}
*str_p = current_p;
return ch;
} /* ecma_regexp_unicode_advance */
#endif /* ENABLED (JERRY_ES2015) */
/**
* RegExp object creation operation.
*
@ -294,8 +324,8 @@ ecma_op_create_regexp_object (ecma_string_t *pattern_p, /**< input pattern */
*
* @return ecma_char_t canonicalized character
*/
ecma_char_t
ecma_regexp_canonicalize_char (ecma_char_t ch) /**< character */
lit_code_point_t
ecma_regexp_canonicalize_char (lit_code_point_t ch) /**< character */
{
if (JERRY_LIKELY (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX))
{
@ -307,8 +337,16 @@ ecma_regexp_canonicalize_char (ecma_char_t ch) /**< character */
return ch;
}
#if ENABLED (JERRY_ES2015)
/* TODO: Implement case folding for code points in the upper planes. */
if (JERRY_UNLIKELY (ch > LIT_UTF16_CODE_UNIT_MAX))
{
return ch;
}
#endif /* ENABLED (JERRY_ES2015) */
ecma_char_t u[LIT_MAXIMUM_OTHER_CASE_LENGTH];
const ecma_length_t size = lit_char_to_upper_case (ch, u, LIT_MAXIMUM_OTHER_CASE_LENGTH);
const ecma_length_t size = lit_char_to_upper_case ((ecma_char_t) ch, u, LIT_MAXIMUM_OTHER_CASE_LENGTH);
/* 3. */
if (size != 1)
@ -334,8 +372,8 @@ ecma_regexp_canonicalize_char (ecma_char_t ch) /**< character */
*
* @return ecma_char_t canonicalized character
*/
inline ecma_char_t JERRY_ATTR_ALWAYS_INLINE
ecma_regexp_canonicalize (ecma_char_t ch, /**< character */
inline lit_code_point_t JERRY_ATTR_ALWAYS_INLINE
ecma_regexp_canonicalize (lit_code_point_t ch, /**< character */
bool is_ignorecase) /**< IgnoreCase flag */
{
if (is_ignorecase)
@ -386,8 +424,24 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
}
const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE;
ecma_char_t ch1 = (ecma_char_t) re_get_char (&bc_p); /* Already canonicalized. */
ecma_char_t ch2 = ecma_regexp_canonicalize (lit_utf8_read_next (&str_curr_p), is_ignorecase);
lit_code_point_t ch1 = re_get_char (&bc_p); /* Already canonicalized. */
lit_code_point_t ch2 = lit_utf8_read_next (&str_curr_p);
#if ENABLED (JERRY_ES2015)
if (re_ctx_p->flags & RE_FLAG_UNICODE
&& lit_is_code_point_utf16_high_surrogate (ch2)
&& str_curr_p < re_ctx_p->input_end_p)
{
const ecma_char_t next_ch = lit_utf8_peek_next (str_curr_p);
if (lit_is_code_point_utf16_low_surrogate (next_ch))
{
lit_utf8_incr (&str_curr_p);
ch2 = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch2, next_ch);
}
}
#endif /* ENABLED (JERRY_ES2015) */
ch2 = ecma_regexp_canonicalize (ch2, is_ignorecase);
JERRY_TRACE_MSG ("Character matching %d to %d: ", ch1, ch2);
if (ch1 != ch2)
@ -415,6 +469,19 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
return NULL; /* fail */
}
#if ENABLED (JERRY_ES2015)
if (re_ctx_p->flags & RE_FLAG_UNICODE
&& lit_is_code_point_utf16_high_surrogate (ch)
&& str_curr_p < re_ctx_p->input_end_p)
{
const ecma_char_t next_ch = lit_utf8_peek_next (str_curr_p);
if (lit_is_code_point_utf16_low_surrogate (next_ch))
{
lit_utf8_incr (&str_curr_p);
}
}
#endif /* ENABLED (JERRY_ES2015) */
JERRY_TRACE_MSG ("match\n");
break; /* tail merge */
}
@ -559,30 +626,63 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */
return NULL; /* fail */
}
const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE;
const ecma_char_t curr_ch = ecma_regexp_canonicalize (lit_utf8_read_next (&str_curr_p), is_ignorecase);
uint32_t range_count = re_get_value (&bc_p);
const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE;
bool is_match = false;
while (range_count-- > 0)
#if ENABLED (JERRY_ES2015)
if (re_ctx_p->flags & RE_FLAG_UNICODE)
{
const ecma_char_t ch1 = re_get_char (&bc_p);
if (curr_ch < ch1)
{
bc_p += sizeof (ecma_char_t);
continue;
}
lit_code_point_t curr_ch = ecma_regexp_unicode_advance (&str_curr_p,
re_ctx_p->input_end_p);
curr_ch = ecma_regexp_canonicalize (curr_ch, is_ignorecase);
const ecma_char_t ch2 = re_get_char (&bc_p);
is_match = (curr_ch <= ch2);
if (is_match)
while (range_count-- > 0)
{
/* Skip the remaining ranges in the bytecode. */
bc_p += range_count * 2 * sizeof (ecma_char_t);
break;
const lit_code_point_t ch1 = re_get_value (&bc_p);
if (curr_ch < ch1)
{
bc_p += sizeof (uint32_t);
continue;
}
const lit_code_point_t ch2 = re_get_value (&bc_p);
is_match = (curr_ch <= ch2);
if (is_match)
{
/* Skip the remaining ranges in the bytecode. */
bc_p += range_count * 2 * sizeof (uint32_t);
break;
}
}
}
else
{
#endif /* ENABLED (JERRY_ES2015) */
const ecma_char_t curr_ch = (ecma_char_t) ecma_regexp_canonicalize (lit_utf8_read_next (&str_curr_p),
is_ignorecase);
while (range_count-- > 0)
{
const ecma_char_t ch1 = re_get_char (&bc_p);
if (curr_ch < ch1)
{
bc_p += sizeof (ecma_char_t);
continue;
}
const ecma_char_t ch2 = re_get_char (&bc_p);
is_match = (curr_ch <= ch2);
if (is_match)
{
/* Skip the remaining ranges in the bytecode. */
bc_p += range_count * 2 * sizeof (ecma_char_t);
break;
}
}
#if ENABLED (JERRY_ES2015)
}
#endif /* ENABLED (JERRY_ES2015) */
JERRY_ASSERT (op == RE_OP_CHAR_CLASS || op == RE_OP_INV_CHAR_CLASS);
@ -1202,7 +1302,7 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */
const lit_utf8_byte_t *input_curr_p = input_buffer_p;
uint32_t index = 0;
if (re_ctx.flags & RE_FLAG_GLOBAL)
if (re_ctx.flags & (RE_FLAG_GLOBAL | RE_FLAG_STICKY))
{
ecma_string_t *lastindex_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL);
ecma_value_t lastindex_value = ecma_op_object_get_own_data_prop (regexp_object_p, lastindex_str_p);
@ -1270,10 +1370,7 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */
uint8_t *bc_start_p = (uint8_t *) (bc_p + 1);
const lit_utf8_byte_t *matched_p = NULL;
JERRY_TRACE_MSG ("Exec with flags [global: %d, ignoreCase: %d, multiline: %d]\n",
re_ctx.flags & RE_FLAG_GLOBAL,
re_ctx.flags & RE_FLAG_IGNORE_CASE,
re_ctx.flags & RE_FLAG_MULTILINE);
JERRY_TRACE_MSG ("Exec with flags [%x]\n", re_ctx.flags);
JERRY_ASSERT (index <= input_length);
while (true)
@ -1285,8 +1382,26 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */
break;
}
index++;
if (index > input_length)
#if ENABLED (JERRY_ES2015)
if (re_ctx.flags & RE_FLAG_STICKY)
{
ecma_value_t put_result = ecma_op_object_put (regexp_object_p,
ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL),
ecma_make_uint32_value (0),
true);
if (ECMA_IS_VALUE_ERROR (put_result))
{
ret_value = put_result;
goto cleanup_context;
}
JERRY_ASSERT (ecma_is_value_boolean (put_result));
ret_value = ECMA_VALUE_NULL;
goto cleanup_context;
}
#endif /* ENABLED (JERRY_ES2015) */
if (input_curr_p >= input_end_p)
{
if (re_ctx.flags & RE_FLAG_GLOBAL)
{
@ -1309,6 +1424,24 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */
}
JERRY_ASSERT (input_curr_p < input_end_p);
#if ENABLED (JERRY_ES2015)
if (re_ctx.flags & RE_FLAG_UNICODE)
{
index++;
const lit_code_point_t cp = ecma_regexp_unicode_advance (&input_curr_p,
input_end_p);
if (cp > LIT_UTF16_CODE_UNIT_MAX)
{
index++;
}
continue;
}
#endif /* ENABLED (JERRY_ES2015) */
index++;
lit_utf8_incr (&input_curr_p);
}
@ -1320,7 +1453,7 @@ ecma_regexp_exec_helper (ecma_value_t regexp_value, /**< RegExp object */
goto cleanup_context;
}
if (re_ctx.flags & RE_FLAG_GLOBAL)
if (re_ctx.flags & (RE_FLAG_GLOBAL | RE_FLAG_STICKY))
{
JERRY_ASSERT (index <= input_length);
@ -1417,7 +1550,40 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**<replace cont
ecma_regexp_ctx_t re_ctx;
re_ctx.flags = bc_p->header.status_flags;
uint8_t string_flags = ECMA_STRING_FLAG_IS_ASCII;
lit_utf8_size_t string_length;
ctx_p->string_p = ecma_string_get_chars (string_p,
&(ctx_p->string_size),
&string_length,
NULL,
&string_flags);
const lit_utf8_byte_t *const string_end_p = ctx_p->string_p + ctx_p->string_size;
const uint8_t *const bc_start_p = (const uint8_t *) (bc_p + 1);
const lit_utf8_byte_t *matched_p = NULL;
const lit_utf8_byte_t *current_p = ctx_p->string_p;
const lit_utf8_byte_t *last_append_p = current_p;
JERRY_ASSERT (ctx_p->index <= string_length);
#if ENABLED (JERRY_ES2015)
/* Global matches always start at index 0, but Sticky matches may have a non-zero lastIndex. */
if (ctx_p->index > 0)
{
if (string_flags & ECMA_STRING_FLAG_IS_ASCII)
{
current_p += ctx_p->index;
}
else
{
ecma_length_t index = ctx_p->index;
while (index--)
{
lit_utf8_incr (&current_p);
}
}
}
#endif /* ENABLED (JERRY_ES2015) */
ecma_regexp_initialize_context (&re_ctx,
bc_p,
ctx_p->string_p,
@ -1427,12 +1593,6 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**<replace cont
ctx_p->capture_count = re_ctx.captures_count;
ctx_p->u.captures_p = re_ctx.captures_p;
const uint8_t *const bc_start_p = (const uint8_t *) (bc_p + 1);
const lit_utf8_byte_t *matched_p = NULL;
const lit_utf8_byte_t *current_p = ctx_p->string_p;
const lit_utf8_byte_t *last_append_p = current_p;
uint32_t index = 0;
while (true)
{
memset (re_ctx.captures_p, 0, re_ctx.captures_count);
@ -1443,7 +1603,7 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**<replace cont
if (ECMA_RE_STACK_LIMIT_REACHED (matched_p))
{
result = ecma_raise_range_error (ECMA_ERR_MSG ("Stack limit exceeded."));
goto cleanup_context;
goto cleanup_builder;
}
const lit_utf8_size_t remaining_size = (lit_utf8_size_t) (current_p - last_append_p);
@ -1468,7 +1628,7 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**<replace cont
ecma_collection_push_back (arguments_p, capture);
}
ecma_collection_push_back (arguments_p, ecma_make_uint32_value (index));
ecma_collection_push_back (arguments_p, ecma_make_uint32_value (ctx_p->index));
ecma_ref_ecma_string (string_p);
ecma_collection_push_back (arguments_p, ecma_make_string_value (string_p));
ecma_object_t *function_p = ecma_get_object_from_value (replace_arg);
@ -1508,9 +1668,9 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**<replace cont
}
const lit_utf8_size_t matched_size = (lit_utf8_size_t) (global_capture_p->end_p - global_capture_p->begin_p);
if (matched_size > 1)
if (matched_size > 0)
{
index += lit_utf8_string_length (current_p, matched_size);
ctx_p->index += lit_utf8_string_length (current_p, matched_size);
current_p = last_append_p;
continue;
}
@ -1521,7 +1681,23 @@ ecma_regexp_replace_helper_fast (ecma_replace_context_t *ctx_p, /**<replace cont
break;
}
index++;
#if ENABLED (JERRY_ES2015)
if ((re_ctx.flags & RE_FLAG_UNICODE) != 0)
{
ctx_p->index++;
const lit_code_point_t cp = ecma_regexp_unicode_advance (&current_p,
string_end_p);
if (cp > LIT_UTF16_CODE_UNIT_MAX)
{
ctx_p->index++;
}
continue;
}
#endif /* ENABLED (JERRY_ES2015) */
ctx_p->index++;
lit_utf8_incr (&current_p);
}
@ -1537,6 +1713,11 @@ cleanup_builder:
cleanup_context:
ecma_regexp_cleanup_context (&re_ctx);
if (string_flags & ECMA_STRING_FLAG_MUST_BE_FREED)
{
jmem_heap_free_block ((void *) ctx_p->string_p, ctx_p->string_size);
}
return result;
} /* ecma_regexp_replace_helper_fast */
@ -1564,6 +1745,7 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
ecma_object_t *this_obj_p = ecma_get_object_from_value (this_arg);
ecma_replace_context_t replace_ctx;
replace_ctx.index = 0;
/* 3. */
ecma_string_t *string_p = ecma_op_to_string (string_arg);
@ -1572,14 +1754,6 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
return ECMA_VALUE_ERROR;
}
lit_utf8_size_t string_length;
uint8_t string_flags = ECMA_STRING_FLAG_IS_ASCII;
replace_ctx.string_p = ecma_string_get_chars (string_p,
&(replace_ctx.string_size),
&string_length,
NULL,
&string_flags);
ecma_value_t result = ECMA_VALUE_ERROR;
/* 6. */
@ -1604,9 +1778,25 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
const bool global = ecma_op_to_boolean (result);
ecma_free_value (result);
#if ENABLED (JERRY_ES2015)
const lit_utf8_size_t string_length = ecma_string_get_length (string_p);
bool unicode = false;
#endif /* ENABLED (JERRY_ES2015) */
/* 10. */
if (global)
{
#if ENABLED (JERRY_ES2015)
result = ecma_op_object_get_by_magic_id (this_obj_p, LIT_MAGIC_STRING_UNICODE);
if (ECMA_IS_VALUE_ERROR (result))
{
goto cleanup_replace;
}
unicode = ecma_op_to_boolean (result);
ecma_free_value (result);
#endif /* ENABLED (JERRY_ES2015) */
result = ecma_op_object_put (this_obj_p,
ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL),
ecma_make_uint32_value (0),
@ -1661,6 +1851,44 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
&& !ecma_builtin_is (this_obj_p, ECMA_BUILTIN_ID_REGEXP_PROTOTYPE)
&& ecma_builtin_is_regexp_exec (function_p))
{
result = ecma_op_object_get_by_magic_id (this_obj_p, LIT_MAGIC_STRING_STICKY);
if (ECMA_IS_VALUE_ERROR (result))
{
goto cleanup_replace;
}
const bool sticky = ecma_op_to_boolean (result);
ecma_free_value (result);
if (sticky && !global)
{
ecma_string_t *lastindex_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL);
ecma_value_t lastindex_value = ecma_op_object_get_own_data_prop (this_obj_p, lastindex_str_p);
result = ecma_op_to_length (lastindex_value, &replace_ctx.index);
ecma_free_value (lastindex_value);
if (ECMA_IS_VALUE_ERROR (result))
{
goto cleanup_replace;
}
if (replace_ctx.index > string_length)
{
ecma_deref_object ((ecma_object_t *) function_p);
result = ecma_op_object_put (this_obj_p,
ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL),
ecma_make_uint32_value (0),
true);
JERRY_ASSERT (ecma_is_value_true (result));
ecma_ref_ecma_string (string_p);
result = ecma_make_string_value (string_p);
goto cleanup_replace;
}
}
ecma_extended_object_t *re_obj_p = (ecma_extended_object_t *) this_obj_p;
const re_compiled_code_t *bc_p = ECMA_GET_INTERNAL_VALUE_ANY_POINTER (re_compiled_code_t,
re_obj_p->u.class_prop.u.value);
@ -1756,8 +1984,8 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
goto cleanup_results;
}
uint32_t length;
if (ECMA_IS_VALUE_ERROR (ecma_op_to_length (result, &length)))
uint32_t index;
if (ECMA_IS_VALUE_ERROR (ecma_op_to_length (result, &index)))
{
ecma_free_value (result);
result = ECMA_VALUE_ERROR;
@ -1766,10 +1994,12 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
ecma_free_value (result);
index = ecma_op_advance_string_index (string_p, index, unicode);
/* 10.d.iii.3.c */
result = ecma_op_object_put (this_obj_p,
ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL),
ecma_make_uint32_value (length + 1),
ecma_make_uint32_value (index),
true);
if (ECMA_IS_VALUE_ERROR (result))
@ -1788,6 +2018,13 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
}
}
uint8_t string_flags = ECMA_STRING_FLAG_IS_ASCII;
replace_ctx.string_p = ecma_string_get_chars (string_p,
&(replace_ctx.string_size),
NULL,
NULL,
&string_flags);
/* 14. */
replace_ctx.builder = ecma_stringbuilder_create ();
replace_ctx.matched_p = NULL;
@ -1795,8 +2032,6 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
/* 15. */
const lit_utf8_byte_t *source_position_p = replace_ctx.string_p;
lit_utf8_size_t source_index = 0;
const lit_utf8_byte_t *const string_end_p = replace_ctx.string_p + replace_ctx.string_size;
/* 16. */
@ -1893,7 +2128,7 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
n++;
}
const bool should_replace = (position >= source_index);
const bool should_replace = (position >= replace_ctx.index);
/* 16.p */
if (should_replace)
{
@ -1908,7 +2143,7 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
else
{
match_position_p = source_position_p;
lit_utf8_size_t distance = position - source_index;
lit_utf8_size_t distance = position - replace_ctx.index;
while (distance--)
{
lit_utf8_incr (&match_position_p);
@ -1921,7 +2156,7 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
replace_ctx.match_byte_pos = (lit_utf8_size_t) (match_position_p - replace_ctx.string_p);
source_position_p = JERRY_MIN (match_position_p + matched_str_size, string_end_p);
source_index = JERRY_MIN (position + matched_str_length, string_length);
replace_ctx.index = JERRY_MIN (position + matched_str_length, string_length);
}
/* 16.m */
@ -1979,7 +2214,7 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
}
/* 18. */
JERRY_ASSERT (source_index <= string_length);
JERRY_ASSERT (replace_ctx.index <= string_length);
ecma_stringbuilder_append_raw (&(replace_ctx.builder),
source_position_p,
(lit_utf8_size_t) (string_end_p - source_position_p));
@ -1990,6 +2225,11 @@ ecma_regexp_replace_helper (ecma_value_t this_arg, /**< this argument */
cleanup_builder:
ecma_stringbuilder_destroy (&replace_ctx.builder);
if (string_flags & ECMA_STRING_FLAG_MUST_BE_FREED)
{
jmem_heap_free_block ((void *) replace_ctx.string_p, replace_ctx.string_size);
}
cleanup_results:
ecma_collection_free (results_p);
#endif /* !ENABLED (JERRY_ES2015) */
@ -2001,11 +2241,6 @@ cleanup_replace:
}
cleanup_string:
if (string_flags & ECMA_STRING_FLAG_MUST_BE_FREED)
{
jmem_heap_free_block ((void *) replace_ctx.string_p, replace_ctx.string_size);
}
ecma_deref_ecma_string (string_p);
return result;

View File

@ -101,8 +101,8 @@ ecma_value_t ecma_op_create_regexp_object_from_bytecode (re_compiled_code_t *byt
ecma_value_t ecma_op_create_regexp_object (ecma_string_t *pattern_p, uint16_t flags);
ecma_value_t ecma_regexp_exec_helper (ecma_value_t regexp_value, ecma_value_t input_string, bool ignore_global);
ecma_string_t *ecma_regexp_read_pattern_str_helper (ecma_value_t pattern_arg);
ecma_char_t ecma_regexp_canonicalize (ecma_char_t ch, bool is_ignorecase);
ecma_char_t ecma_regexp_canonicalize_char (ecma_char_t ch);
lit_code_point_t ecma_regexp_canonicalize (lit_code_point_t ch, bool is_ignorecase);
lit_code_point_t ecma_regexp_canonicalize_char (lit_code_point_t ch);
ecma_value_t ecma_regexp_parse_flags (ecma_string_t *flags_str_p, uint16_t *flags_p);
void ecma_regexp_initialize_props (ecma_object_t *re_obj_p, ecma_string_t *source_p, uint16_t flags);

View File

@ -338,7 +338,8 @@ LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_SOURCE, "source")
#if ENABLED (JERRY_BUILTIN_ARRAY)
LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_SPLICE, "splice")
#endif
#if ENABLED (JERRY_BUILTIN_REGEXP) && ENABLED (JERRY_ES2015)
#if ENABLED (JERRY_BUILTIN_REGEXP) && ENABLED (JERRY_ES2015) \
|| ENABLED (JERRY_BUILTIN_REGEXP) && !( !ENABLED (JERRY_ES2015))
LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_STICKY, "sticky")
#endif
LIT_MAGIC_STRING_DEF (LIT_MAGIC_STRING_STRING, "string")

View File

@ -2317,6 +2317,14 @@ lexer_construct_regexp_object (parser_context_t *context_p, /**< context */
{
flag = RE_FLAG_MULTILINE;
}
else if (source_p[0] == LIT_CHAR_LOWERCASE_U)
{
flag = RE_FLAG_UNICODE;
}
else if (source_p[0] == LIT_CHAR_LOWERCASE_Y)
{
flag = RE_FLAG_STICKY;
}
if (flag == 0)
{

View File

@ -15,6 +15,7 @@
#include "ecma-globals.h"
#include "re-bytecode.h"
#include "ecma-regexp-object.h"
#if ENABLED (JERRY_BUILTIN_REGEXP)
@ -455,8 +456,16 @@ re_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
JERRY_DEBUG_MSG ("%d", num_of_class);
while (num_of_class)
{
JERRY_DEBUG_MSG (" %d", re_get_char (&bytecode_p));
JERRY_DEBUG_MSG ("-%d", re_get_char (&bytecode_p));
if ((compiled_code_p->header.status_flags & RE_FLAG_UNICODE) != 0)
{
JERRY_DEBUG_MSG (" %u", re_get_value (&bytecode_p));
JERRY_DEBUG_MSG ("-%u", re_get_value (&bytecode_p));
}
else
{
JERRY_DEBUG_MSG (" %u", re_get_char (&bytecode_p));
JERRY_DEBUG_MSG ("-%u", re_get_char (&bytecode_p));
}
num_of_class--;
}
JERRY_DEBUG_MSG (", ");

View File

@ -226,12 +226,29 @@ re_insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compile
*/
static void
re_append_char_class (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
ecma_char_t start, /**< character class range from */
ecma_char_t end) /**< character class range to */
lit_code_point_t start, /**< character class range from */
lit_code_point_t end) /**< character class range to */
{
re_append_char (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (start, re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
re_append_char (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (end, re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
re_ctx_p->parser_ctx_p->classes_count++;
#if ENABLED (JERRY_ES2015)
if (re_ctx_p->flags & RE_FLAG_UNICODE)
{
re_append_u32 (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (start, re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
re_append_u32 (re_ctx_p->bytecode_ctx_p, ecma_regexp_canonicalize (end, re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
return;
}
#endif /* ENABLED (JERRY_ES2015) */
JERRY_ASSERT (start <= LIT_UTF16_CODE_UNIT_MAX);
JERRY_ASSERT (end <= LIT_UTF16_CODE_UNIT_MAX);
re_append_char (re_ctx_p->bytecode_ctx_p,
(ecma_char_t) ecma_regexp_canonicalize (start,
re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
re_append_char (re_ctx_p->bytecode_ctx_p,
(ecma_char_t) ecma_regexp_canonicalize (end,
re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
} /* re_append_char_class */
/**
@ -250,7 +267,7 @@ re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */
out_token_p->qmax = out_token_p->qmin = 1;
parser_ctx_p->classes_count = 0;
ecma_char_t start = LIT_CHAR_UNDEF;
lit_code_point_t start = LIT_CHAR_UNDEF;
bool is_range = false;
const bool is_char_class = (re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS
|| re_ctx_p->current_token.type == RE_TOK_START_INV_CHAR_CLASS);
@ -269,7 +286,7 @@ re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */
return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string"));
}
ecma_char_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p);
lit_code_point_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p);
if (ch == LIT_CHAR_RIGHT_SQUARE)
{
@ -459,6 +476,20 @@ re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */
}
} /* ch == LIT_CHAR_BACKSLASH */
#if ENABLED (JERRY_ES2015)
if (re_ctx_p->flags & RE_FLAG_UNICODE
&& lit_is_code_point_utf16_high_surrogate (ch)
&& parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p)
{
const ecma_char_t next_ch = lit_utf8_peek_next (parser_ctx_p->input_curr_p);
if (lit_is_code_point_utf16_low_surrogate (next_ch))
{
ch = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch, next_ch);
lit_utf8_incr (&parser_ctx_p->input_curr_p);
}
}
#endif /* ENABLED (JERRY_ES2015) */
if (start != LIT_CHAR_UNDEF)
{
if (is_range)
@ -559,8 +590,8 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context
(unsigned int) re_ctx_p->current_token.qmax);
re_append_opcode (bc_ctx_p, RE_OP_CHAR);
re_append_char (bc_ctx_p, ecma_regexp_canonicalize ((ecma_char_t) re_ctx_p->current_token.value,
re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
re_append_char (bc_ctx_p, (ecma_char_t) ecma_regexp_canonicalize ((ecma_char_t) re_ctx_p->current_token.value,
re_ctx_p->flags & RE_FLAG_IGNORE_CASE));
ret_value = re_insert_simple_iterator (re_ctx_p, new_atom_start_offset);
break;

View File

@ -0,0 +1,62 @@
// Copyright JS Foundation and other contributors, http://js.foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
var r = /[𐲡-𐲱𐲟]/u;
var m = r.exec("𐲬");
assert(m !== null);
assert(m[0] === "𐲬");
r = /[𐲡E]/ug;
assert (r.exec("E𐲡E")[0] === 'E');
assert (r.exec("E𐲡E")[0] === '𐲡');
assert (r.exec("E𐲡E")[0] === 'E');
try {
eval("/[𐲡-𐲱𐲟]/");
assert (false);
} catch (e) {
assert (e instanceof SyntaxError);
}
assert (/\udc96/.exec("\ud803\udc96")[0] === '\udc96');
assert (/\udc96/u.exec("\ud803\udc96") === null);
assert (/\udc96/u.exec("\udc96")[0] === '\udc96');
assert (/\ud803/.exec("\ud803\udc96")[0] === '\ud803');
assert (/\ud803/u.exec("\ud803\udc96") === null);
assert (/\ud803/u.exec("\ud803")[0] === '\ud803');
assert (/./u.exec("\ud803\udc96")[0] === '𐲖');
assert (/./.exec("\ud803\udc96")[0] === '\ud803');
assert (/./u.exec("\ud803\ud803")[0] === '\ud803');
assert (/./u.exec("\udc96\udc96")[0] === '\udc96');
assert (/./u.exec("\ud803")[0] === '\ud803');
var r = /abc/y;
m = r.exec ("strabcstr");
assert (m === null);
r.lastIndex = 3;
m = r.exec ("strabcstr");
assert (m[0] === "abc");
assert (r.lastIndex === 6);
m = r.exec ("strabcstr");
assert (m === null);
assert (r.lastIndex === 0);
var r = /abc/yg;
m = r.exec ("strabcstr");
assert (m === null);

View File

@ -84,3 +84,63 @@ try {
} catch (e) {
assert (e === "abrupt flags toString");
}
var o = {
global: true,
source: "str"
}
Object.defineProperty(o, 'unicode', { 'get': function () {throw "abrupt unicode get"; }});
try {
RegExp.prototype[Symbol.match].call(o, "str");
assert (false);
} catch (e) {
assert (e === "abrupt unicode get");
}
assert ("str𐲡fgh".replace(/(?:)/gu, "x") === 'xsxtxrx𐲡xfxgxhx');
assert ("str𐲡fgh".replace(/(?:)/g, "x") === 'xsxtxrx\ud803x\udca1xfxgxhx');
r = /(?:)/gu;
/* Disable fast path. */
r.exec = function (s) { return RegExp.prototype.exec.call(this, s); };
assert ("str𐲡fgh".replace(r, "x") === 'xsxtxrx𐲡xfxgxhx');
Object.defineProperty(r, 'unicode', {value: false});
assert ("str𐲡fgh".replace(r, "x") === 'xsxtxrx\ud803x\udca1xfxgxhx');
r = /(?:)/gu;
assert (RegExp.prototype[Symbol.match].call(r, "str𐲡fgh").length === 8);
Object.defineProperty(r, 'unicode', {value: false});
assert (RegExp.prototype[Symbol.match].call(r, "str𐲡fgh").length === 9);
r = /(?:)/gy;
r.lastIndex = 2;
assert ("asd".replace(r, "x") === "xaxsxdx");
assert (r.lastIndex === 0);
r.lastIndex = 5;
assert ("asd".replace(r, "x") === "xaxsxdx");
assert (r.lastIndex === 0);
r = /(?:)/y;
r.lastIndex = 2;
assert ("asd".replace(r, "x") === "asxd");
assert (r.lastIndex === 2);
r.lastIndex = 5;
assert ("asd".replace(r, "x") === "asd");
assert (r.lastIndex === 0);
r.lastIndex = 2;
/* Disable fast path. */
r.exec = function (s) { return RegExp.prototype.exec.call(this, s); };
assert ("asd".replace(r, "x") === "asxd");
assert (r.lastIndex === 2);
r.lastIndex = 5;
assert ("asd".replace(r, "x") === "asd");
assert (r.lastIndex === 0);
assert (RegExp.prototype[Symbol.match].call(/a/y, "aaa").length === 1);
assert (RegExp.prototype[Symbol.match].call(/a/gy, "aaa").length === 3);