From 3c71daaf84bc9e933e8723651e7ee7dce6232706 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A1szl=C3=B3=20Lang=C3=B3?= Date: Wed, 22 Jul 2015 09:17:37 +0200 Subject: [PATCH] Use unicode iterators in RegExp engine and implement \d, \D, \w, \W, \s, \S, \x, \u matching in RegExp. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JerryScript-DCO-1.0-Signed-off-by: Szilard Ledan szledan.u-szeged@partner.samsung.com JerryScript-DCO-1.0-Signed-off-by: László Langó llango.u-szeged@partner.samsung.com --- .../ecma-builtin-regexp-prototype.cpp | 8 +- .../builtin-objects/ecma-builtin-regexp.cpp | 11 +- .../ecma/operations/ecma-regexp-object.cpp | 321 ++++++------ .../ecma/operations/ecma-regexp-object.h | 5 +- jerry-core/parser/regexp/re-compiler.cpp | 25 +- jerry-core/parser/regexp/re-parser.cpp | 489 ++++++++++-------- jerry-core/parser/regexp/re-parser.h | 17 +- tests/jerry/regexp-character-class.js | 85 ++- 8 files changed, 559 insertions(+), 402 deletions(-) diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.cpp b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.cpp index 49f42e0e5..c52025869 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.cpp +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.cpp @@ -79,14 +79,12 @@ ecma_builtin_regexp_prototype_exec (ecma_value_t this_arg, /**< this argument */ /* Convert ecma_String_t *to regexp_bytecode_t* */ lit_utf8_size_t input_str_size = ecma_string_get_size (input_str_p); - MEM_DEFINE_LOCAL_ARRAY (input_utf8_buffer_p, input_str_size + 1, lit_utf8_byte_t); + MEM_DEFINE_LOCAL_ARRAY (input_utf8_buffer_p, input_str_size, lit_utf8_byte_t); ecma_string_to_utf8_string (input_str_p, input_utf8_buffer_p, (ssize_t) input_str_size); + lit_utf8_iterator_t iter = lit_utf8_iterator_create (input_utf8_buffer_p, input_str_size); - FIXME ("Update ecma_regexp_exec_helper so that zero symbol is not needed."); - 
input_utf8_buffer_p[input_str_size] = LIT_BYTE_NULL; - - ret_value = ecma_regexp_exec_helper (obj_p, bytecode_p, input_utf8_buffer_p, input_str_size); + ret_value = ecma_regexp_exec_helper (obj_p, bytecode_p, &iter); MEM_FINALIZE_LOCAL_ARRAY (input_utf8_buffer_p); diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp.cpp b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp.cpp index 31166a3bf..8a22c62ed 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp.cpp +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp.cpp @@ -119,7 +119,16 @@ ecma_builtin_regexp_dispatch_construct (const ecma_value_t *arguments_list_p, /* if (ecma_is_completion_value_empty (ret_value)) { - ret_value = ecma_op_create_regexp_object (pattern_string_p, flags_string_p); + if (ecma_string_get_length (pattern_string_p) == 0) + { + ecma_string_t *magic_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_EMPTY_NON_CAPTURE_GROUP); + ret_value = ecma_op_create_regexp_object (magic_str_p, flags_string_p); + ecma_deref_ecma_string (magic_str_p); + } + else + { + ret_value = ecma_op_create_regexp_object (pattern_string_p, flags_string_p); + } } if (flags_string_p != NULL) diff --git a/jerry-core/ecma/operations/ecma-regexp-object.cpp b/jerry-core/ecma/operations/ecma-regexp-object.cpp index e81312ddb..5ee73493d 100644 --- a/jerry-core/ecma/operations/ecma-regexp-object.cpp +++ b/jerry-core/ecma/operations/ecma-regexp-object.cpp @@ -75,17 +75,16 @@ re_parse_regexp_flags (ecma_string_t *flags_str_p, /**< Input string with flags { ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); - FIXME ("Unicode: properly process non-ascii characters."); lit_utf8_size_t flags_str_size = ecma_string_get_size (flags_str_p); MEM_DEFINE_LOCAL_ARRAY (flags_start_p, flags_str_size, lit_utf8_byte_t); ecma_string_to_utf8_string (flags_str_p, flags_start_p, (ssize_t) flags_str_size); - lit_utf8_byte_t *flags_char_p = flags_start_p; + lit_utf8_iterator_t iter = 
lit_utf8_iterator_create (flags_start_p, flags_str_size); - while (flags_char_p < flags_start_p + flags_str_size + while (!lit_utf8_iterator_is_eos (&iter) && ecma_is_completion_value_empty (ret_value)) { - switch (*flags_char_p) + switch (lit_utf8_iterator_read_next (&iter)) { case 'g': { @@ -120,7 +119,6 @@ re_parse_regexp_flags (ecma_string_t *flags_str_p, /**< Input string with flags break; } } - flags_char_p++; } MEM_FINALIZE_LOCAL_ARRAY (flags_start_p); @@ -231,48 +229,6 @@ ecma_op_create_regexp_object (ecma_string_t *pattern_p, /**< input pattern */ return ret_value; } /* ecma_op_create_regexp_object */ -/** - * Backtrack a unicode character - */ -static const lit_utf8_byte_t * -re_utf8_backtrack (const lit_utf8_byte_t *str_p) -{ - /* FIXME: change to string iterator with unicode support, when it would be implemented */ - return --str_p; -} /* re_utf8_backtrack */ - -/** - * Helper to get an input character and increase string pointer. - */ -static ecma_char_t -re_get_input_char (const lit_utf8_byte_t **char_p) -{ - /* FIXME: change to string iterator with unicode support, when it would be implemented */ - const lit_utf8_byte_t ch = **char_p; - (*char_p)++; - return ch; -} /* re_get_input_char */ - -/** - * Helper to get current input character, won't increase string pointer. - */ -static ecma_char_t -re_lookup_input_char (const lit_utf8_byte_t *str_p) -{ - /* FIXME: change to string iterator with unicode support, when it would be implemented */ - return *str_p; -} /* re_lookup_input_char */ - -/** - * Helper to get previous input character, won't decrease string pointer. - */ -static ecma_char_t -re_lookup_prev_char (const lit_utf8_byte_t *str_p) -{ - /* FIXME: change to string iterator with unicode support, when it would be implemented */ - return *(--str_p); -} /* re_lookup_prev_char */ - /** * Recursive function for RegExp matching. Tests for a regular expression * match and returns a MatchResult value. 
@@ -286,8 +242,8 @@ re_lookup_prev_char (const lit_utf8_byte_t *str_p) static ecma_completion_value_t re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ re_bytecode_t *bc_p, /**< pointer to the current RegExp bytecode */ - const lit_utf8_byte_t *str_p, /**< pointer to the current input character */ - const lit_utf8_byte_t **res_p) /**< pointer to the matching substring */ + lit_utf8_iterator_t iter, /**< input string iterator */ + lit_utf8_iterator_t *out_iter_p) /**< Output: matching substring iterator */ { ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); re_opcode_t op; @@ -313,37 +269,52 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_MATCH: { JERRY_DDLOG ("Execute RE_OP_MATCH: match\n"); - *res_p = str_p; + *out_iter_p = iter; re_ctx_p->recursion_depth--; ret_value = ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_TRUE); return ret_value; /* match */ } case RE_OP_CHAR: { - uint32_t ch1 = re_get_value (&bc_p); - uint32_t ch2 = re_get_input_char (&str_p); + if (lit_utf8_iterator_is_eos (&iter)) + { + re_ctx_p->recursion_depth--; + return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ + } + + ecma_char_t ch1 = (ecma_char_t) re_get_value (&bc_p); + ecma_char_t ch2 = lit_utf8_iterator_read_next (&iter); JERRY_DDLOG ("Character matching %d to %d: ", ch1, ch2); - if (ch2 == '\0' || ch1 != ch2) + if (ch1 != ch2) { JERRY_DDLOG ("fail\n"); re_ctx_p->recursion_depth--; return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } + JERRY_DDLOG ("match\n"); + break; /* tail merge */ } case RE_OP_PERIOD: { - uint32_t ch1 = re_get_input_char (&str_p); - JERRY_DDLOG ("Period matching '.' 
to %d: ", ch1); + if (lit_utf8_iterator_is_eos (&iter)) + { + re_ctx_p->recursion_depth--; + return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ + } - if (ch1 == '\n' || ch1 == '\0') + ecma_char_t ch = lit_utf8_iterator_read_next (&iter); + JERRY_DDLOG ("Period matching '.' to %d: ", (uint32_t) ch); + + if (lit_char_is_line_terminator (ch)) { JERRY_DDLOG ("fail\n"); re_ctx_p->recursion_depth--; return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } + JERRY_DDLOG ("match\n"); break; /* tail merge */ } @@ -351,7 +322,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ { JERRY_DDLOG ("Execute RE_OP_ASSERT_START: "); - if (str_p <= re_ctx_p->input_start_p) + if ((iter.buf_p + iter.buf_pos.offset) <= re_ctx_p->input_start_p) { JERRY_DDLOG ("match\n"); break; @@ -364,7 +335,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } - if (lit_char_is_line_terminator (re_lookup_prev_char (str_p))) + if (lit_char_is_line_terminator (lit_utf8_iterator_peek_prev (&iter))) { JERRY_DDLOG ("match\n"); break; @@ -378,7 +349,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ { JERRY_DDLOG ("Execute RE_OP_ASSERT_END: "); - if (str_p >= re_ctx_p->input_end_p) + if ((iter.buf_p + iter.buf_pos.offset) >= re_ctx_p->input_end_p) { JERRY_DDLOG ("match\n"); break; /* tail merge */ @@ -391,7 +362,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } - if (lit_char_is_line_terminator (re_lookup_input_char (str_p))) + if (lit_char_is_line_terminator (lit_utf8_iterator_peek_next (&iter))) { JERRY_DDLOG ("match\n"); break; /* tail merge */ @@ -406,27 +377,27 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ { bool is_wordchar_left, 
is_wordchar_right; - if (str_p <= re_ctx_p->input_start_p) + if ((iter.buf_p + iter.buf_pos.offset) <= re_ctx_p->input_start_p) { is_wordchar_left = false; /* not a wordchar */ } else { - is_wordchar_left = lit_char_is_word_char (re_lookup_prev_char (str_p)); + is_wordchar_left = lit_char_is_word_char (lit_utf8_iterator_peek_prev (&iter)); } - if (str_p >= re_ctx_p->input_end_p) + if ((iter.buf_p + iter.buf_pos.offset) >= re_ctx_p->input_end_p) { is_wordchar_right = false; /* not a wordchar */ } else { - is_wordchar_right = lit_char_is_word_char (re_lookup_input_char (str_p)); + is_wordchar_right = lit_char_is_word_char (lit_utf8_iterator_peek_next (&iter)); } if (op == RE_OP_ASSERT_WORD_BOUNDARY) { - JERRY_DDLOG ("Execute RE_OP_ASSERT_WORD_BOUNDARY at %c: ", *str_p); + JERRY_DDLOG ("Execute RE_OP_ASSERT_WORD_BOUNDARY: "); if (is_wordchar_left == is_wordchar_right) { JERRY_DDLOG ("fail\n"); @@ -437,7 +408,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ else { JERRY_ASSERT (op == RE_OP_ASSERT_NOT_WORD_BOUNDARY); - JERRY_DDLOG ("Execute RE_OP_ASSERT_NOT_WORD_BOUNDARY at %c: ", *str_p); + JERRY_DDLOG ("Execute RE_OP_ASSERT_NOT_WORD_BOUNDARY: "); if (is_wordchar_left != is_wordchar_right) { @@ -454,19 +425,21 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_LOOKAHEAD_NEG: { ecma_completion_value_t match_value = ecma_make_empty_completion_value (); - const lit_utf8_byte_t *sub_str_p = NULL; + lit_utf8_iterator_t sub_iter = lit_utf8_iterator_create (NULL, 0); - MEM_DEFINE_LOCAL_ARRAY (saved_bck_p, re_ctx_p->num_of_captures, lit_utf8_byte_t *); - size_t size = (size_t) (re_ctx_p->num_of_captures) * sizeof (const lit_utf8_byte_t *); + uint32_t array_size = re_ctx_p->num_of_captures + re_ctx_p->num_of_non_captures; + MEM_DEFINE_LOCAL_ARRAY (saved_bck_p, array_size, lit_utf8_iterator_t); + + size_t size = (size_t) (array_size) * sizeof (lit_utf8_iterator_t); memcpy (saved_bck_p, re_ctx_p->saved_p, 
size); do { uint32_t offset = re_get_value (&bc_p); - if (!sub_str_p) + if (!sub_iter.buf_p) { - match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + match_value = re_match_regexp (re_ctx_p, bc_p, iter, &sub_iter); if (ecma_is_completion_value_throw (match_value)) { break; @@ -480,11 +453,11 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ { JERRY_DDLOG ("Execute RE_OP_LOOKAHEAD_POS/NEG: "); ecma_free_completion_value (match_value); - if ((op == RE_OP_LOOKAHEAD_POS && sub_str_p) - || (op == RE_OP_LOOKAHEAD_NEG && !sub_str_p)) + if ((op == RE_OP_LOOKAHEAD_POS && sub_iter.buf_p) + || (op == RE_OP_LOOKAHEAD_NEG && !sub_iter.buf_p)) { JERRY_DDLOG ("match\n"); - match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + match_value = re_match_regexp (re_ctx_p, bc_p, iter, &sub_iter); } else { @@ -499,7 +472,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ if (ecma_is_value_true (match_value)) { - *res_p = sub_str_p; + *out_iter_p = sub_iter; } else { @@ -519,15 +492,14 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ bool is_match; JERRY_DDLOG ("Execute RE_OP_CHAR_CLASS/RE_OP_INV_CHAR_CLASS, "); - - if (str_p >= re_ctx_p->input_end_p) + if (lit_utf8_iterator_is_eos (&iter)) { JERRY_DDLOG ("fail\n"); re_ctx_p->recursion_depth--; return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } - curr_ch = re_get_input_char (&str_p); + curr_ch = lit_utf8_iterator_read_next (&iter); num_of_ranges = re_get_value (&bc_p); is_match = false; @@ -573,34 +545,33 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_BACKREFERENCE: { uint32_t backref_idx; - const lit_utf8_byte_t *sub_str_p; backref_idx = re_get_value (&bc_p); JERRY_DDLOG ("Execute RE_OP_BACKREFERENCE (idx: %d): ", backref_idx); backref_idx *= 2; /* backref n -> saved indices [n*2, n*2+1] */ JERRY_ASSERT (backref_idx >= 2 && backref_idx + 1 < 
re_ctx_p->num_of_captures); - if (!re_ctx_p->saved_p[backref_idx] || !re_ctx_p->saved_p[backref_idx + 1]) + if (!re_ctx_p->saved_p[backref_idx].buf_p || !re_ctx_p->saved_p[backref_idx + 1].buf_p) { JERRY_DDLOG ("match\n"); break; /* capture is 'undefined', always matches! */ } - sub_str_p = re_ctx_p->saved_p[backref_idx]; + lit_utf8_iterator_t sub_iter = re_ctx_p->saved_p[backref_idx]; - while (sub_str_p < re_ctx_p->saved_p[backref_idx + 1]) + while (sub_iter.buf_pos.offset < re_ctx_p->saved_p[backref_idx + 1].buf_pos.offset) { - uint32_t ch1, ch2; + ecma_char_t ch1, ch2; - if (str_p >= re_ctx_p->input_end_p) + if ((iter.buf_p + iter.buf_pos.offset) >= re_ctx_p->input_end_p) { JERRY_DDLOG ("fail\n"); re_ctx_p->recursion_depth--; return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } - ch1 = re_get_input_char (&sub_str_p); - ch2 = re_get_input_char (&str_p); + ch1 = lit_utf8_iterator_read_next (&sub_iter); + ch2 = lit_utf8_iterator_read_next (&iter); if (ch1 != ch2) { @@ -614,21 +585,20 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } case RE_OP_SAVE_AT_START: { - const lit_utf8_byte_t *old_start_p; re_bytecode_t *old_bc_p; JERRY_DDLOG ("Execute RE_OP_SAVE_AT_START\n"); - old_start_p = re_ctx_p->saved_p[RE_GLOBAL_START_IDX]; - re_ctx_p->saved_p[RE_GLOBAL_START_IDX] = str_p; + lit_utf8_iterator_t old_start_p = re_ctx_p->saved_p[RE_GLOBAL_START_IDX]; + re_ctx_p->saved_p[RE_GLOBAL_START_IDX] = iter; do { uint32_t offset = re_get_value (&bc_p); - const lit_utf8_byte_t *sub_str_p; - ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + lit_utf8_iterator_t sub_iter = lit_utf8_iterator_create (NULL, 0); + ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, iter, &sub_iter); if (ecma_is_value_true (match_value)) { - *res_p = sub_str_p; + *out_iter_p = sub_iter; re_ctx_p->recursion_depth--; return match_value; /* match */ } @@ -649,8 +619,8 @@ re_match_regexp 
(re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_SAVE_AND_MATCH: { JERRY_DDLOG ("End of pattern is reached: match\n"); - re_ctx_p->saved_p[RE_GLOBAL_END_IDX] = str_p; - *res_p = str_p; + re_ctx_p->saved_p[RE_GLOBAL_END_IDX] = iter; + *out_iter_p = iter; re_ctx_p->recursion_depth--; return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_TRUE); /* match */ } @@ -682,8 +652,8 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ * after the group first, if zero iteration is allowed. */ uint32_t start_idx, iter_idx, offset; - const lit_utf8_byte_t *old_start_p; - const lit_utf8_byte_t *sub_str_p; + lit_utf8_iterator_t old_start = lit_utf8_iterator_create (NULL, 0); + lit_utf8_iterator_t sub_iter = lit_utf8_iterator_create (NULL, 0); re_bytecode_t *old_bc_p; old_bc_p = bc_p; /* save the bytecode start position of the group start */ @@ -696,8 +666,8 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ iter_idx = start_idx - 1; start_idx *= 2; - old_start_p = re_ctx_p->saved_p[start_idx]; - re_ctx_p->saved_p[start_idx] = str_p; + old_start = re_ctx_p->saved_p[start_idx]; + re_ctx_p->saved_p[start_idx] = iter; } else { @@ -711,11 +681,11 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ bc_p += offset; /* Try to match after the close paren if zero is allowed */ - ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, iter, &sub_iter); if (ecma_is_value_true (match_value)) { - *res_p = sub_str_p; + *out_iter_p = sub_iter; re_ctx_p->recursion_depth--; return match_value; /* match */ } @@ -725,7 +695,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } if (RE_IS_CAPTURE_GROUP (op)) { - re_ctx_p->saved_p[start_idx] = old_start_p; + re_ctx_p->saved_p[start_idx] = old_start; } bc_p = old_bc_p; @@ -737,8 +707,7 @@ re_match_regexp 
(re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START: { uint32_t start_idx, iter_idx, old_iteration_cnt, offset; - const lit_utf8_byte_t *old_start_p; - const lit_utf8_byte_t *sub_str_p; + lit_utf8_iterator_t sub_iter = lit_utf8_iterator_create (NULL, 0); re_bytecode_t *old_bc_p; re_bytecode_t *end_bc_p = NULL; start_idx = re_get_value (&bc_p); @@ -762,18 +731,19 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ iter_idx = start_idx + (re_ctx_p->num_of_captures / 2) - 1; start_idx += re_ctx_p->num_of_captures; } - old_start_p = re_ctx_p->saved_p[start_idx]; + + lit_utf8_iterator_t old_start = re_ctx_p->saved_p[start_idx]; old_iteration_cnt = re_ctx_p->num_of_iterations_p[iter_idx]; - re_ctx_p->saved_p[start_idx] = str_p; + re_ctx_p->saved_p[start_idx] = iter; re_ctx_p->num_of_iterations_p[iter_idx] = 0; do { offset = re_get_value (&bc_p); - ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, iter, &sub_iter); if (ecma_is_value_true (match_value)) { - *res_p = sub_str_p; + *out_iter_p = sub_iter; re_ctx_p->recursion_depth--; return match_value; /* match */ } @@ -793,11 +763,11 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ || op == RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START) { JERRY_ASSERT (end_bc_p); - ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, end_bc_p, str_p, &sub_str_p); + ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, end_bc_p, iter, &sub_iter); if (ecma_is_value_true (match_value)) { - *res_p = sub_str_p; + *out_iter_p = sub_iter; re_ctx_p->recursion_depth--; return match_value; /* match */ } @@ -807,7 +777,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } } - re_ctx_p->saved_p[start_idx] = old_start_p; + re_ctx_p->saved_p[start_idx] = old_start; 
re_ctx_p->recursion_depth--; return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } @@ -815,7 +785,6 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END: { uint32_t end_idx, iter_idx, min, max; - const lit_utf8_byte_t *old_end_p; re_bytecode_t *old_bc_p; /* @@ -846,14 +815,14 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ if (re_ctx_p->num_of_iterations_p[iter_idx] >= min && re_ctx_p->num_of_iterations_p[iter_idx] <= max) { - old_end_p = re_ctx_p->saved_p[end_idx]; - re_ctx_p->saved_p[end_idx] = str_p; + lit_utf8_iterator_t old_end = re_ctx_p->saved_p[end_idx]; + re_ctx_p->saved_p[end_idx] = iter; - const lit_utf8_byte_t *sub_str_p; - ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + lit_utf8_iterator_t sub_iter = lit_utf8_iterator_create (NULL, 0); + ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, iter, &sub_iter); if (ecma_is_value_true (match_value)) { - *res_p = sub_str_p; + *out_iter_p = sub_iter; re_ctx_p->recursion_depth--; return match_value; /* match */ } @@ -862,7 +831,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ return match_value; } - re_ctx_p->saved_p[end_idx] = old_end_p; + re_ctx_p->saved_p[end_idx] = old_end; } re_ctx_p->num_of_iterations_p[iter_idx]--; bc_p = old_bc_p; @@ -874,9 +843,9 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_NON_CAPTURE_GREEDY_GROUP_END: { uint32_t start_idx, end_idx, iter_idx, min, max, offset; - const lit_utf8_byte_t *old_start_p; - const lit_utf8_byte_t *old_end_p; - const lit_utf8_byte_t *sub_str_p; + lit_utf8_iterator_t old_start = lit_utf8_iterator_create (NULL, 0); + lit_utf8_iterator_t old_end = lit_utf8_iterator_create (NULL, 0); + lit_utf8_iterator_t sub_iter = lit_utf8_iterator_create (NULL, 0); re_bytecode_t *old_bc_p; end_idx = re_get_value 
(&bc_p); @@ -901,7 +870,8 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ /* Check the empty iteration if the minimum number of iterations is reached. */ if (re_ctx_p->num_of_iterations_p[iter_idx] >= min - && str_p == re_ctx_p->saved_p[start_idx]) + && iter.buf_p == re_ctx_p->saved_p[start_idx].buf_p + && iter.buf_pos.offset == re_ctx_p->saved_p[start_idx].buf_pos.offset) { re_ctx_p->recursion_depth--; return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ @@ -910,21 +880,21 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ re_ctx_p->num_of_iterations_p[iter_idx]++; old_bc_p = bc_p; /* Save the bytecode end position of the END opcodes for matching after it. */ - old_end_p = re_ctx_p->saved_p[end_idx]; - re_ctx_p->saved_p[end_idx] = str_p; + old_end = re_ctx_p->saved_p[end_idx]; + re_ctx_p->saved_p[end_idx] = iter; if (re_ctx_p->num_of_iterations_p[iter_idx] < max) { bc_p -= offset; offset = re_get_value (&bc_p); - old_start_p = re_ctx_p->saved_p[start_idx]; - re_ctx_p->saved_p[start_idx] = str_p; - ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + old_start = re_ctx_p->saved_p[start_idx]; + re_ctx_p->saved_p[start_idx] = iter; + ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, iter, &sub_iter); if (ecma_is_value_true (match_value)) { - *res_p = sub_str_p; + *out_iter_p = sub_iter; re_ctx_p->recursion_depth--; return match_value; /* match */ } @@ -933,7 +903,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ return match_value; } - re_ctx_p->saved_p[start_idx] = old_start_p; + re_ctx_p->saved_p[start_idx] = old_start; /* Try to match alternatives if any. 
*/ bc_p += offset; @@ -942,14 +912,14 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ bc_p++; /* RE_OP_ALTERNATIVE */ offset = re_get_value (&bc_p); - old_start_p = re_ctx_p->saved_p[start_idx]; - re_ctx_p->saved_p[start_idx] = str_p; + old_start = re_ctx_p->saved_p[start_idx]; + re_ctx_p->saved_p[start_idx] = iter; - ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, iter, &sub_iter); if (ecma_is_value_true (match_value)) { - *res_p = sub_str_p; + *out_iter_p = sub_iter; re_ctx_p->recursion_depth--; return match_value; /* match */ } @@ -958,7 +928,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ return match_value; } - re_ctx_p->saved_p[start_idx] = old_start_p; + re_ctx_p->saved_p[start_idx] = old_start; bc_p += offset; } } @@ -967,11 +937,11 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ && re_ctx_p->num_of_iterations_p[iter_idx] <= max) { /* Try to match the rest of the bytecode. 
*/ - ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, old_bc_p, str_p, &sub_str_p); + ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, old_bc_p, iter, &sub_iter); if (ecma_is_value_true (match_value)) { - *res_p = sub_str_p; + *out_iter_p = sub_iter; re_ctx_p->recursion_depth--; return match_value; /* match */ } @@ -982,7 +952,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } /* restore if fails */ - re_ctx_p->saved_p[end_idx] = old_end_p; + re_ctx_p->saved_p[end_idx] = old_end; re_ctx_p->num_of_iterations_p[iter_idx]--; re_ctx_p->recursion_depth--; return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ @@ -990,7 +960,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_NON_GREEDY_ITERATOR: { uint32_t min, max, offset, num_of_iter; - const lit_utf8_byte_t *sub_str_p; + lit_utf8_iterator_t sub_iter = lit_utf8_iterator_create (NULL, 0); min = re_get_value (&bc_p); max = re_get_value (&bc_p); @@ -1004,11 +974,11 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ { if (num_of_iter >= min) { - ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p + offset, str_p, &sub_str_p); + ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p + offset, iter, &sub_iter); if (ecma_is_value_true (match_value)) { - *res_p = sub_str_p; + *out_iter_p = sub_iter; re_ctx_p->recursion_depth--; return match_value; /* match */ } @@ -1018,7 +988,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } } - ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, iter, &sub_iter); if (!ecma_is_value_true (match_value)) { @@ -1029,7 +999,8 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ break; } - str_p = sub_str_p; + + iter = sub_iter; num_of_iter++; 
} re_ctx_p->recursion_depth--; @@ -1038,7 +1009,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_GREEDY_ITERATOR: { uint32_t min, max, offset, num_of_iter; - const lit_utf8_byte_t *sub_str_p; + lit_utf8_iterator_t sub_iter = lit_utf8_iterator_create (NULL, 0); min = re_get_value (&bc_p); max = re_get_value (&bc_p); @@ -1051,7 +1022,8 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ while (num_of_iter < max) { - ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, iter, &sub_iter); + if (!ecma_is_value_true (match_value)) { if (ecma_is_completion_value_throw (match_value)) @@ -1061,17 +1033,18 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ break; } - str_p = sub_str_p; + + iter = sub_iter; num_of_iter++; } while (num_of_iter >= min) { - ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p + offset, str_p, &sub_str_p); + ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p + offset, iter, &sub_iter); if (ecma_is_value_true (match_value)) { - *res_p = sub_str_p; + *out_iter_p = sub_iter; re_ctx_p->recursion_depth--; return match_value; /* match */ } @@ -1085,7 +1058,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ break; } - str_p = re_utf8_backtrack (str_p); + lit_utf8_iterator_read_prev (&iter); num_of_iter--; } re_ctx_p->recursion_depth--; @@ -1205,14 +1178,12 @@ re_set_result_array_properties (ecma_object_t *array_obj_p, /**< result array */ ecma_completion_value_t ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */ re_bytecode_t *bc_p, /**< start of the RegExp bytecode */ - const lit_utf8_byte_t *str_p, /**< start of the input string */ - lit_utf8_size_t str_size) /**< size of the input string */ + lit_utf8_iterator_t *iter_p) /**< input string iterator */ { 
ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); - lit_utf8_size_t input_size = str_size; re_matcher_ctx_t re_ctx; - re_ctx.input_start_p = str_p; - re_ctx.input_end_p = str_p + str_size; + re_ctx.input_start_p = iter_p->buf_p; + re_ctx.input_end_p = iter_p->buf_p + iter_p->buf_size; re_ctx.match_limit = 0; re_ctx.recursion_depth = 0; @@ -1227,11 +1198,11 @@ ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */ JERRY_ASSERT (re_ctx.num_of_captures % 2 == 0); re_ctx.num_of_non_captures = re_get_value (&bc_p); - MEM_DEFINE_LOCAL_ARRAY (saved_p, re_ctx.num_of_captures + re_ctx.num_of_non_captures, const lit_utf8_byte_t *); + MEM_DEFINE_LOCAL_ARRAY (saved_p, re_ctx.num_of_captures + re_ctx.num_of_non_captures, lit_utf8_iterator_t); for (uint32_t i = 0; i < re_ctx.num_of_captures + re_ctx.num_of_non_captures; i++) { - saved_p[i] = NULL; + saved_p[i] = lit_utf8_iterator_create (NULL, 0); } re_ctx.saved_p = saved_p; @@ -1246,24 +1217,30 @@ ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */ bool is_match = false; re_ctx.num_of_iterations_p = num_of_iter_p; int32_t index = 0; + ecma_length_t input_str_len = lit_utf8_string_length (iter_p->buf_p, iter_p->buf_size); - if (re_ctx.flags & RE_FLAG_GLOBAL) + if (iter_p->buf_p && re_ctx.flags & RE_FLAG_GLOBAL) { ecma_string_t *magic_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL); ecma_property_t *lastindex_prop_p = ecma_op_object_get_property (obj_p, magic_str_p); ecma_number_t *lastindex_num_p = ecma_get_number_from_value (lastindex_prop_p->u.named_data_property.value); index = ecma_number_to_int32 (*lastindex_num_p); - JERRY_ASSERT (str_p != NULL); - str_p += ecma_number_to_int32 (*lastindex_num_p); + + JERRY_ASSERT (iter_p->buf_pos.offset == 0 && !iter_p->buf_pos.is_non_bmp_middle); + if (!lit_utf8_iterator_is_eos (iter_p) + && *lastindex_num_p <= input_str_len) + { + lit_utf8_iterator_advance (iter_p, (ecma_length_t) *lastindex_num_p); + } 
ecma_deref_ecma_string (magic_str_p); } /* 2. Try to match */ - const lit_utf8_byte_t *sub_str_p; + lit_utf8_iterator_t sub_iter = lit_utf8_iterator_create (NULL, 0); - while (str_p && str_p <= re_ctx.input_end_p && ecma_is_completion_value_empty (ret_value)) + while (ecma_is_completion_value_empty (ret_value)) { - if (index < 0 || index > (int32_t) input_size) + if (index < 0 || index > (int32_t) input_str_len) { ecma_string_t *magic_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL); ecma_number_t *lastindex_num_p = ecma_alloc_number (); @@ -1277,8 +1254,7 @@ ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */ } else { - sub_str_p = NULL; - ECMA_TRY_CATCH (match_value, re_match_regexp (&re_ctx, bc_p, str_p, &sub_str_p), ret_value); + ECMA_TRY_CATCH (match_value, re_match_regexp (&re_ctx, bc_p, *iter_p, &sub_iter), ret_value); if (ecma_is_value_true (match_value)) { @@ -1286,17 +1262,21 @@ ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */ break; } - str_p++; + if (!lit_utf8_iterator_is_eos (iter_p)) + { + lit_utf8_iterator_advance (iter_p, 1); + } index++; + ECMA_FINALIZE (match_value); } } - if (re_ctx.flags & RE_FLAG_GLOBAL) + if (iter_p->buf_p && re_ctx.flags & RE_FLAG_GLOBAL) { ecma_string_t *magic_str_p = ecma_get_magic_string (LIT_MAGIC_STRING_LASTINDEX_UL); ecma_number_t *lastindex_num_p = ecma_alloc_number (); - *lastindex_num_p = ((ecma_number_t) (sub_str_p - re_ctx.input_start_p)); + *lastindex_num_p = sub_iter.buf_pos.offset; ecma_op_object_put (obj_p, magic_str_p, ecma_make_number_value (lastindex_num_p), true); ecma_dealloc_number (lastindex_num_p); ecma_deref_ecma_string (magic_str_p); @@ -1316,14 +1296,19 @@ ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */ { ecma_string_t *index_str_p = ecma_new_ecma_string_from_uint32 (i / 2); - if (re_ctx.saved_p[i] && re_ctx.saved_p[i + 1] && re_ctx.saved_p[i + 1] >= re_ctx.saved_p[i]) + /* Note: 'iter_p->buf_p == NULL' means the input is empty 
string */ + if (((re_ctx.saved_p[i].buf_p && re_ctx.saved_p[i + 1].buf_p) + || (!iter_p->buf_p && !re_ctx.saved_p[i].buf_p && !re_ctx.saved_p[i + 1].buf_p)) + && re_ctx.saved_p[i + 1].buf_pos.offset >= re_ctx.saved_p[i].buf_pos.offset) { - ecma_length_t capture_str_len = static_cast<ecma_length_t> (re_ctx.saved_p[i + 1] - re_ctx.saved_p[i]); + ecma_length_t capture_str_len; + capture_str_len = (ecma_length_t) re_ctx.saved_p[i + 1].buf_pos.offset - re_ctx.saved_p[i].buf_pos.offset; ecma_string_t *capture_str_p; if (capture_str_len > 0) { - capture_str_p = ecma_new_ecma_string_from_utf8 (re_ctx.saved_p[i], capture_str_len); + const lit_utf8_byte_t *utf8_str_p = re_ctx.saved_p[i].buf_p + re_ctx.saved_p[i].buf_pos.offset; + capture_str_p = ecma_new_ecma_string_from_utf8 (utf8_str_p, capture_str_len); } else { diff --git a/jerry-core/ecma/operations/ecma-regexp-object.h b/jerry-core/ecma/operations/ecma-regexp-object.h index 3a0eec29b..7293ee811 100644 --- a/jerry-core/ecma/operations/ecma-regexp-object.h +++ b/jerry-core/ecma/operations/ecma-regexp-object.h @@ -44,7 +44,7 @@ */ typedef struct { - const lit_utf8_byte_t **saved_p; /**< saved result string pointers, ECMA 262 v5, 15.10.2.1, State */ + lit_utf8_iterator_t *saved_p; /**< saved result string pointers, ECMA 262 v5, 15.10.2.1, State */ const lit_utf8_byte_t *input_start_p; /**< start of input pattern string */ const lit_utf8_byte_t *input_end_p; /**< end of input pattern string */ uint32_t match_limit; /**< matching limit counter */ @@ -61,8 +61,7 @@ ecma_op_create_regexp_object (ecma_string_t *pattern_p, ecma_string_t *flags_str extern ecma_completion_value_t ecma_regexp_exec_helper (ecma_object_t *obj_p, re_bytecode_t *bc_p, - const lit_utf8_byte_t *str_p, - lit_utf8_size_t str_size); + lit_utf8_iterator_t *iter_p); /** * @} diff --git a/jerry-core/parser/regexp/re-compiler.cpp b/jerry-core/parser/regexp/re-compiler.cpp index 92efca5c2..9c924c88d 100644 --- a/jerry-core/parser/regexp/re-compiler.cpp +++ 
b/jerry-core/parser/regexp/re-compiler.cpp @@ -554,14 +554,20 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); break; } + case RE_TOK_DIGIT: + case RE_TOK_NOT_DIGIT: + case RE_TOK_WHITE: + case RE_TOK_NOT_WHITE: + case RE_TOK_WORD_CHAR: + case RE_TOK_NOT_WORD_CHAR: case RE_TOK_START_CHAR_CLASS: case RE_TOK_START_INV_CHAR_CLASS: { JERRY_DDLOG ("Compile a character class\n"); re_append_opcode (bc_ctx_p, - re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS - ? RE_OP_CHAR_CLASS - : RE_OP_INV_CHAR_CLASS); + re_ctx_p->current_token.type == RE_TOK_START_INV_CHAR_CLASS + ? RE_OP_INV_CHAR_CLASS + : RE_OP_CHAR_CLASS); uint32_t offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); ECMA_TRY_CATCH (empty, @@ -578,6 +584,11 @@ re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context } ECMA_FINALIZE (empty); + + if (ecma_is_completion_value_throw (ret_value)) + { + return ret_value; /* error */ + } break; } case RE_TOK_END_GROUP: @@ -648,15 +659,13 @@ re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */ re_ctx.bytecode_ctx_p = &bc_ctx; lit_utf8_size_t pattern_str_size = ecma_string_get_size (pattern_str_p); - MEM_DEFINE_LOCAL_ARRAY (pattern_start_p, pattern_str_size + 1, lit_utf8_byte_t); + MEM_DEFINE_LOCAL_ARRAY (pattern_start_p, pattern_str_size, lit_utf8_byte_t); ecma_string_to_utf8_string (pattern_str_p, pattern_start_p, (ssize_t) pattern_str_size); - FIXME ("Update regexp compiler so that zero symbol is not needed."); - pattern_start_p[pattern_str_size] = LIT_BYTE_NULL; + lit_utf8_iterator_t iter = lit_utf8_iterator_create (pattern_start_p, pattern_str_size); re_parser_ctx_t parser_ctx; - parser_ctx.pattern_start_p = pattern_start_p; - parser_ctx.current_char_p = pattern_start_p; + parser_ctx.iter = iter; parser_ctx.num_of_groups = -1; re_ctx.parser_ctx_p = &parser_ctx; diff --git 
a/jerry-core/parser/regexp/re-parser.cpp b/jerry-core/parser/regexp/re-parser.cpp index 722a6f0da..5287d7d97 100644 --- a/jerry-core/parser/regexp/re-parser.cpp +++ b/jerry-core/parser/regexp/re-parser.cpp @@ -19,31 +19,36 @@ #include "ecma-try-catch-macro.h" #include "jrt-libc-includes.h" #include "lit-char-helpers.h" +#include "re-compiler.h" #include "re-parser.h" #ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN -/* FIXME: change it, when unicode support would be implemented */ -#define RE_LOOKUP(str_p, lookup) ((lit_zt_utf8_string_size (str_p) > (lookup)) \ - ? (ecma_char_t) str_p[lookup] \ - : (ecma_char_t) '\0') - -/* FIXME: change it, when unicode support would be implemented */ -#define RE_ADVANCE(str_p, advance) do { str_p += advance; } while (0) - /** - * Get next input character - * - * @return ecma_char_t - */ + * Lookup a character in the input string. + * + * @return unicode codeunit + */ static ecma_char_t -re_get_ecma_char (lit_utf8_byte_t **char_p) /**< pointer of input string */ +re_lookup (lit_utf8_iterator_t iter, /**< input string iterator */ + uint32_t lookup) /**< size of lookup */ { - /* FIXME: change to string iterator with unicode support, when it would be implemented */ - ecma_char_t ch = **char_p; - RE_ADVANCE (*char_p, 1); + ecma_char_t ch = 0; + for (uint32_t i = 0; i <= lookup; i++) + { + if (!lit_utf8_iterator_is_eos (&iter)) + { + ch = lit_utf8_iterator_read_next (&iter); + } + else + { + ch = 0; + break; + } + } + return ch; -} /* re_get_ecma_char */ +} /* re_lookup */ /** * Parse RegExp iterators @@ -52,24 +57,24 @@ re_get_ecma_char (lit_utf8_byte_t **char_p) /**< pointer of input string */ * Returned value must be freed with ecma_free_completion_value */ static ecma_completion_value_t -re_parse_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ +re_parse_iterator (lit_utf8_iterator_t iter, /**< RegExp pattern */ re_token_t *re_token_p, /**< output token */ uint32_t lookup, /**< size of lookup */ uint32_t 
*advance_p) /**< output length of current advance */ { ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); - ecma_char_t ch0 = RE_LOOKUP (pattern_p, lookup); - ecma_char_t ch1 = RE_LOOKUP (pattern_p, lookup + 1); + ecma_char_t ch0 = re_lookup (iter, lookup); + ecma_char_t ch1 = re_lookup (iter, lookup + 1); switch (ch0) { - case '?': + case LIT_CHAR_QUESTION: { re_token_p->qmin = 0; re_token_p->qmax = 1; - if (ch1 == '?') + if (ch1 == LIT_CHAR_QUESTION) { *advance_p = 2; re_token_p->greedy = false; @@ -81,12 +86,12 @@ re_parse_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ } break; } - case '*': + case LIT_CHAR_ASTERISK: { re_token_p->qmin = 0; re_token_p->qmax = RE_ITERATOR_INFINITE; - if (ch1 == '?') + if (ch1 == LIT_CHAR_QUESTION) { *advance_p = 2; re_token_p->greedy = false; @@ -98,12 +103,12 @@ re_parse_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ } break; } - case '+': + case LIT_CHAR_PLUS: { re_token_p->qmin = 1; re_token_p->qmax = RE_ITERATOR_INFINITE; - if (ch1 == '?') + if (ch1 == LIT_CHAR_QUESTION) { *advance_p = 2; re_token_p->greedy = false; @@ -115,7 +120,7 @@ re_parse_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ } break; } - case '{': + case LIT_CHAR_LEFT_BRACE: { uint32_t qmin = 0; uint32_t qmax = RE_ITERATOR_INFINITE; @@ -124,7 +129,7 @@ re_parse_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ while (true) { (*advance_p)++; - ch1 = RE_LOOKUP (pattern_p, lookup + *advance_p); + ch1 = re_lookup (iter, lookup + *advance_p); if (lit_char_is_decimal_digit (ch1)) { @@ -136,14 +141,14 @@ re_parse_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ digits++; qmin = qmin * 10 + lit_char_hex_to_int (ch1); } - else if (ch1 == ',') + else if (ch1 == LIT_CHAR_COMMA) { if (qmax != RE_ITERATOR_INFINITE) { ret_value = ecma_raise_syntax_error ("RegExp quantifier error: double comma."); return ret_value; } - if ((RE_LOOKUP (pattern_p, lookup + *advance_p + 1)) == '}') + if 
((re_lookup (iter, lookup + *advance_p + 1)) == LIT_CHAR_RIGHT_BRACE) { if (digits == 0) { @@ -160,7 +165,7 @@ re_parse_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ qmin = 0; digits = 0; } - else if (ch1 == '}') + else if (ch1 == LIT_CHAR_RIGHT_BRACE) { if (digits == 0) { @@ -189,7 +194,7 @@ re_parse_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ } } - if ((RE_LOOKUP (pattern_p, lookup + *advance_p)) == '?') + if ((re_lookup (iter, lookup + *advance_p)) == LIT_CHAR_QUESTION) { re_token_p->greedy = false; *advance_p += 1; @@ -228,29 +233,28 @@ re_parse_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ static void re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser context */ { - lit_utf8_byte_t *pattern_p = parser_ctx_p->pattern_start_p; - ecma_char_t ch1; int char_class_in = 0; parser_ctx_p->num_of_groups = 0; - ch1 = re_get_ecma_char (&pattern_p); + lit_utf8_iterator_t iter = lit_utf8_iterator_create (parser_ctx_p->iter.buf_p, + parser_ctx_p->iter.buf_size); - while (ch1 != LIT_CHAR_NULL) + while (!lit_utf8_iterator_is_eos (&iter)) { - ecma_char_t ch0 = ch1; - ch1 = re_get_ecma_char (&pattern_p); + ecma_char_t ch0 = lit_utf8_iterator_read_next (&iter); + switch (ch0) { - case '\\': + case LIT_CHAR_BACKSLASH: { - ch1 = re_get_ecma_char (&pattern_p); + lit_utf8_iterator_advance (&iter, 1); break; } - case '[': + case LIT_CHAR_LEFT_SQUARE: { char_class_in++; break; } - case ']': + case LIT_CHAR_RIGHT_SQUARE: { if (!char_class_in) { @@ -258,9 +262,11 @@ re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser contex } break; } - case '(': + case LIT_CHAR_LEFT_PAREN: { - if (ch1 != '?' 
&& !char_class_in) + if (!lit_utf8_iterator_is_eos (&iter) + && lit_utf8_iterator_peek_next (&iter) != LIT_CHAR_QUESTION + && !char_class_in) { parser_ctx_p->num_of_groups++; } @@ -285,18 +291,24 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */ re_token_t *out_token_p) /**< output token */ { ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); - lit_utf8_byte_t **pattern_p = &(parser_ctx_p->current_char_p); + re_token_type_t token_type = ((re_compiler_ctx_t *) re_ctx_p)->current_token.type; out_token_p->qmax = out_token_p->qmin = 1; - ecma_char_t start = RE_CHAR_UNDEF; + uint32_t start = RE_CHAR_UNDEF; bool is_range = false; parser_ctx_p->num_of_classes = 0; do { - ecma_char_t ch = re_get_ecma_char (pattern_p); + if (lit_utf8_iterator_is_eos (&(parser_ctx_p->iter))) + { + ret_value = ecma_raise_syntax_error ("invalid character class"); + return ret_value; + } - if (ch == ']') + uint32_t ch = lit_utf8_iterator_read_next (&(parser_ctx_p->iter)); + + if (ch == LIT_CHAR_RIGHT_SQUARE) { if (start != RE_CHAR_UNDEF) { @@ -304,129 +316,171 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */ } break; } - else if (ch == '-') + else if (ch == LIT_CHAR_MINUS) { - if (start != RE_CHAR_UNDEF && !is_range && RE_LOOKUP (*pattern_p, 0) != ']') + if (start != RE_CHAR_UNDEF && !is_range && re_lookup (parser_ctx_p->iter, 0) != LIT_CHAR_RIGHT_SQUARE) { is_range = true; continue; } } - else if (ch == '\\') + else if (ch == LIT_CHAR_BACKSLASH) { - ch = re_get_ecma_char (pattern_p); + if (lit_utf8_iterator_is_eos (&(parser_ctx_p->iter))) + { + ret_value = ecma_raise_syntax_error ("invalid character class"); + return ret_value; + } - if (ch == 'b') + ch = lit_utf8_iterator_read_next (&(parser_ctx_p->iter)); + + if (ch == LIT_CHAR_LOWERCASE_B) { - ch = RE_CONTROL_CHAR_BEL; + ch = LIT_CHAR_BS; } - else if (ch == 'f') + else if (ch == LIT_CHAR_LOWERCASE_F) { - ch = RE_CONTROL_CHAR_FF; + ch = LIT_CHAR_FF; } - else 
if (ch == 'n') + else if (ch == LIT_CHAR_LOWERCASE_N) { - ch = RE_CONTROL_CHAR_EOL; + ch = LIT_CHAR_LF; } - else if (ch == 't') + else if (ch == LIT_CHAR_LOWERCASE_T) { - ch = RE_CONTROL_CHAR_TAB; + ch = LIT_CHAR_TAB; } - else if (ch == 'r') + else if (ch == LIT_CHAR_LOWERCASE_R) { - ch = RE_CONTROL_CHAR_CR; + ch = LIT_CHAR_CR; } - else if (ch == 'v') + else if (ch == LIT_CHAR_LOWERCASE_V) { - ch = RE_CONTROL_CHAR_VT; + ch = LIT_CHAR_VTAB; } - else if (ch == 'c') + else if (ch == LIT_CHAR_LOWERCASE_C) { - ch = re_get_ecma_char (pattern_p); - if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) + if (lit_utf8_iterator_is_eos (&(parser_ctx_p->iter))) { - ch = (ch % 32); - } - else - { - ret_value = ecma_raise_syntax_error ("invalid regexp control escape"); + ret_value = ecma_raise_syntax_error ("decode error"); return ret_value; } + + if (lit_utf8_iterator_is_eos (&(parser_ctx_p->iter))) + { + ret_value = ecma_raise_syntax_error ("invalid character class"); + return ret_value; + } + + ch = lit_utf8_iterator_read_next (&(parser_ctx_p->iter)); + + if ((ch >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END) + || (ch >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)) + { + /* See ECMA-262 v5, 15.10.2.10 (Point 3) */ + ch = (ch % 32); + } } - else if (ch == 'x') + else if (ch == LIT_CHAR_LOWERCASE_X) { - /* FIXME: get unicode char from hex-digits */ - /* ch = ...; */ + lit_code_point_t code_point; + const lit_utf8_byte_t *hex_start = parser_ctx_p->iter.buf_p + parser_ctx_p->iter.buf_pos.offset; + + if (!lit_read_code_point_from_hex (hex_start, 2, &code_point)) + { + ret_value = ecma_raise_syntax_error ("decode error"); + return ret_value; + } + + lit_utf8_iterator_advance (&(parser_ctx_p->iter), 2); + + append_char_class (re_ctx_p, code_point, code_point); } - else if (ch == 'u') + else if (ch == LIT_CHAR_LOWERCASE_U) { - /* FIXME: get unicode char from digits */ - /* ch = ...; */ + 
lit_code_point_t code_point; + const lit_utf8_byte_t *hex_start = parser_ctx_p->iter.buf_p + parser_ctx_p->iter.buf_pos.offset; + + if (!lit_read_code_point_from_hex (hex_start, 4, &code_point)) + { + ret_value = ecma_raise_syntax_error ("decode error"); + return ret_value; + } + + lit_utf8_iterator_advance (&(parser_ctx_p->iter), 4); + + append_char_class (re_ctx_p, code_point, code_point); } - else if (ch == 'd') + else if (ch == LIT_CHAR_LOWERCASE_D) { - /* append digits from '0' to '9'. */ - append_char_class (re_ctx_p, 0x0030UL, 0x0039UL); + /* See ECMA-262 v5, 15.10.2.12 */ + append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_BEGIN, LIT_CHAR_ASCII_DIGITS_END); ch = RE_CHAR_UNDEF; } - else if (ch == 'D') + else if (ch == LIT_CHAR_UPPERCASE_D) { - append_char_class (re_ctx_p, 0x0000UL, 0x002FUL); - append_char_class (re_ctx_p, 0x003AUL, 0xFFFFUL); + /* See ECMA-262 v5, 15.10.2.12 */ + append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_ASCII_DIGITS_BEGIN - 1); + append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_END + 1, LIT_UTF16_CODE_UNIT_MAX); ch = RE_CHAR_UNDEF; } - else if (ch == 's') + else if (ch == LIT_CHAR_LOWERCASE_S) { - append_char_class (re_ctx_p, 0x0009UL, 0x000DUL); - append_char_class (re_ctx_p, 0x0020UL, 0x0020UL); - append_char_class (re_ctx_p, 0x00A0UL, 0x00A0UL); - append_char_class (re_ctx_p, 0x1680UL, 0x1680UL); - append_char_class (re_ctx_p, 0x180EUL, 0x180EUL); - append_char_class (re_ctx_p, 0x2000UL, 0x200AUL); - append_char_class (re_ctx_p, 0x2028UL, 0x2029UL); - append_char_class (re_ctx_p, 0x202FUL, 0x202FUL); - append_char_class (re_ctx_p, 0x205FUL, 0x205FUL); - append_char_class (re_ctx_p, 0x3000UL, 0x3000UL); - append_char_class (re_ctx_p, 0xFEFFUL, 0xFEFFUL); + /* See ECMA-262 v5, 15.10.2.12 */ + append_char_class (re_ctx_p, LIT_CHAR_TAB, LIT_CHAR_CR); + append_char_class (re_ctx_p, LIT_CHAR_SP, LIT_CHAR_SP); + append_char_class (re_ctx_p, LIT_CHAR_NBSP, LIT_CHAR_NBSP); + append_char_class (re_ctx_p, 0x1680UL, 0x1680UL); /* 
Ogham Space Mark */ + append_char_class (re_ctx_p, 0x180EUL, 0x180EUL); /* Mongolian Vowel Separator */ + append_char_class (re_ctx_p, 0x2000UL, 0x200AUL); /* En Quad - Hair Space */ + append_char_class (re_ctx_p, LIT_CHAR_LS, LIT_CHAR_PS); + append_char_class (re_ctx_p, 0x202FUL, 0x202FUL); /* Narrow No-Break Space */ + append_char_class (re_ctx_p, 0x205FUL, 0x205FUL); /* Medium Mathematical Space */ + append_char_class (re_ctx_p, 0x3000UL, 0x3000UL); /* Ideographic Space */ + append_char_class (re_ctx_p, LIT_CHAR_BOM, LIT_CHAR_BOM); ch = RE_CHAR_UNDEF; } - else if (ch == 'S') + else if (ch == LIT_CHAR_UPPERCASE_S) { - append_char_class (re_ctx_p, 0x0000UL, 0x0008UL); - append_char_class (re_ctx_p, 0x000EUL, 0x001FUL); - append_char_class (re_ctx_p, 0x0021UL, 0x009FUL); - append_char_class (re_ctx_p, 0x00A1UL, 0x167FUL); + /* See ECMA-262 v5, 15.10.2.12 */ + append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_TAB - 1); + append_char_class (re_ctx_p, LIT_CHAR_CR + 1, LIT_CHAR_SP - 1); + append_char_class (re_ctx_p, LIT_CHAR_SP + 1, LIT_CHAR_NBSP - 1); + append_char_class (re_ctx_p, LIT_CHAR_NBSP + 1, 0x167FUL); append_char_class (re_ctx_p, 0x1681UL, 0x180DUL); append_char_class (re_ctx_p, 0x180FUL, 0x1FFFUL); - append_char_class (re_ctx_p, 0x200BUL, 0x2027UL); - append_char_class (re_ctx_p, 0x202AUL, 0x202EUL); + append_char_class (re_ctx_p, 0x200BUL, LIT_CHAR_LS - 1); + append_char_class (re_ctx_p, LIT_CHAR_PS + 1, 0x202EUL); append_char_class (re_ctx_p, 0x2030UL, 0x205EUL); append_char_class (re_ctx_p, 0x2060UL, 0x2FFFUL); - append_char_class (re_ctx_p, 0x3001UL, 0xFEFEUL); - append_char_class (re_ctx_p, 0xFF00UL, 0xFFFFUL); + append_char_class (re_ctx_p, 0x3001UL, LIT_CHAR_BOM - 1); + append_char_class (re_ctx_p, LIT_CHAR_BOM + 1, LIT_UTF16_CODE_UNIT_MAX); ch = RE_CHAR_UNDEF; } - else if (ch == 'w') + else if (ch == LIT_CHAR_LOWERCASE_W) { - append_char_class (re_ctx_p, 0x0030UL, 0x0039UL); - append_char_class (re_ctx_p, 0x0041UL, 0x005AUL); - append_char_class 
(re_ctx_p, 0x005FUL, 0x005FUL); - append_char_class (re_ctx_p, 0x0061UL, 0x007AUL); + /* See ECMA-262 v5, 15.10.2.12 */ + append_char_class (re_ctx_p, LIT_CHAR_0, LIT_CHAR_9); + append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_A, LIT_CHAR_UPPERCASE_Z); + append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE, LIT_CHAR_UNDERSCORE); + append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_A, LIT_CHAR_LOWERCASE_Z); ch = RE_CHAR_UNDEF; } - else if (ch == 'W') + else if (ch == LIT_CHAR_UPPERCASE_W) { - append_char_class (re_ctx_p, 0x0000UL, 0x002FUL); - append_char_class (re_ctx_p, 0x003AUL, 0x0040UL); - append_char_class (re_ctx_p, 0x005BUL, 0x005EUL); - append_char_class (re_ctx_p, 0x0060UL, 0x0060UL); - append_char_class (re_ctx_p, 0x007BUL, 0xFFFFUL); + /* See ECMA-262 v5, 15.10.2.12 */ + append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_0 - 1); + append_char_class (re_ctx_p, LIT_CHAR_9 + 1, LIT_CHAR_UPPERCASE_A - 1); + append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_Z + 1, LIT_CHAR_UNDERSCORE - 1); + append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE + 1, LIT_CHAR_LOWERCASE_A - 1); + append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_Z + 1, LIT_UTF16_CODE_UNIT_MAX); ch = RE_CHAR_UNDEF; } - else if (lit_char_is_decimal_digit (ch)) + else if (ch <= LIT_UTF16_CODE_UNIT_MAX + && lit_char_is_decimal_digit ((ecma_char_t) ch)) { if (ch != LIT_CHAR_0 - || lit_char_is_decimal_digit (RE_LOOKUP (*pattern_p, 1))) + || lit_char_is_decimal_digit (re_lookup (parser_ctx_p->iter, 1))) { /* FIXME: octal support */ } @@ -485,16 +539,16 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */ } } } - while (true); + while (token_type == RE_TOK_START_CHAR_CLASS || token_type == RE_TOK_START_INV_CHAR_CLASS); uint32_t advance = 0; ECMA_TRY_CATCH (empty, - re_parse_iterator (parser_ctx_p->current_char_p, + re_parse_iterator (parser_ctx_p->iter, out_token_p, 0, &advance), ret_value); - RE_ADVANCE (parser_ctx_p->current_char_p, advance); + lit_utf8_iterator_advance 
(&(parser_ctx_p->iter), advance); ECMA_FINALIZE (empty); return ret_value; @@ -511,33 +565,40 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * re_token_t *out_token_p) /**< output token */ { ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); + + if (lit_utf8_iterator_is_eos (&(parser_ctx_p->iter))) + { + out_token_p->type = RE_TOK_EOF; + return ret_value; + } + uint32_t advance = 0; - ecma_char_t ch0 = *(parser_ctx_p->current_char_p); + ecma_char_t ch0 = lit_utf8_iterator_peek_next (&(parser_ctx_p->iter)); switch (ch0) { - case '|': + case LIT_CHAR_VLINE: { advance = 1; out_token_p->type = RE_TOK_ALTERNATIVE; break; } - case '^': + case LIT_CHAR_CIRCUMFLEX: { advance = 1; out_token_p->type = RE_TOK_ASSERT_START; break; } - case '$': + case LIT_CHAR_DOLLAR_SIGN: { advance = 1; out_token_p->type = RE_TOK_ASSERT_END; break; } - case '.': + case LIT_CHAR_DOT: { ECMA_TRY_CATCH (empty, - re_parse_iterator (parser_ctx_p->current_char_p, + re_parse_iterator (parser_ctx_p->iter, out_token_p, 1, &advance), @@ -547,118 +608,138 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * ECMA_FINALIZE (empty); break; } - case '\\': + case LIT_CHAR_BACKSLASH: { - advance = 2; out_token_p->type = RE_TOK_CHAR; - ecma_char_t ch1 = RE_LOOKUP (parser_ctx_p->current_char_p, 1); + ecma_char_t ch1 = re_lookup (parser_ctx_p->iter, 1); - if (ch1 == 'b') + if (ch1 == LIT_CHAR_LOWERCASE_B) { + advance = 2; out_token_p->type = RE_TOK_ASSERT_WORD_BOUNDARY; } - else if (ch1 == 'B') + else if (ch1 == LIT_CHAR_UPPERCASE_B) { + advance = 2; out_token_p->type = RE_TOK_ASSERT_NOT_WORD_BOUNDARY; } - else if (ch1 == 'f') + else if (ch1 == LIT_CHAR_LOWERCASE_F) { - out_token_p->value = RE_CONTROL_CHAR_FF; + out_token_p->value = LIT_CHAR_FF; } - else if (ch1 == 'n') + else if (ch1 == LIT_CHAR_LOWERCASE_N) { - out_token_p->value = RE_CONTROL_CHAR_EOL; + out_token_p->value = LIT_CHAR_LF; } - else if (ch1 == 't') + else if 
(ch1 == LIT_CHAR_LOWERCASE_T) { - out_token_p->value = RE_CONTROL_CHAR_TAB; + out_token_p->value = LIT_CHAR_TAB; } - else if (ch1 == 'r') + else if (ch1 == LIT_CHAR_LOWERCASE_R) { - out_token_p->value = RE_CONTROL_CHAR_CR; + out_token_p->value = LIT_CHAR_CR; } - else if (ch1 == 'v') + else if (ch1 == LIT_CHAR_LOWERCASE_V) { - out_token_p->value = RE_CONTROL_CHAR_VT; + out_token_p->value = LIT_CHAR_VTAB; } - else if (ch1 == 'c') + else if (ch1 == LIT_CHAR_LOWERCASE_C) { - ecma_char_t ch2 = RE_LOOKUP (parser_ctx_p->current_char_p, 2); + lit_utf8_iterator_advance (&(parser_ctx_p->iter), 2); - if ((ch2 >= 'A' && ch2 <= 'Z') || (ch2 >= 'a' && ch2 <= 'z')) + if (lit_utf8_iterator_is_eos (&(parser_ctx_p->iter))) + { + ret_value = ecma_raise_syntax_error ("invalid character class"); + return ret_value; + } + + ecma_char_t ch2 = lit_utf8_iterator_read_next (&(parser_ctx_p->iter)); + + if ((ch2 >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && ch2 <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END) + || (ch2 >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && ch2 <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)) { - advance = 3; out_token_p->type = RE_TOK_CHAR; out_token_p->value = (ch2 % 32); } - else + } + else if (ch1 == LIT_CHAR_LOWERCASE_X + && lit_char_is_hex_digit (re_lookup (parser_ctx_p->iter, 2)) + && lit_char_is_hex_digit (re_lookup (parser_ctx_p->iter, 3))) + { + lit_utf8_iterator_advance (&(parser_ctx_p->iter), 2); + + lit_code_point_t code_point; + const lit_utf8_byte_t *hex_start = parser_ctx_p->iter.buf_p + parser_ctx_p->iter.buf_pos.offset; + + if (!lit_read_code_point_from_hex (hex_start, 2, &code_point)) { - ret_value = ecma_raise_syntax_error ("invalid regexp control escape"); - break; + ret_value = ecma_raise_syntax_error ("decode error"); + return ret_value; } - } - else if (ch1 == 'x' - && lit_char_is_hex_digit (RE_LOOKUP (parser_ctx_p->current_char_p, 2)) - && lit_char_is_hex_digit (RE_LOOKUP (parser_ctx_p->current_char_p, 3))) - { - advance = 4; + + lit_utf8_iterator_advance 
(&(parser_ctx_p->iter), 2); + out_token_p->type = RE_TOK_CHAR; - /* FIXME: get unicode char from hex-digits */ - /* result.value = ...; */ + out_token_p->value = code_point; } - else if (ch1 == 'u' - && lit_char_is_hex_digit (RE_LOOKUP (parser_ctx_p->current_char_p, 2)) - && lit_char_is_hex_digit (RE_LOOKUP (parser_ctx_p->current_char_p, 3)) - && lit_char_is_hex_digit (RE_LOOKUP (parser_ctx_p->current_char_p, 4)) - && lit_char_is_hex_digit (RE_LOOKUP (parser_ctx_p->current_char_p, 5))) + else if (ch1 == LIT_CHAR_LOWERCASE_U + && lit_char_is_hex_digit (re_lookup (parser_ctx_p->iter, 2)) + && lit_char_is_hex_digit (re_lookup (parser_ctx_p->iter, 3)) + && lit_char_is_hex_digit (re_lookup (parser_ctx_p->iter, 4)) + && lit_char_is_hex_digit (re_lookup (parser_ctx_p->iter, 5))) { - advance = 4; + lit_utf8_iterator_advance (&(parser_ctx_p->iter), 2); + + lit_code_point_t code_point; + const lit_utf8_byte_t *hex_start = parser_ctx_p->iter.buf_p + parser_ctx_p->iter.buf_pos.offset; + + if (!lit_read_code_point_from_hex (hex_start, 4, &code_point)) + { + ret_value = ecma_raise_syntax_error ("decode error"); + return ret_value; + } + + lit_utf8_iterator_advance (&(parser_ctx_p->iter), 4); + out_token_p->type = RE_TOK_CHAR; - /* FIXME: get unicode char from digits */ - /* result.value = ...; */ + out_token_p->value = code_point; } - else if (ch1 == 'd') + else if (ch1 == LIT_CHAR_LOWERCASE_D) { - advance = 2; out_token_p->type = RE_TOK_DIGIT; } - else if (ch1 == 'D') + else if (ch1 == LIT_CHAR_UPPERCASE_D) { - advance = 2; out_token_p->type = RE_TOK_NOT_DIGIT; } - else if (ch1 == 's') + else if (ch1 == LIT_CHAR_LOWERCASE_S) { - advance = 2; out_token_p->type = RE_TOK_WHITE; } - else if (ch1 == 'S') + else if (ch1 == LIT_CHAR_UPPERCASE_S) { - advance = 2; out_token_p->type = RE_TOK_NOT_WHITE; } - else if (ch1 == 'w') + else if (ch1 == LIT_CHAR_LOWERCASE_W) { - advance = 2; out_token_p->type = RE_TOK_WORD_CHAR; } - else if (ch1 == 'W') + else if (ch1 == LIT_CHAR_UPPERCASE_W) { - 
advance = 2; out_token_p->type = RE_TOK_NOT_WORD_CHAR; } else if (lit_char_is_decimal_digit (ch1)) { - if (ch1 == '0') + if (ch1 == LIT_CHAR_0) { - if (lit_char_is_decimal_digit (RE_LOOKUP (parser_ctx_p->current_char_p, 2))) + if (lit_char_is_decimal_digit (re_lookup (parser_ctx_p->iter, 2))) { ret_value = ecma_raise_syntax_error ("RegExp escape pattern error."); break; } advance = 2; - out_token_p->value = RE_CONTROL_CHAR_NUL; + out_token_p->value = LIT_UNICODE_CODE_POINT_NULL; } else { @@ -671,7 +752,6 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * { uint32_t number = 0; int index = 0; - advance = 0; do { @@ -682,7 +762,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * } advance++; - ecma_char_t digit = RE_LOOKUP (parser_ctx_p->current_char_p, + ecma_char_t digit = re_lookup (parser_ctx_p->iter, advance); if (!lit_char_is_decimal_digit (digit)) { @@ -708,12 +788,13 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * } else { + advance = 2; out_token_p->value = ch1; } uint32_t iter_adv = 0; ECMA_TRY_CATCH (empty, - re_parse_iterator (parser_ctx_p->current_char_p, + re_parse_iterator (parser_ctx_p->iter, out_token_p, advance, &iter_adv), @@ -722,25 +803,25 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * ECMA_FINALIZE (empty); break; } - case '(': + case LIT_CHAR_LEFT_PAREN: { - if (RE_LOOKUP (parser_ctx_p->current_char_p, 1) == '?') + if (re_lookup (parser_ctx_p->iter, 1) == LIT_CHAR_QUESTION) { - ecma_char_t ch2 = RE_LOOKUP (parser_ctx_p->current_char_p, 2); + ecma_char_t ch2 = re_lookup (parser_ctx_p->iter, 2); - if (ch2 == '=') + if (ch2 == LIT_CHAR_EQUALS) { /* (?= */ advance = 3; out_token_p->type = RE_TOK_ASSERT_START_POS_LOOKAHEAD; } - else if (ch2 == '!') + else if (ch2 == LIT_CHAR_EXCLAMATION) { /* (?! 
*/ advance = 3; out_token_p->type = RE_TOK_ASSERT_START_NEG_LOOKAHEAD; } - else if (ch2 == ':') + else if (ch2 == LIT_CHAR_COLON) { /* (?: */ advance = 3; @@ -755,10 +836,10 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * } break; } - case ')': + case LIT_CHAR_RIGHT_PAREN: { ECMA_TRY_CATCH (empty, - re_parse_iterator (parser_ctx_p->current_char_p, + re_parse_iterator (parser_ctx_p->iter, out_token_p, 1, &advance), @@ -768,28 +849,28 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * ECMA_FINALIZE (empty); break; } - case '[': + case LIT_CHAR_LEFT_SQUARE: { advance = 1; out_token_p->type = RE_TOK_START_CHAR_CLASS; - if (RE_LOOKUP (parser_ctx_p->current_char_p, 1) == '^') + if (re_lookup (parser_ctx_p->iter, 1) == LIT_CHAR_CIRCUMFLEX) { advance = 2; out_token_p->type = RE_TOK_START_INV_CHAR_CLASS; } break; } - case ']': - case '}': - case '?': - case '*': - case '+': - case '{': + case LIT_CHAR_RIGHT_SQUARE: + case LIT_CHAR_RIGHT_BRACE: + case LIT_CHAR_QUESTION: + case LIT_CHAR_ASTERISK: + case LIT_CHAR_PLUS: + case LIT_CHAR_LEFT_BRACE: { - JERRY_UNREACHABLE (); - break; + ret_value = ecma_raise_syntax_error ("Invalid RegExp token."); + return ret_value; } - case '\0': + case LIT_CHAR_NULL: { advance = 0; out_token_p->type = RE_TOK_EOF; @@ -798,7 +879,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * default: { ECMA_TRY_CATCH (empty, - re_parse_iterator (parser_ctx_p->current_char_p, + re_parse_iterator (parser_ctx_p->iter, out_token_p, 1, &advance), @@ -813,7 +894,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * if (ecma_is_completion_value_empty (ret_value)) { - RE_ADVANCE (parser_ctx_p->current_char_p, advance); + lit_utf8_iterator_advance (&(parser_ctx_p->iter), advance); } return ret_value; diff --git a/jerry-core/parser/regexp/re-parser.h b/jerry-core/parser/regexp/re-parser.h index 2f1f739f6..5f0b38c79 100644 --- 
a/jerry-core/parser/regexp/re-parser.h +++ b/jerry-core/parser/regexp/re-parser.h @@ -59,16 +59,10 @@ typedef enum */ #define RE_MAX_RE_DECESC_DIGITS 9 -/* FIXME: depends on unicode support */ -#define RE_CHAR_UNDEF ((ecma_char_t)-1) - -#define RE_CONTROL_CHAR_NUL 0x0000 /* \0 */ -#define RE_CONTROL_CHAR_BEL 0x0008 /* \b */ -#define RE_CONTROL_CHAR_TAB 0x0009 /* \t */ -#define RE_CONTROL_CHAR_EOL 0x000a /* \n */ -#define RE_CONTROL_CHAR_VT 0x000b /* \v */ -#define RE_CONTROL_CHAR_FF 0x000c /* \f */ -#define RE_CONTROL_CHAR_CR 0x000d /* \r */ +/** + * Undefined character (out of the range of the codeunit) + */ +#define RE_CHAR_UNDEF 0xFFFFFFFF /** * RegExp token type @@ -87,8 +81,7 @@ typedef struct */ typedef struct { - lit_utf8_byte_t *pattern_start_p; /**< start of input pattern string */ - lit_utf8_byte_t *current_char_p; /**< current character in input pattern */ + lit_utf8_iterator_t iter; /**< iterator of input pattern */ int num_of_groups; /**< number of groups */ uint32_t num_of_classes; /**< number of character classes */ } re_parser_ctx_t; diff --git a/tests/jerry/regexp-character-class.js b/tests/jerry/regexp-character-class.js index aaa744dee..b58d2e742 100644 --- a/tests/jerry/regexp-character-class.js +++ b/tests/jerry/regexp-character-class.js @@ -30,4 +30,87 @@ assert (r == "abcdefghjklmnopqrstuvwxyz"); r = new RegExp ("[A-Z]*").exec("abcdefghjklmnopqrstuvwxyz"); assert (r == ""); -// FIXME: Add more tescase when Unicode support is finished! 
+r = new RegExp ("[^a-z]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == ""); + +r = new RegExp ("[^A-Z]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == "abcdefghjklmnopqrstuvwxyz"); + +r = new RegExp ("\\d*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == ""); + +r = new RegExp ("\\D*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == "abcdefghjklmnopqrstuvwxyz"); + +r = new RegExp ("\\w*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == "abcdefghjklmnopqrstuvwxyz"); + +r = new RegExp ("\\W*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == ""); + +r = new RegExp ("\\s*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == ""); + +r = new RegExp ("\\S*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == "abcdefghjklmnopqrstuvwxyz"); + +r = new RegExp ("[\\d]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == ""); + +r = new RegExp ("[\\D]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == "abcdefghjklmnopqrstuvwxyz"); + +r = new RegExp ("[\\w]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == "abcdefghjklmnopqrstuvwxyz"); + +r = new RegExp ("[\\W]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == ""); + +r = new RegExp ("[\\s]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == ""); + +r = new RegExp ("[\\S]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == "abcdefghjklmnopqrstuvwxyz"); + +r = new RegExp ("[^\\d]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == "abcdefghjklmnopqrstuvwxyz"); + +r = new RegExp ("[^\\D]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == ""); + +r = new RegExp ("[^\\w]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == ""); + +r = new RegExp ("[^\\W]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == "abcdefghjklmnopqrstuvwxyz"); + +r = new RegExp ("[^\\s]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == "abcdefghjklmnopqrstuvwxyz"); + +r = new RegExp ("[^\\S]*").exec("abcdefghjklmnopqrstuvwxyz"); +assert (r == ""); + +r = new RegExp ("\\d*").exec("0123456789"); +assert (r == "0123456789"); + +try +{ + r = new 
RegExp("["); + assert (false); +} +catch (e) +{ + assert (e instanceof SyntaxError); + assert (e.message === "invalid character class"); +} + +try +{ + r = new RegExp("[\\"); + assert (false); +} +catch (e) +{ + assert (e instanceof SyntaxError); + assert (e.message === "invalid character class"); +}