From cf9d54545f57bd95353a2ed6dd7ac102ae6b347a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A1szl=C3=B3=20Lang=C3=B3?= Date: Fri, 10 Jul 2015 15:57:06 +0200 Subject: [PATCH] Style fixes for RegExp engine. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JerryScript-DCO-1.0-Signed-off-by: László Langó llango.u-szeged@partner.samsung.com --- .../ecma-builtin-regexp-prototype.inc.h | 1 + .../builtin-objects/ecma-builtin-regexp.inc.h | 1 + .../ecma/operations/ecma-regexp-object.cpp | 103 ++++-- .../ecma/operations/ecma-regexp-object.h | 32 +- jerry-core/parser/regexp/re-compiler.cpp | 324 +++++++++--------- jerry-core/parser/regexp/re-compiler.h | 108 +++--- jerry-core/parser/regexp/re-parser.cpp | 44 ++- jerry-core/parser/regexp/re-parser.h | 84 +++-- 8 files changed, 385 insertions(+), 312 deletions(-) diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.inc.h b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.inc.h index 1369eb156..623a4232e 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.inc.h +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp-prototype.inc.h @@ -33,6 +33,7 @@ /* Object identifier */ OBJECT_ID (ECMA_BUILTIN_ID_REGEXP_PROTOTYPE) +// ECMA-262 v5, 15.10.6.1 OBJECT_VALUE (LIT_MAGIC_STRING_CONSTRUCTOR, ecma_builtin_get (ECMA_BUILTIN_ID_REGEXP), ECMA_PROPERTY_WRITABLE, diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp.inc.h b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp.inc.h index 9e9088685..0aa366c46 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-regexp.inc.h +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-regexp.inc.h @@ -82,6 +82,7 @@ NUMBER_VALUE (LIT_MAGIC_STRING_LASTINDEX_UL, ECMA_PROPERTY_NOT_ENUMERABLE, ECMA_PROPERTY_NOT_CONFIGURABLE) +// ECMA-262 v5, 15.10.5 NUMBER_VALUE (LIT_MAGIC_STRING_LENGTH, 2, ECMA_PROPERTY_NOT_WRITABLE, diff --git a/jerry-core/ecma/operations/ecma-regexp-object.cpp 
b/jerry-core/ecma/operations/ecma-regexp-object.cpp index 5500094e4..e81312ddb 100644 --- a/jerry-core/ecma/operations/ecma-regexp-object.cpp +++ b/jerry-core/ecma/operations/ecma-regexp-object.cpp @@ -38,7 +38,7 @@ * @{ */ -/* +/** * RegExp results are stored in an array of string pointers. If N is the number * of groups then the length of the array is 2*N, because every group has a start * and end. We have to handle those pointers. @@ -54,7 +54,9 @@ #define RE_GLOBAL_START_IDX 0 #define RE_GLOBAL_END_IDX 1 -/* RegExp flags */ +/** + * RegExp flags + */ #define RE_FLAG_GLOBAL (1 << 0) /* ECMA-262 v5, 15.10.7.2 */ #define RE_FLAG_IGNORE_CASE (1 << 1) /* ECMA-262 v5, 15.10.7.3 */ #define RE_FLAG_MULTILINE (1 << 2) /* ECMA-262 v5, 15.10.7.4 */ @@ -78,8 +80,8 @@ re_parse_regexp_flags (ecma_string_t *flags_str_p, /**< Input string with flags MEM_DEFINE_LOCAL_ARRAY (flags_start_p, flags_str_size, lit_utf8_byte_t); ecma_string_to_utf8_string (flags_str_p, flags_start_p, (ssize_t) flags_str_size); - lit_utf8_byte_t *flags_char_p = flags_start_p; + while (flags_char_p < flags_start_p + flags_str_size && ecma_is_completion_value_empty (ret_value)) { @@ -140,8 +142,8 @@ ecma_op_create_regexp_object (ecma_string_t *pattern_p, /**< input pattern */ { JERRY_ASSERT (pattern_p != NULL); ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); - uint8_t flags = 0; + if (flags_str_p != NULL) { ECMA_TRY_CATCH (empty, re_parse_regexp_flags (flags_str_p, &flags), ret_value); @@ -233,43 +235,43 @@ ecma_op_create_regexp_object (ecma_string_t *pattern_p, /**< input pattern */ * Backtrack a unicode character */ static const lit_utf8_byte_t * -utf8_backtrack (const lit_utf8_byte_t *str_p) +re_utf8_backtrack (const lit_utf8_byte_t *str_p) { /* FIXME: change to string iterator with unicode support, when it would be implemented */ return --str_p; -} /* utf8_backtrack */ +} /* re_utf8_backtrack */ /** * Helper to get an input character and increase string pointer. 
*/ static ecma_char_t -get_input_char (const lit_utf8_byte_t **char_p) +re_get_input_char (const lit_utf8_byte_t **char_p) { /* FIXME: change to string iterator with unicode support, when it would be implemented */ const lit_utf8_byte_t ch = **char_p; (*char_p)++; return ch; -} /* get_input_char */ +} /* re_get_input_char */ /** * Helper to get current input character, won't increase string pointer. */ static ecma_char_t -lookup_input_char (const lit_utf8_byte_t *str_p) +re_lookup_input_char (const lit_utf8_byte_t *str_p) { /* FIXME: change to string iterator with unicode support, when it would be implemented */ return *str_p; -} /* lookup_input_char */ +} /* re_lookup_input_char */ /** * Helper to get previous input character, won't decrease string pointer. */ static ecma_char_t -lookup_prev_char (const lit_utf8_byte_t *str_p) +re_lookup_prev_char (const lit_utf8_byte_t *str_p) { /* FIXME: change to string iterator with unicode support, when it would be implemented */ return *(--str_p); -} /* lookup_prev_char */ +} /* re_lookup_prev_char */ /** * Recursive function for RegExp matching. Tests for a regular expression @@ -319,7 +321,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ case RE_OP_CHAR: { uint32_t ch1 = re_get_value (&bc_p); - uint32_t ch2 = get_input_char (&str_p); + uint32_t ch2 = re_get_input_char (&str_p); JERRY_DDLOG ("Character matching %d to %d: ", ch1, ch2); if (ch2 == '\0' || ch1 != ch2) @@ -333,8 +335,9 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } case RE_OP_PERIOD: { - uint32_t ch1 = get_input_char (&str_p); + uint32_t ch1 = re_get_input_char (&str_p); JERRY_DDLOG ("Period matching '.' 
to %d: ", ch1); + if (ch1 == '\n' || ch1 == '\0') { JERRY_DDLOG ("fail\n"); @@ -361,7 +364,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } - if (lit_char_is_line_terminator (lookup_prev_char (str_p))) + if (lit_char_is_line_terminator (re_lookup_prev_char (str_p))) { JERRY_DDLOG ("match\n"); break; @@ -388,7 +391,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } - if (lit_char_is_line_terminator (lookup_input_char (str_p))) + if (lit_char_is_line_terminator (re_lookup_input_char (str_p))) { JERRY_DDLOG ("match\n"); break; /* tail merge */ @@ -409,7 +412,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } else { - is_wordchar_left = lit_char_is_word_char (lookup_prev_char (str_p)); + is_wordchar_left = lit_char_is_word_char (re_lookup_prev_char (str_p)); } if (str_p >= re_ctx_p->input_end_p) @@ -418,7 +421,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } else { - is_wordchar_right = lit_char_is_word_char (lookup_input_char (str_p)); + is_wordchar_right = lit_char_is_word_char (re_lookup_input_char (str_p)); } if (op == RE_OP_ASSERT_WORD_BOUNDARY) @@ -460,6 +463,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ do { uint32_t offset = re_get_value (&bc_p); + if (!sub_str_p) { match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); @@ -492,6 +496,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ if (!ecma_is_completion_value_throw (match_value)) { re_ctx_p->recursion_depth--; + if (ecma_is_value_true (match_value)) { *res_p = sub_str_p; @@ -522,10 +527,11 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } 
- curr_ch = get_input_char (&str_p); + curr_ch = re_get_input_char (&str_p); num_of_ranges = re_get_value (&bc_p); is_match = false; + while (num_of_ranges) { uint32_t ch1, ch2; @@ -581,6 +587,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } sub_str_p = re_ctx_p->saved_p[backref_idx]; + while (sub_str_p < re_ctx_p->saved_p[backref_idx + 1]) { uint32_t ch1, ch2; @@ -592,8 +599,8 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } - ch1 = get_input_char (&sub_str_p); - ch2 = get_input_char (&str_p); + ch1 = re_get_input_char (&sub_str_p); + ch2 = re_get_input_char (&str_p); if (ch1 != ch2) { @@ -613,6 +620,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ JERRY_DDLOG ("Execute RE_OP_SAVE_AT_START\n"); old_start_p = re_ctx_p->saved_p[RE_GLOBAL_START_IDX]; re_ctx_p->saved_p[RE_GLOBAL_START_IDX] = str_p; + do { uint32_t offset = re_get_value (&bc_p); @@ -654,6 +662,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ uint32_t offset = re_get_value (&bc_p); JERRY_DDLOG ("Execute RE_OP_ALTERNATIVE"); bc_p += offset; + while (*bc_p == RE_OP_ALTERNATIVE) { JERRY_DDLOG (", jump: %d"); @@ -661,6 +670,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ offset = re_get_value (&bc_p); bc_p += offset; } + JERRY_DDLOG ("\n"); break; /* tail merge */ } @@ -695,13 +705,14 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ iter_idx = start_idx + (re_ctx_p->num_of_captures / 2) - 1; start_idx += re_ctx_p->num_of_captures; } - re_ctx_p->num_of_iterations[iter_idx] = 0; + re_ctx_p->num_of_iterations_p[iter_idx] = 0; /* Jump all over to the end of the END opcode. 
*/ bc_p += offset; /* Try to match after the close paren if zero is allowed */ ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + if (ecma_is_value_true (match_value)) { *res_p = sub_str_p; @@ -730,8 +741,8 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ const lit_utf8_byte_t *sub_str_p; re_bytecode_t *old_bc_p; re_bytecode_t *end_bc_p = NULL; - start_idx = re_get_value (&bc_p); + if (op != RE_OP_CAPTURE_GROUP_START && op != RE_OP_NON_CAPTURE_GROUP_START) { @@ -752,9 +763,9 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ start_idx += re_ctx_p->num_of_captures; } old_start_p = re_ctx_p->saved_p[start_idx]; - old_iteration_cnt = re_ctx_p->num_of_iterations[iter_idx]; + old_iteration_cnt = re_ctx_p->num_of_iterations_p[iter_idx]; re_ctx_p->saved_p[start_idx] = str_p; - re_ctx_p->num_of_iterations[iter_idx] = 0; + re_ctx_p->num_of_iterations_p[iter_idx] = 0; do { @@ -775,7 +786,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE); bc_p = old_bc_p; - re_ctx_p->num_of_iterations[iter_idx] = old_iteration_cnt; + re_ctx_p->num_of_iterations_p[iter_idx] = old_iteration_cnt; /* Try to match after the close paren if zero is allowed. 
*/ if (op == RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START @@ -783,6 +794,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ { JERRY_ASSERT (end_bc_p); ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, end_bc_p, str_p, &sub_str_p); + if (ecma_is_value_true (match_value)) { *res_p = sub_str_p; @@ -829,9 +841,10 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ end_idx += re_ctx_p->num_of_captures; } - re_ctx_p->num_of_iterations[iter_idx]++; - if (re_ctx_p->num_of_iterations[iter_idx] >= min - && re_ctx_p->num_of_iterations[iter_idx] <= max) + re_ctx_p->num_of_iterations_p[iter_idx]++; + + if (re_ctx_p->num_of_iterations_p[iter_idx] >= min + && re_ctx_p->num_of_iterations_p[iter_idx] <= max) { old_end_p = re_ctx_p->saved_p[end_idx]; re_ctx_p->saved_p[end_idx] = str_p; @@ -851,7 +864,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ re_ctx_p->saved_p[end_idx] = old_end_p; } - re_ctx_p->num_of_iterations[iter_idx]--; + re_ctx_p->num_of_iterations_p[iter_idx]--; bc_p = old_bc_p; /* If non-greedy fails and try to iterate... */ @@ -887,19 +900,20 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } /* Check the empty iteration if the minimum number of iterations is reached. */ - if (re_ctx_p->num_of_iterations[iter_idx] >= min + if (re_ctx_p->num_of_iterations_p[iter_idx] >= min && str_p == re_ctx_p->saved_p[start_idx]) { re_ctx_p->recursion_depth--; return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } - re_ctx_p->num_of_iterations[iter_idx]++; + + re_ctx_p->num_of_iterations_p[iter_idx]++; old_bc_p = bc_p; /* Save the bytecode end position of the END opcodes for matching after it. 
*/ old_end_p = re_ctx_p->saved_p[end_idx]; re_ctx_p->saved_p[end_idx] = str_p; - if (re_ctx_p->num_of_iterations[iter_idx] < max) + if (re_ctx_p->num_of_iterations_p[iter_idx] < max) { bc_p -= offset; offset = re_get_value (&bc_p); @@ -907,6 +921,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ old_start_p = re_ctx_p->saved_p[start_idx]; re_ctx_p->saved_p[start_idx] = str_p; ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + if (ecma_is_value_true (match_value)) { *res_p = sub_str_p; @@ -931,6 +946,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ re_ctx_p->saved_p[start_idx] = str_p; ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + if (ecma_is_value_true (match_value)) { *res_p = sub_str_p; @@ -947,11 +963,12 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } } - if (re_ctx_p->num_of_iterations[iter_idx] >= min - && re_ctx_p->num_of_iterations[iter_idx] <= max) + if (re_ctx_p->num_of_iterations_p[iter_idx] >= min + && re_ctx_p->num_of_iterations_p[iter_idx] <= max) { /* Try to match the rest of the bytecode. 
*/ ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, old_bc_p, str_p, &sub_str_p); + if (ecma_is_value_true (match_value)) { *res_p = sub_str_p; @@ -966,7 +983,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ /* restore if fails */ re_ctx_p->saved_p[end_idx] = old_end_p; - re_ctx_p->num_of_iterations[iter_idx]--; + re_ctx_p->num_of_iterations_p[iter_idx]--; re_ctx_p->recursion_depth--; return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */ } @@ -988,6 +1005,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ if (num_of_iter >= min) { ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p + offset, str_p, &sub_str_p); + if (ecma_is_value_true (match_value)) { *res_p = sub_str_p; @@ -1001,6 +1019,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ } ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); + if (!ecma_is_value_true (match_value)) { if (ecma_is_completion_value_throw (match_value)) @@ -1029,6 +1048,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ (unsigned long) min, (unsigned long) max, (long) offset); num_of_iter = 0; + while (num_of_iter < max) { ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p); @@ -1048,6 +1068,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ while (num_of_iter >= min) { ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p + offset, str_p, &sub_str_p); + if (ecma_is_value_true (match_value)) { *res_p = sub_str_p; @@ -1058,12 +1079,13 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */ { return match_value; } + if (num_of_iter == min) { break; } - str_p = utf8_backtrack (str_p); + str_p = re_utf8_backtrack (str_p); num_of_iter--; } re_ctx_p->recursion_depth--; @@ -1206,6 +1228,7 @@ ecma_regexp_exec_helper 
(ecma_object_t *obj_p, /**< RegExp object */ re_ctx.num_of_non_captures = re_get_value (&bc_p); MEM_DEFINE_LOCAL_ARRAY (saved_p, re_ctx.num_of_captures + re_ctx.num_of_non_captures, const lit_utf8_byte_t *); + for (uint32_t i = 0; i < re_ctx.num_of_captures + re_ctx.num_of_non_captures; i++) { saved_p[i] = NULL; @@ -1214,13 +1237,14 @@ ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */ uint32_t num_of_iter_length = (re_ctx.num_of_captures / 2) + (re_ctx.num_of_non_captures - 1); MEM_DEFINE_LOCAL_ARRAY (num_of_iter_p, num_of_iter_length, uint32_t); + for (uint32_t i = 0; i < num_of_iter_length; i++) { num_of_iter_p[i] = 0u; } bool is_match = false; - re_ctx.num_of_iterations = num_of_iter_p; + re_ctx.num_of_iterations_p = num_of_iter_p; int32_t index = 0; if (re_ctx.flags & RE_FLAG_GLOBAL) @@ -1236,6 +1260,7 @@ ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */ /* 2. Try to match */ const lit_utf8_byte_t *sub_str_p; + while (str_p && str_p <= re_ctx.input_end_p && ecma_is_completion_value_empty (ret_value)) { if (index < 0 || index > (int32_t) input_size) @@ -1254,11 +1279,13 @@ ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */ { sub_str_p = NULL; ECMA_TRY_CATCH (match_value, re_match_regexp (&re_ctx, bc_p, str_p, &sub_str_p), ret_value); + if (ecma_is_value_true (match_value)) { is_match = true; break; } + str_p++; index++; ECMA_FINALIZE (match_value); diff --git a/jerry-core/ecma/operations/ecma-regexp-object.h b/jerry-core/ecma/operations/ecma-regexp-object.h index 422a265a6..3a0eec29b 100644 --- a/jerry-core/ecma/operations/ecma-regexp-object.h +++ b/jerry-core/ecma/operations/ecma-regexp-object.h @@ -29,26 +29,30 @@ * @{ */ -#define RE_EXECUTE_RECURSION_LIMIT 1000 /* Limit of RegExp executor recursion depth */ -#define RE_EXECUTE_MATCH_LIMIT 10000 /* Limit of RegExp execetur matching steps */ +/** + * Limit of RegExp executor recursion depth + */ +#define RE_EXECUTE_RECURSION_LIMIT 1000 + +/** + * Limit of 
RegExp executor matching steps + */ +#define RE_EXECUTE_MATCH_LIMIT 10000 /** * RegExp executor context - * - * FIXME: - * Add comments with description of the structure members */ typedef struct { - const lit_utf8_byte_t **saved_p; - const lit_utf8_byte_t *input_start_p; - const lit_utf8_byte_t *input_end_p; - uint32_t match_limit; - uint32_t recursion_depth; - uint32_t num_of_captures; - uint32_t num_of_non_captures; - uint32_t *num_of_iterations; - uint8_t flags; + const lit_utf8_byte_t **saved_p; /**< saved result string pointers, ECMA 262 v5, 15.10.2.1, State */ + const lit_utf8_byte_t *input_start_p; /**< start of input pattern string */ + const lit_utf8_byte_t *input_end_p; /**< end of input pattern string */ + uint32_t match_limit; /**< matching limit counter */ + uint32_t recursion_depth; /**< recursion depth counter */ + uint32_t num_of_captures; /**< number of capture groups */ + uint32_t num_of_non_captures; /**< number of non-capture groups */ + uint32_t *num_of_iterations_p; /**< number of iterations */ + uint8_t flags; /**< RegExp flags */ } re_matcher_ctx_t; extern ecma_completion_value_t diff --git a/jerry-core/parser/regexp/re-compiler.cpp b/jerry-core/parser/regexp/re-compiler.cpp index 2cbf0f5c6..92efca5c2 100644 --- a/jerry-core/parser/regexp/re-compiler.cpp +++ b/jerry-core/parser/regexp/re-compiler.cpp @@ -25,26 +25,29 @@ #ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN /** - * FIXME: - * Add comments to macro definitions in the component + * Size of block of RegExp bytecode.
Used for allocation */ - #define REGEXP_BYTECODE_BLOCK_SIZE 256UL -#define BYTECODE_LEN(bc_ctx_p) ((uint32_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p)) - -void -regexp_dump_bytecode (re_bytecode_ctx_t *bc_ctx); /** - * FIXME: - * Add missing 're' prefixes to the component's external and internal interfaces + * Get length of bytecode */ +static uint32_t +re_get_bytecode_length (re_bytecode_ctx_t *bc_ctx_p) +{ + return ((uint32_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p)); +} /* re_get_bytecode_length */ + +void +re_dump_bytecode (re_bytecode_ctx_t *bc_ctx); /** * Realloc the bytecode container + * + * @return current position in RegExp bytecode */ static re_bytecode_t* -realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */ +re_realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */ { JERRY_ASSERT (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p >= 0); size_t old_size = static_cast (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p); @@ -70,112 +73,113 @@ realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode bc_ctx_p->current_p = new_block_start_p + current_ptr_offset; return bc_ctx_p->current_p; -} /* realloc_regexp_bytecode_block */ +} /* re_realloc_regexp_bytecode_block */ /** * Append a new bytecode to the and of the bytecode container */ static void -bytecode_list_append (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - re_bytecode_t *bytecode_p, /**< input bytecode */ - size_t length) /**< length of input */ +re_bytecode_list_append (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + re_bytecode_t *bytecode_p, /**< input bytecode */ + size_t length) /**< length of input */ { JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE); re_bytecode_t *current_p = bc_ctx_p->current_p; if (current_p + length > bc_ctx_p->block_end_p) { - current_p = realloc_regexp_bytecode_block (bc_ctx_p); + current_p = re_realloc_regexp_bytecode_block 
(bc_ctx_p); } memcpy (current_p, bytecode_p, length); bc_ctx_p->current_p += length; -} /* bytecode_list_append */ +} /* re_bytecode_list_append */ /** * Insert a new bytecode to the bytecode container */ static void -bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - size_t offset, /**< distance from the start of the container */ - re_bytecode_t *bytecode_p, /**< input bytecode */ - size_t length) /**< length of input */ +re_bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + size_t offset, /**< distance from the start of the container */ + re_bytecode_t *bytecode_p, /**< input bytecode */ + size_t length) /**< length of input */ { JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE); re_bytecode_t *current_p = bc_ctx_p->current_p; if (current_p + length > bc_ctx_p->block_end_p) { - realloc_regexp_bytecode_block (bc_ctx_p); + re_realloc_regexp_bytecode_block (bc_ctx_p); } re_bytecode_t *src_p = bc_ctx_p->block_start_p + offset; - if ((BYTECODE_LEN (bc_ctx_p) - offset) > 0) + if ((re_get_bytecode_length (bc_ctx_p) - offset) > 0) { re_bytecode_t *dest_p = src_p + length; - re_bytecode_t *tmp_block_start_p = (re_bytecode_t *) mem_heap_alloc_block ((BYTECODE_LEN (bc_ctx_p) - offset), - MEM_HEAP_ALLOC_SHORT_TERM); - memcpy (tmp_block_start_p, src_p, (size_t) (BYTECODE_LEN (bc_ctx_p) - offset)); - memcpy (dest_p, tmp_block_start_p, (size_t) (BYTECODE_LEN (bc_ctx_p) - offset)); + re_bytecode_t *tmp_block_start_p; + tmp_block_start_p = (re_bytecode_t *) mem_heap_alloc_block ((re_get_bytecode_length (bc_ctx_p) - offset), + MEM_HEAP_ALLOC_SHORT_TERM); + memcpy (tmp_block_start_p, src_p, (size_t) (re_get_bytecode_length (bc_ctx_p) - offset)); + memcpy (dest_p, tmp_block_start_p, (size_t) (re_get_bytecode_length (bc_ctx_p) - offset)); mem_heap_free_block (tmp_block_start_p); } memcpy (src_p, bytecode_p, length); bc_ctx_p->current_p += length; -} /* bytecode_list_insert */ +} /* re_bytecode_list_insert */ /** * 
Append a RegExp opcode */ static void -append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - re_opcode_t opcode) /**< input opcode */ +re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + re_opcode_t opcode) /**< input opcode */ { - bytecode_list_append (bc_ctx_p, (re_bytecode_t*) &opcode, sizeof (re_bytecode_t)); -} /* append_opcode */ + re_bytecode_list_append (bc_ctx_p, (re_bytecode_t*) &opcode, sizeof (re_bytecode_t)); +} /* re_append_opcode */ /** * Append a parameter of a RegExp opcode */ static void -append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - uint32_t value) /**< input value */ +re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + uint32_t value) /**< input value */ { - bytecode_list_append (bc_ctx_p, (re_bytecode_t*) &value, sizeof (uint32_t)); -} /* append_u32 */ + re_bytecode_list_append (bc_ctx_p, (re_bytecode_t*) &value, sizeof (uint32_t)); +} /* re_append_u32 */ /** * Append a jump offset parameter of a RegExp opcode */ static void -append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - uint32_t value) /**< input value */ +re_append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + uint32_t value) /**< input value */ { value += (uint32_t) (sizeof (uint32_t)); - append_u32 (bc_ctx_p, value); -} /* append_jump_offset */ + re_append_u32 (bc_ctx_p, value); +} /* re_append_jump_offset */ /** * Insert a RegExp opcode */ static void -insert_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - uint32_t offset, /**< distance from the start of the container */ - re_opcode_t opcode) /**< input opcode */ +re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + uint32_t offset, /**< distance from the start of the container */ + re_opcode_t opcode) /**< input opcode */ { - bytecode_list_insert (bc_ctx_p, offset, (re_bytecode_t*) &opcode, sizeof (re_bytecode_t)); 
-} /* insert_opcode */ + re_bytecode_list_insert (bc_ctx_p, offset, (re_bytecode_t*) &opcode, sizeof (re_bytecode_t)); +} /* re_insert_opcode */ /** * Insert a parameter of a RegExp opcode */ static void -insert_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ - uint32_t offset, /**< distance from the start of the container */ - uint32_t value) /**< input value */ +re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */ + uint32_t offset, /**< distance from the start of the container */ + uint32_t value) /**< input value */ { - bytecode_list_insert (bc_ctx_p, offset, (re_bytecode_t*) &value, sizeof (uint32_t)); -} /* insert_u32 */ + re_bytecode_list_insert (bc_ctx_p, offset, (re_bytecode_t*) &value, sizeof (uint32_t)); +} /* re_insert_u32 */ /** * Get a RegExp opcode @@ -203,23 +207,23 @@ re_get_value (re_bytecode_t **bc_p) /**< pointer to bytecode start */ * Callback function of character class generation */ static void -append_char_class (void* re_ctx_p, /**< RegExp compiler context */ - uint32_t start, /**< character class range from */ - uint32_t end) /**< character class range to */ +re_append_char_class (void* re_ctx_p, /**< RegExp compiler context */ + uint32_t start, /**< character class range from */ + uint32_t end) /**< character class range to */ { /* FIXME: Handle ignore case flag and add unicode support. 
*/ re_compiler_ctx_t *ctx_p = (re_compiler_ctx_t*) re_ctx_p; - append_u32 (ctx_p->bytecode_ctx_p, start); - append_u32 (ctx_p->bytecode_ctx_p, end); + re_append_u32 (ctx_p->bytecode_ctx_p, start); + re_append_u32 (ctx_p->bytecode_ctx_p, end); ctx_p->parser_ctx_p->num_of_classes++; -} /* append_char_class */ +} /* re_append_char_class */ /** * Insert simple atom iterator */ static void -insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - uint32_t new_atom_start_offset) /**< atom start offset */ +re_insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + uint32_t new_atom_start_offset) /**< atom start offset */ { uint32_t atom_code_length; uint32_t offset; @@ -231,30 +235,30 @@ insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler contex /* FIXME: optimize bytecode length. Store 0 rather than INF */ - append_opcode (re_ctx_p->bytecode_ctx_p, RE_OP_MATCH); /* complete 'sub atom' */ - uint32_t bytecode_length = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p); + re_append_opcode (re_ctx_p->bytecode_ctx_p, RE_OP_MATCH); /* complete 'sub atom' */ + uint32_t bytecode_length = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); atom_code_length = (uint32_t) (bytecode_length - new_atom_start_offset); offset = new_atom_start_offset; - insert_u32 (re_ctx_p->bytecode_ctx_p, offset, atom_code_length); - insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmax); - insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmin); + re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, atom_code_length); + re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmax); + re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmin); if (re_ctx_p->current_token.greedy) { - insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_GREEDY_ITERATOR); + re_insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_GREEDY_ITERATOR); } else { - insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_NON_GREEDY_ITERATOR); + re_insert_opcode 
(re_ctx_p->bytecode_ctx_p, offset, RE_OP_NON_GREEDY_ITERATOR); } -} /* insert_simple_iterator */ +} /* re_insert_simple_iterator */ /** * Get the type of a group start */ static re_opcode_t -get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - bool is_capturable) /**< is capturabel group */ +re_get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + bool is_capturable) /**< is capturable group */ { if (is_capturable) { @@ -282,17 +286,14 @@ get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context } return RE_OP_NON_CAPTURE_GROUP_START; - - JERRY_UNREACHABLE (); - return 0; -} /* get_start_opcode_type */ +} /* re_get_start_opcode_type */ /** * Get the type of a group end */ static re_opcode_t -get_end_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - bool is_capturable) /**< is capturabel group */ +re_get_end_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + bool is_capturable) /**< is capturable group */ { if (is_capturable) { @@ -310,64 +311,61 @@ get_end_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context * } return RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END; - - JERRY_UNREACHABLE (); - return 0; -} /* get_end_opcode_type */ +} /* re_get_end_opcode_type */ /** * Enclose the given bytecode to a group */ static void -insert_into_group (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - uint32_t group_start_offset, /**< offset of group start */ - uint32_t idx, /**< index of group */ - bool is_capturable) /**< is capturabel group */ +re_insert_into_group (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + uint32_t group_start_offset, /**< offset of group start */ + uint32_t idx, /**< index of group */ + bool is_capturable) /**< is capturable group */ { uint32_t qmin, qmax; - re_opcode_t start_opcode = get_start_opcode_type (re_ctx_p, is_capturable); - re_opcode_t end_opcode = 
get_end_opcode_type (re_ctx_p, is_capturable); + re_opcode_t start_opcode = re_get_start_opcode_type (re_ctx_p, is_capturable); + re_opcode_t end_opcode = re_get_end_opcode_type (re_ctx_p, is_capturable); uint32_t start_head_offset_len; qmin = re_ctx_p->current_token.qmin; qmax = re_ctx_p->current_token.qmax; JERRY_ASSERT (qmin <= qmax); - start_head_offset_len = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p); - insert_u32 (re_ctx_p->bytecode_ctx_p, group_start_offset, idx); - insert_opcode (re_ctx_p->bytecode_ctx_p, group_start_offset, start_opcode); - start_head_offset_len = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - start_head_offset_len; - append_opcode (re_ctx_p->bytecode_ctx_p, end_opcode); - append_u32 (re_ctx_p->bytecode_ctx_p, idx); - append_u32 (re_ctx_p->bytecode_ctx_p, qmin); - append_u32 (re_ctx_p->bytecode_ctx_p, qmax); + start_head_offset_len = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); + re_insert_u32 (re_ctx_p->bytecode_ctx_p, group_start_offset, idx); + re_insert_opcode (re_ctx_p->bytecode_ctx_p, group_start_offset, start_opcode); + start_head_offset_len = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - start_head_offset_len; + re_append_opcode (re_ctx_p->bytecode_ctx_p, end_opcode); + re_append_u32 (re_ctx_p->bytecode_ctx_p, idx); + re_append_u32 (re_ctx_p->bytecode_ctx_p, qmin); + re_append_u32 (re_ctx_p->bytecode_ctx_p, qmax); group_start_offset += start_head_offset_len; - append_jump_offset (re_ctx_p->bytecode_ctx_p, - BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - group_start_offset); + re_append_jump_offset (re_ctx_p->bytecode_ctx_p, + re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset); if (start_opcode != RE_OP_CAPTURE_GROUP_START && start_opcode != RE_OP_NON_CAPTURE_GROUP_START) { - insert_u32 (re_ctx_p->bytecode_ctx_p, - group_start_offset, - BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - group_start_offset); + re_insert_u32 (re_ctx_p->bytecode_ctx_p, + group_start_offset, + re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) 
- group_start_offset); } -} /* insert_into_group */ +} /* re_insert_into_group */ /** * Enclose the given bytecode to a group and inster jump value */ static void -insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - uint32_t group_start_offset, /**< offset of group start */ - uint32_t idx, /**< index of group */ - bool is_capturable) /**< is capturabel group */ +re_insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + uint32_t group_start_offset, /**< offset of group start */ + uint32_t idx, /**< index of group */ + bool is_capturable) /**< is capturable group */ { - insert_u32 (re_ctx_p->bytecode_ctx_p, - group_start_offset, - BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - group_start_offset); - insert_into_group (re_ctx_p, group_start_offset, idx, is_capturable); -} /* insert_into_group_with_jump */ + re_insert_u32 (re_ctx_p->bytecode_ctx_p, + group_start_offset, + re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset); + re_insert_into_group (re_ctx_p, group_start_offset, idx, is_capturable); +} /* re_insert_into_group_with_jump */ /** * Parse alternatives @@ -376,14 +374,14 @@ insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler c * Returned value must be freed with ecma_free_completion_value */ static ecma_completion_value_t -parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ - bool expect_eof) /**< expect end of file */ +re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ + bool expect_eof) /**< expect end of file */ { uint32_t idx; re_bytecode_ctx_t *bc_ctx_p = re_ctx_p->bytecode_ctx_p; ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); - uint32_t alterantive_offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p); + uint32_t alterantive_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); if (re_ctx_p->recursion_depth >= RE_COMPILE_RECURSION_LIMIT) { @@ -399,11 
+397,12 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ &(re_ctx_p->current_token)), ret_value); ECMA_FINALIZE (empty); + if (!ecma_is_completion_value_empty (ret_value)) { return ret_value; /* error */ } - uint32_t new_atom_start_offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p); + uint32_t new_atom_start_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); switch (re_ctx_p->current_token.type) { @@ -412,10 +411,11 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ idx = re_ctx_p->num_of_captures++; JERRY_DDLOG ("Compile a capture group start (idx: %d)\n", idx); - ret_value = parse_alternative (re_ctx_p, false); + ret_value = re_parse_alternative (re_ctx_p, false); + if (ecma_is_completion_value_empty (ret_value)) { - insert_into_group (re_ctx_p, new_atom_start_offset, idx, true); + re_insert_into_group (re_ctx_p, new_atom_start_offset, idx, true); } else { @@ -428,10 +428,11 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ idx = re_ctx_p->num_of_non_captures++; JERRY_DDLOG ("Compile a non-capture group start (idx: %d)\n", idx); - ret_value = parse_alternative (re_ctx_p, false); + ret_value = re_parse_alternative (re_ctx_p, false); + if (ecma_is_completion_value_empty (ret_value)) { - insert_into_group (re_ctx_p, new_atom_start_offset, idx, false); + re_insert_into_group (re_ctx_p, new_atom_start_offset, idx, false); } else { @@ -444,70 +445,71 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ JERRY_DDLOG ("Compile character token: %c, qmin: %d, qmax: %d\n", re_ctx_p->current_token.value, re_ctx_p->current_token.qmin, re_ctx_p->current_token.qmax); - append_opcode (bc_ctx_p, RE_OP_CHAR); - append_u32 (bc_ctx_p, re_ctx_p->current_token.value); + re_append_opcode (bc_ctx_p, RE_OP_CHAR); + re_append_u32 (bc_ctx_p, re_ctx_p->current_token.value); if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1)) { - 
insert_simple_iterator (re_ctx_p, new_atom_start_offset); + re_insert_simple_iterator (re_ctx_p, new_atom_start_offset); } break; } case RE_TOK_PERIOD: { JERRY_DDLOG ("Compile a period\n"); - append_opcode (bc_ctx_p, RE_OP_PERIOD); + re_append_opcode (bc_ctx_p, RE_OP_PERIOD); if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1)) { - insert_simple_iterator (re_ctx_p, new_atom_start_offset); + re_insert_simple_iterator (re_ctx_p, new_atom_start_offset); } break; } case RE_TOK_ALTERNATIVE: { JERRY_DDLOG ("Compile an alternative\n"); - insert_u32 (bc_ctx_p, alterantive_offset, BYTECODE_LEN (bc_ctx_p) - alterantive_offset); - append_opcode (bc_ctx_p, RE_OP_ALTERNATIVE); - alterantive_offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p); + re_insert_u32 (bc_ctx_p, alterantive_offset, re_get_bytecode_length (bc_ctx_p) - alterantive_offset); + re_append_opcode (bc_ctx_p, RE_OP_ALTERNATIVE); + alterantive_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); break; } case RE_TOK_ASSERT_START: { JERRY_DDLOG ("Compile a start assertion\n"); - append_opcode (bc_ctx_p, RE_OP_ASSERT_START); + re_append_opcode (bc_ctx_p, RE_OP_ASSERT_START); break; } case RE_TOK_ASSERT_END: { JERRY_DDLOG ("Compile an end assertion\n"); - append_opcode (bc_ctx_p, RE_OP_ASSERT_END); + re_append_opcode (bc_ctx_p, RE_OP_ASSERT_END); break; } case RE_TOK_ASSERT_WORD_BOUNDARY: { JERRY_DDLOG ("Compile a word boundary assertion\n"); - append_opcode (bc_ctx_p, RE_OP_ASSERT_WORD_BOUNDARY); + re_append_opcode (bc_ctx_p, RE_OP_ASSERT_WORD_BOUNDARY); break; } case RE_TOK_ASSERT_NOT_WORD_BOUNDARY: { JERRY_DDLOG ("Compile a not word boundary assertion\n"); - append_opcode (bc_ctx_p, RE_OP_ASSERT_NOT_WORD_BOUNDARY); + re_append_opcode (bc_ctx_p, RE_OP_ASSERT_NOT_WORD_BOUNDARY); break; } case RE_TOK_ASSERT_START_POS_LOOKAHEAD: { JERRY_DDLOG ("Compile a positive lookahead assertion\n"); idx = re_ctx_p->num_of_non_captures++; - append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_POS); + 
re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_POS); + + ret_value = re_parse_alternative (re_ctx_p, false); - ret_value = parse_alternative (re_ctx_p, false); if (ecma_is_completion_value_empty (ret_value)) { - append_opcode (bc_ctx_p, RE_OP_MATCH); + re_append_opcode (bc_ctx_p, RE_OP_MATCH); - insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); + re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); } else { @@ -519,14 +521,15 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ { JERRY_DDLOG ("Compile a negative lookahead assertion\n"); idx = re_ctx_p->num_of_non_captures++; - append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_NEG); + re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_NEG); + + ret_value = re_parse_alternative (re_ctx_p, false); - ret_value = parse_alternative (re_ctx_p, false); if (ecma_is_completion_value_empty (ret_value)) { - append_opcode (bc_ctx_p, RE_OP_MATCH); + re_append_opcode (bc_ctx_p, RE_OP_MATCH); - insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); + re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); } else { @@ -538,39 +541,42 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ { uint32_t backref = (uint32_t) re_ctx_p->current_token.value; idx = re_ctx_p->num_of_non_captures++; + if (backref > re_ctx_p->highest_backref) { re_ctx_p->highest_backref = backref; } - JERRY_DDLOG ("Compile a backreference: %d\n", backref); - append_opcode (bc_ctx_p, RE_OP_BACKREFERENCE); - append_u32 (bc_ctx_p, backref); - insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); + JERRY_DDLOG ("Compile a backreference: %d\n", backref); + re_append_opcode (bc_ctx_p, RE_OP_BACKREFERENCE); + re_append_u32 (bc_ctx_p, backref); + + re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false); break; } case RE_TOK_START_CHAR_CLASS: case RE_TOK_START_INV_CHAR_CLASS: { JERRY_DDLOG ("Compile 
a character class\n"); - append_opcode (bc_ctx_p, - re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS - ? RE_OP_CHAR_CLASS - : RE_OP_INV_CHAR_CLASS); - uint32_t offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p); + re_append_opcode (bc_ctx_p, + re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS + ? RE_OP_CHAR_CLASS + : RE_OP_INV_CHAR_CLASS); + uint32_t offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p); ECMA_TRY_CATCH (empty, re_parse_char_class (re_ctx_p->parser_ctx_p, - append_char_class, + re_append_char_class, re_ctx_p, &(re_ctx_p->current_token)), ret_value); - insert_u32 (bc_ctx_p, offset, re_ctx_p->parser_ctx_p->num_of_classes); + re_insert_u32 (bc_ctx_p, offset, re_ctx_p->parser_ctx_p->num_of_classes); if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1)) { - insert_simple_iterator (re_ctx_p, new_atom_start_offset); + re_insert_simple_iterator (re_ctx_p, new_atom_start_offset); } + ECMA_FINALIZE (empty); break; } @@ -584,7 +590,7 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ } else { - insert_u32 (bc_ctx_p, alterantive_offset, BYTECODE_LEN (bc_ctx_p) - alterantive_offset); + re_insert_u32 (bc_ctx_p, alterantive_offset, re_get_bytecode_length (bc_ctx_p) - alterantive_offset); re_ctx_p->recursion_depth--; } @@ -598,7 +604,7 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ } else { - insert_u32 (bc_ctx_p, alterantive_offset, BYTECODE_LEN (bc_ctx_p) - alterantive_offset); + re_insert_u32 (bc_ctx_p, alterantive_offset, re_get_bytecode_length (bc_ctx_p) - alterantive_offset); re_ctx_p->recursion_depth--; } @@ -614,7 +620,7 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */ JERRY_UNREACHABLE (); return ret_value; -} /* parse_alternative */ +} /* re_parse_alternative */ /** * Compilation of RegExp bytecode @@ -656,9 +662,9 @@ re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */ /* 1. 
Parse RegExp pattern */ re_ctx.num_of_captures = 1; - append_opcode (&bc_ctx, RE_OP_SAVE_AT_START); + re_append_opcode (&bc_ctx, RE_OP_SAVE_AT_START); - ECMA_TRY_CATCH (empty, parse_alternative (&re_ctx, true), ret_value); + ECMA_TRY_CATCH (empty, re_parse_alternative (&re_ctx, true), ret_value); /* 2. Check for invalid backreference */ if (re_ctx.highest_backref >= re_ctx.num_of_captures) @@ -667,13 +673,13 @@ re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */ } else { - append_opcode (&bc_ctx, RE_OP_SAVE_AND_MATCH); - append_opcode (&bc_ctx, RE_OP_EOF); + re_append_opcode (&bc_ctx, RE_OP_SAVE_AND_MATCH); + re_append_opcode (&bc_ctx, RE_OP_EOF); /* 3. Insert extra informations for bytecode header */ - insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.num_of_non_captures); - insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.num_of_captures * 2); - insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.flags); + re_insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.num_of_non_captures); + re_insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.num_of_captures * 2); + re_insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.flags); } ECMA_FINALIZE (empty); @@ -684,7 +690,7 @@ re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */ MEM_FINALIZE_LOCAL_ARRAY (pattern_start_p); #ifdef JERRY_ENABLE_LOG - regexp_dump_bytecode (&bc_ctx); + re_dump_bytecode (&bc_ctx); #endif return ret_value; @@ -695,7 +701,7 @@ re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */ * RegExp bytecode dumper */ void -regexp_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p) +re_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p) { re_bytecode_t *bytecode_p = bc_ctx_p->block_start_p; JERRY_DLOG ("%d ", re_get_value (&bytecode_p)); @@ -889,7 +895,7 @@ regexp_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p) } } JERRY_DLOG ("EOF\n"); -} /* regexp_dump_bytecode */ +} /* re_dump_bytecode */ #endif /* JERRY_ENABLE_LOG */ #endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */ diff --git a/jerry-core/parser/regexp/re-compiler.h 
b/jerry-core/parser/regexp/re-compiler.h index 73e4eedab..addd91329 100644 --- a/jerry-core/parser/regexp/re-compiler.h +++ b/jerry-core/parser/regexp/re-compiler.h @@ -22,77 +22,83 @@ #include "ecma-globals.h" #include "re-parser.h" -/* RegExp opcodes - * Group opcode order is important, because RE_IS_CAPTURE_GROUP is based on it. - * Change it carfully. Capture opcodes should be at first. +/** + * RegExp opcodes */ -#define RE_OP_EOF 0 +typedef enum +{ + RE_OP_EOF, + /* Group opcode order is important, because RE_IS_CAPTURE_GROUP is based on it. + * Change it carefully. Capture opcodes should be at first. + */ + RE_OP_CAPTURE_GROUP_START, + RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START, + RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START, + RE_OP_CAPTURE_GREEDY_GROUP_END, + RE_OP_CAPTURE_NON_GREEDY_GROUP_END, + RE_OP_NON_CAPTURE_GROUP_START, + RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START, + RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START, + RE_OP_NON_CAPTURE_GREEDY_GROUP_END, + RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END, -#define RE_OP_CAPTURE_GROUP_START 1 -#define RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START 2 -#define RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START 3 -#define RE_OP_CAPTURE_GREEDY_GROUP_END 4 -#define RE_OP_CAPTURE_NON_GREEDY_GROUP_END 5 -#define RE_OP_NON_CAPTURE_GROUP_START 6 -#define RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START 7 -#define RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START 8 -#define RE_OP_NON_CAPTURE_GREEDY_GROUP_END 9 -#define RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END 10 - -#define RE_OP_MATCH 11 -#define RE_OP_CHAR 12 -#define RE_OP_SAVE_AT_START 13 -#define RE_OP_SAVE_AND_MATCH 14 -#define RE_OP_PERIOD 15 -#define RE_OP_ALTERNATIVE 16 -#define RE_OP_GREEDY_ITERATOR 17 -#define RE_OP_NON_GREEDY_ITERATOR 18 -#define RE_OP_ASSERT_START 19 -#define RE_OP_ASSERT_END 20 -#define RE_OP_ASSERT_WORD_BOUNDARY 21 -#define RE_OP_ASSERT_NOT_WORD_BOUNDARY 22 -#define RE_OP_LOOKAHEAD_POS 23 -#define RE_OP_LOOKAHEAD_NEG 24 -#define RE_OP_BACKREFERENCE 25 -#define RE_OP_CHAR_CLASS 26 
-#define RE_OP_INV_CHAR_CLASS 27 + RE_OP_MATCH, + RE_OP_CHAR, + RE_OP_SAVE_AT_START, + RE_OP_SAVE_AND_MATCH, + RE_OP_PERIOD, + RE_OP_ALTERNATIVE, + RE_OP_GREEDY_ITERATOR, + RE_OP_NON_GREEDY_ITERATOR, + RE_OP_ASSERT_START, + RE_OP_ASSERT_END, + RE_OP_ASSERT_WORD_BOUNDARY, + RE_OP_ASSERT_NOT_WORD_BOUNDARY, + RE_OP_LOOKAHEAD_POS, + RE_OP_LOOKAHEAD_NEG, + RE_OP_BACKREFERENCE, + RE_OP_CHAR_CLASS, + RE_OP_INV_CHAR_CLASS +} re_opcode_t; +/** + * Recursion limit of RegExp compiler + */ #define RE_COMPILE_RECURSION_LIMIT 100 +/** + * Check if a RegExp opcode is a capture group or not + */ #define RE_IS_CAPTURE_GROUP(x) (((x) < RE_OP_NON_CAPTURE_GROUP_START) ? 1 : 0) -typedef uint8_t re_opcode_t; /* type of RegExp opcodes */ -typedef uint8_t re_bytecode_t; /* type of standard bytecode elements (ex.: opcode parameters) */ +/** + * Type of bytecode elements + */ +typedef uint8_t re_bytecode_t; /** * Context of RegExp bytecode container - * - * FIXME: - * Add comments with description of the structure members */ typedef struct { - re_bytecode_t *block_start_p; - re_bytecode_t *block_end_p; - re_bytecode_t *current_p; + re_bytecode_t *block_start_p; /**< start of bytecode block */ + re_bytecode_t *block_end_p; /**< end of bytecode block */ + re_bytecode_t *current_p; /**< current position in bytecode */ } re_bytecode_ctx_t; /** * Context of RegExp compiler - * - * FIXME: - * Add comments with description of the structure members */ typedef struct { - uint8_t flags; - uint32_t recursion_depth; - uint32_t num_of_captures; - uint32_t num_of_non_captures; - uint32_t highest_backref; - re_bytecode_ctx_t *bytecode_ctx_p; - re_token_t current_token; - re_parser_ctx_t *parser_ctx_p; + uint8_t flags; /**< RegExp flags */ + uint32_t recursion_depth; /**< recursion depth */ + uint32_t num_of_captures; /**< number of capture groups */ + uint32_t num_of_non_captures; /**< number of non-capture groups */ + uint32_t highest_backref; /**< highest backreference */ + re_bytecode_ctx_t 
*bytecode_ctx_p; /**< pointer of RegExp bytecode context */ + re_token_t current_token; /**< current token */ + re_parser_ctx_t *parser_ctx_p; /**< pointer of RegExp parser context */ } re_compiler_ctx_t; ecma_completion_value_t diff --git a/jerry-core/parser/regexp/re-parser.cpp b/jerry-core/parser/regexp/re-parser.cpp index 77a7820d7..a275b969f 100644 --- a/jerry-core/parser/regexp/re-parser.cpp +++ b/jerry-core/parser/regexp/re-parser.cpp @@ -32,14 +32,19 @@ /* FIXME: change it, when unicode support would be implemented */ #define RE_ADVANCE(str_p, advance) do { str_p += advance; } while (0) +/** + * Get next input character + * + * @return ecma_char_t + */ static ecma_char_t -get_ecma_char (lit_utf8_byte_t **char_p) +re_get_ecma_char (lit_utf8_byte_t **char_p) /**< pointer of input string */ { /* FIXME: change to string iterator with unicode support, when it would be implemented */ ecma_char_t ch = **char_p; RE_ADVANCE (*char_p, 1); return ch; -} /* get_ecma_char */ +} /* re_get_ecma_char */ /** * Parse RegExp iterators @@ -48,7 +53,7 @@ get_ecma_char (lit_utf8_byte_t **char_p) * Returned value must be freed with ecma_free_completion_value */ static ecma_completion_value_t -parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ +re_parse_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ re_token_t *re_token_p, /**< output token */ uint32_t lookup, /**< size of lookup */ uint32_t *advance_p) /**< output length of current advance */ @@ -64,6 +69,7 @@ parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ { re_token_p->qmin = 0; re_token_p->qmax = 1; + if (ch1 == '?') { *advance_p = 2; @@ -80,6 +86,7 @@ parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ { re_token_p->qmin = 0; re_token_p->qmax = RE_ITERATOR_INFINITE; + if (ch1 == '?') { *advance_p = 2; @@ -96,6 +103,7 @@ parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ { re_token_p->qmin = 1; re_token_p->qmax = RE_ITERATOR_INFINITE; + 
if (ch1 == '?') { *advance_p = 2; @@ -113,6 +121,7 @@ parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ uint32_t qmin = 0; uint32_t qmax = RE_ITERATOR_INFINITE; uint32_t digits = 0; + while (true) { (*advance_p)++; @@ -212,7 +221,7 @@ parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */ } return ret_value; -} /* parse_re_iterator */ +} /* re_parse_iterator */ /** * Count the number of groups in pattern @@ -224,17 +233,17 @@ re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser contex ecma_char_t ch1; int char_class_in = 0; parser_ctx_p->num_of_groups = 0; + ch1 = re_get_ecma_char (&pattern_p); - ch1 = get_ecma_char (&pattern_p); while (ch1 != LIT_CHAR_NULL) { ecma_char_t ch0 = ch1; - ch1 = get_ecma_char (&pattern_p); + ch1 = re_get_ecma_char (&pattern_p); switch (ch0) { case '\\': { - ch1 = get_ecma_char (&pattern_p); + ch1 = re_get_ecma_char (&pattern_p); break; } case '[': @@ -286,7 +295,8 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */ do { - ecma_char_t ch = get_ecma_char (pattern_p); + ecma_char_t ch = re_get_ecma_char (pattern_p); + if (ch == ']') { if (start != RE_CHAR_UNDEF) @@ -305,7 +315,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */ } else if (ch == '\\') { - ch = get_ecma_char (pattern_p); + ch = re_get_ecma_char (pattern_p); if (ch == 'b') { @@ -333,7 +343,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */ } else if (ch == 'c') { - ch = get_ecma_char (pattern_p); + ch = re_get_ecma_char (pattern_p); if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) { ch = (ch % 32); @@ -479,7 +489,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */ uint32_t advance = 0; ECMA_TRY_CATCH (empty, - parse_re_iterator (parser_ctx_p->current_char_p, + re_parse_iterator (parser_ctx_p->current_char_p, out_token_p, 0, &advance), @@ -502,8 +512,8 @@ re_parse_next_token (re_parser_ctx_t 
*parser_ctx_p, /**< RegExp parser context * { ecma_completion_value_t ret_value = ecma_make_empty_completion_value (); uint32_t advance = 0; - ecma_char_t ch0 = *(parser_ctx_p->current_char_p); + switch (ch0) { case '|': @@ -527,7 +537,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * case '.': { ECMA_TRY_CATCH (empty, - parse_re_iterator (parser_ctx_p->current_char_p, + re_parse_iterator (parser_ctx_p->current_char_p, out_token_p, 1, &advance), @@ -574,6 +584,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * else if (ch1 == 'c') { ecma_char_t ch2 = RE_LOOKUP (parser_ctx_p->current_char_p, 2); + if ((ch2 >= 'A' && ch2 <= 'Z') || (ch2 >= 'a' && ch2 <= 'z')) { advance = 3; @@ -702,7 +713,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * uint32_t iter_adv = 0; ECMA_TRY_CATCH (empty, - parse_re_iterator (parser_ctx_p->current_char_p, + re_parse_iterator (parser_ctx_p->current_char_p, out_token_p, advance, &iter_adv), @@ -716,6 +727,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * if (RE_LOOKUP (parser_ctx_p->current_char_p, 1) == '?') { ecma_char_t ch2 = RE_LOOKUP (parser_ctx_p->current_char_p, 2); + if (ch2 == '=') { /* (?= */ @@ -746,7 +758,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * case ')': { ECMA_TRY_CATCH (empty, - parse_re_iterator (parser_ctx_p->current_char_p, + re_parse_iterator (parser_ctx_p->current_char_p, out_token_p, 1, &advance), @@ -786,7 +798,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * default: { ECMA_TRY_CATCH (empty, - parse_re_iterator (parser_ctx_p->current_char_p, + re_parse_iterator (parser_ctx_p->current_char_p, out_token_p, 1, &advance), diff --git a/jerry-core/parser/regexp/re-parser.h b/jerry-core/parser/regexp/re-parser.h index 13016a81b..2f1f739f6 100644 --- a/jerry-core/parser/regexp/re-parser.h +++ 
b/jerry-core/parser/regexp/re-parser.h @@ -21,32 +21,42 @@ #include "opcodes-dumper.h" -typedef uint8_t token_type_t; - -#define RE_TOK_EOF 0 /* EOF */ -#define RE_TOK_BACKREFERENCE 1 /* \[0..9] */ -#define RE_TOK_CHAR 2 /* any character */ -#define RE_TOK_ALTERNATIVE 3 /* | */ -#define RE_TOK_ASSERT_START 4 /* ^ */ -#define RE_TOK_ASSERT_END 5 /* $ */ -#define RE_TOK_PERIOD 6 /* . */ -#define RE_TOK_START_CAPTURE_GROUP 7 /* ( */ -#define RE_TOK_START_NON_CAPTURE_GROUP 8 /* (?: */ -#define RE_TOK_END_GROUP 9 /* ')' */ -#define RE_TOK_ASSERT_START_POS_LOOKAHEAD 10 /* (?= */ -#define RE_TOK_ASSERT_START_NEG_LOOKAHEAD 11 /* (?! */ -#define RE_TOK_ASSERT_WORD_BOUNDARY 12 /* \b */ -#define RE_TOK_ASSERT_NOT_WORD_BOUNDARY 13 /* \B */ -#define RE_TOK_DIGIT 14 /* \d */ -#define RE_TOK_NOT_DIGIT 15 /* \D */ -#define RE_TOK_WHITE 16 /* \s */ -#define RE_TOK_NOT_WHITE 17 /* \S */ -#define RE_TOK_WORD_CHAR 18 /* \w */ -#define RE_TOK_NOT_WORD_CHAR 19 /* \W */ -#define RE_TOK_START_CHAR_CLASS 20 /* [ ] */ -#define RE_TOK_START_INV_CHAR_CLASS 21 /* [^ ] */ - +/** + * RegExp token type definitions + */ +typedef enum +{ + RE_TOK_EOF, /* EOF */ + RE_TOK_BACKREFERENCE, /* \[0..9] */ + RE_TOK_CHAR, /* any character */ + RE_TOK_ALTERNATIVE, /* | */ + RE_TOK_ASSERT_START, /* ^ */ + RE_TOK_ASSERT_END, /* $ */ + RE_TOK_PERIOD, /* . */ + RE_TOK_START_CAPTURE_GROUP, /* ( */ + RE_TOK_START_NON_CAPTURE_GROUP, /* (?: */ + RE_TOK_END_GROUP, /* ')' */ + RE_TOK_ASSERT_START_POS_LOOKAHEAD, /* (?= */ + RE_TOK_ASSERT_START_NEG_LOOKAHEAD, /* (?! 
*/ + RE_TOK_ASSERT_WORD_BOUNDARY, /* \b */ + RE_TOK_ASSERT_NOT_WORD_BOUNDARY, /* \B */ + RE_TOK_DIGIT, /* \d */ + RE_TOK_NOT_DIGIT, /* \D */ + RE_TOK_WHITE, /* \s */ + RE_TOK_NOT_WHITE, /* \S */ + RE_TOK_WORD_CHAR, /* \w */ + RE_TOK_NOT_WORD_CHAR, /* \W */ + RE_TOK_START_CHAR_CLASS, /* [ ] */ + RE_TOK_START_INV_CHAR_CLASS, /* [^ ] */ +} re_token_type_t; +/** + * RegExp constant of infinite + */ #define RE_ITERATOR_INFINITE ((uint32_t)-1) + +/** + * Maximum number of decimal escape digits + */ #define RE_MAX_RE_DECESC_DIGITS 9 /* FIXME: depends on unicode support */ @@ -60,21 +70,27 @@ typedef uint8_t token_type_t; #define RE_CONTROL_CHAR_FF 0x000c /* \f */ #define RE_CONTROL_CHAR_CR 0x000d /* \r */ +/** + * RegExp token type + */ typedef struct { - token_type_t type; - uint32_t value; - uint32_t qmin; - uint32_t qmax; - bool greedy; + re_token_type_t type; /**< type of the token */ + uint32_t value; /**< value of the token */ + uint32_t qmin; /**< minimum number of token iterations */ + uint32_t qmax; /**< maximum number of token iterations */ + bool greedy; /**< type of iteration */ } re_token_t; +/** + * RegExp parser context + */ typedef struct { - lit_utf8_byte_t *pattern_start_p; - lit_utf8_byte_t *current_char_p; - int num_of_groups; - uint32_t num_of_classes; + lit_utf8_byte_t *pattern_start_p; /**< start of input pattern string */ + lit_utf8_byte_t *current_char_p; /**< current character in input pattern */ + int num_of_groups; /**< number of groups */ + uint32_t num_of_classes; /**< number of character classes */ } re_parser_ctx_t; typedef void (*re_char_class_callback) (void *re_ctx_p, uint32_t start, uint32_t end);