Style fixes for RegExp engine.

JerryScript-DCO-1.0-Signed-off-by: László Langó llango.u-szeged@partner.samsung.com
This commit is contained in:
László Langó 2015-07-10 15:57:06 +02:00
parent 1e90f833f7
commit cf9d54545f
8 changed files with 385 additions and 312 deletions

View File

@ -33,6 +33,7 @@
/* Object identifier */
OBJECT_ID (ECMA_BUILTIN_ID_REGEXP_PROTOTYPE)
// ECMA-262 v5, 15.10.6.1
OBJECT_VALUE (LIT_MAGIC_STRING_CONSTRUCTOR,
ecma_builtin_get (ECMA_BUILTIN_ID_REGEXP),
ECMA_PROPERTY_WRITABLE,

View File

@ -82,6 +82,7 @@ NUMBER_VALUE (LIT_MAGIC_STRING_LASTINDEX_UL,
ECMA_PROPERTY_NOT_ENUMERABLE,
ECMA_PROPERTY_NOT_CONFIGURABLE)
// ECMA-262 v5, 15.10.5
NUMBER_VALUE (LIT_MAGIC_STRING_LENGTH,
2,
ECMA_PROPERTY_NOT_WRITABLE,

View File

@ -38,7 +38,7 @@
* @{
*/
/*
/**
* RegExp results are stored in an array of string pointers. If N is the number
* of groups then the length of the array is 2*N, because every group has a start
* and end. We have to handle those pointers.
@ -54,7 +54,9 @@
#define RE_GLOBAL_START_IDX 0
#define RE_GLOBAL_END_IDX 1
/* RegExp flags */
/**
* RegExp flags
*/
#define RE_FLAG_GLOBAL (1 << 0) /* ECMA-262 v5, 15.10.7.2 */
#define RE_FLAG_IGNORE_CASE (1 << 1) /* ECMA-262 v5, 15.10.7.3 */
#define RE_FLAG_MULTILINE (1 << 2) /* ECMA-262 v5, 15.10.7.4 */
@ -78,8 +80,8 @@ re_parse_regexp_flags (ecma_string_t *flags_str_p, /**< Input string with flags
MEM_DEFINE_LOCAL_ARRAY (flags_start_p, flags_str_size, lit_utf8_byte_t);
ecma_string_to_utf8_string (flags_str_p, flags_start_p, (ssize_t) flags_str_size);
lit_utf8_byte_t *flags_char_p = flags_start_p;
while (flags_char_p < flags_start_p + flags_str_size
&& ecma_is_completion_value_empty (ret_value))
{
@ -140,8 +142,8 @@ ecma_op_create_regexp_object (ecma_string_t *pattern_p, /**< input pattern */
{
JERRY_ASSERT (pattern_p != NULL);
ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();
uint8_t flags = 0;
if (flags_str_p != NULL)
{
ECMA_TRY_CATCH (empty, re_parse_regexp_flags (flags_str_p, &flags), ret_value);
@ -233,43 +235,43 @@ ecma_op_create_regexp_object (ecma_string_t *pattern_p, /**< input pattern */
* Backtrack a unicode character
*/
static const lit_utf8_byte_t *
utf8_backtrack (const lit_utf8_byte_t *str_p)
re_utf8_backtrack (const lit_utf8_byte_t *str_p) /**< position to step back from */
{
/* FIXME: change to string iterator with unicode support, when it would be implemented */
/* Until then this steps back by exactly one lit_utf8_byte_t element and returns the new position. */
return --str_p;
} /* utf8_backtrack */
} /* re_utf8_backtrack */
/**
* Helper to get an input character and increase string pointer.
*
* Reads the element that *char_p currently points at and then advances
* *char_p by one element, so the caller's pointer is modified.
*
* @return the element that was read, converted to ecma_char_t
*/
static ecma_char_t
get_input_char (const lit_utf8_byte_t **char_p)
re_get_input_char (const lit_utf8_byte_t **char_p) /**< in/out: pointer to the current position */
{
/* FIXME: change to string iterator with unicode support, when it would be implemented */
const lit_utf8_byte_t ch = **char_p;
/* Advance the caller's position past the element just read. */
(*char_p)++;
return ch;
} /* get_input_char */
} /* re_get_input_char */
/**
* Helper to get current input character, won't increase string pointer.
*
* @return the element str_p points at, converted to ecma_char_t
*/
static ecma_char_t
lookup_input_char (const lit_utf8_byte_t *str_p)
re_lookup_input_char (const lit_utf8_byte_t *str_p) /**< current position */
{
/* FIXME: change to string iterator with unicode support, when it would be implemented */
/* Plain read of a single element; the pointer is passed by value and is not advanced. */
return *str_p;
} /* lookup_input_char */
} /* re_lookup_input_char */
/**
* Helper to get previous input character, won't decrease string pointer.
*
* @return the element located directly before str_p, converted to ecma_char_t
*/
static ecma_char_t
lookup_prev_char (const lit_utf8_byte_t *str_p)
re_lookup_prev_char (const lit_utf8_byte_t *str_p) /**< current position */
{
/* FIXME: change to string iterator with unicode support, when it would be implemented */
/*
* The decrement only changes the by-value copy of the pointer, so the
* caller's position stays where it was.
* NOTE(review): this reads the element before str_p; presumably callers make
* sure str_p is not at the very start of the input - confirm.
*/
return *(--str_p);
} /* lookup_prev_char */
} /* re_lookup_prev_char */
/**
* Recursive function for RegExp matching. Tests for a regular expression
@ -319,7 +321,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
case RE_OP_CHAR:
{
uint32_t ch1 = re_get_value (&bc_p);
uint32_t ch2 = get_input_char (&str_p);
uint32_t ch2 = re_get_input_char (&str_p);
JERRY_DDLOG ("Character matching %d to %d: ", ch1, ch2);
if (ch2 == '\0' || ch1 != ch2)
@ -333,8 +335,9 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
}
case RE_OP_PERIOD:
{
uint32_t ch1 = get_input_char (&str_p);
uint32_t ch1 = re_get_input_char (&str_p);
JERRY_DDLOG ("Period matching '.' to %d: ", ch1);
if (ch1 == '\n' || ch1 == '\0')
{
JERRY_DDLOG ("fail\n");
@ -361,7 +364,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */
}
if (lit_char_is_line_terminator (lookup_prev_char (str_p)))
if (lit_char_is_line_terminator (re_lookup_prev_char (str_p)))
{
JERRY_DDLOG ("match\n");
break;
@ -388,7 +391,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */
}
if (lit_char_is_line_terminator (lookup_input_char (str_p)))
if (lit_char_is_line_terminator (re_lookup_input_char (str_p)))
{
JERRY_DDLOG ("match\n");
break; /* tail merge */
@ -409,7 +412,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
}
else
{
is_wordchar_left = lit_char_is_word_char (lookup_prev_char (str_p));
is_wordchar_left = lit_char_is_word_char (re_lookup_prev_char (str_p));
}
if (str_p >= re_ctx_p->input_end_p)
@ -418,7 +421,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
}
else
{
is_wordchar_right = lit_char_is_word_char (lookup_input_char (str_p));
is_wordchar_right = lit_char_is_word_char (re_lookup_input_char (str_p));
}
if (op == RE_OP_ASSERT_WORD_BOUNDARY)
@ -460,6 +463,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
do
{
uint32_t offset = re_get_value (&bc_p);
if (!sub_str_p)
{
match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p);
@ -492,6 +496,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
if (!ecma_is_completion_value_throw (match_value))
{
re_ctx_p->recursion_depth--;
if (ecma_is_value_true (match_value))
{
*res_p = sub_str_p;
@ -522,10 +527,11 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */
}
curr_ch = get_input_char (&str_p);
curr_ch = re_get_input_char (&str_p);
num_of_ranges = re_get_value (&bc_p);
is_match = false;
while (num_of_ranges)
{
uint32_t ch1, ch2;
@ -581,6 +587,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
}
sub_str_p = re_ctx_p->saved_p[backref_idx];
while (sub_str_p < re_ctx_p->saved_p[backref_idx + 1])
{
uint32_t ch1, ch2;
@ -592,8 +599,8 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */
}
ch1 = get_input_char (&sub_str_p);
ch2 = get_input_char (&str_p);
ch1 = re_get_input_char (&sub_str_p);
ch2 = re_get_input_char (&str_p);
if (ch1 != ch2)
{
@ -613,6 +620,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
JERRY_DDLOG ("Execute RE_OP_SAVE_AT_START\n");
old_start_p = re_ctx_p->saved_p[RE_GLOBAL_START_IDX];
re_ctx_p->saved_p[RE_GLOBAL_START_IDX] = str_p;
do
{
uint32_t offset = re_get_value (&bc_p);
@ -654,6 +662,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
uint32_t offset = re_get_value (&bc_p);
JERRY_DDLOG ("Execute RE_OP_ALTERNATIVE");
bc_p += offset;
while (*bc_p == RE_OP_ALTERNATIVE)
{
JERRY_DDLOG (", jump: %d");
@ -661,6 +670,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
offset = re_get_value (&bc_p);
bc_p += offset;
}
JERRY_DDLOG ("\n");
break; /* tail merge */
}
@ -695,13 +705,14 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
iter_idx = start_idx + (re_ctx_p->num_of_captures / 2) - 1;
start_idx += re_ctx_p->num_of_captures;
}
re_ctx_p->num_of_iterations[iter_idx] = 0;
re_ctx_p->num_of_iterations_p[iter_idx] = 0;
/* Jump all over to the end of the END opcode. */
bc_p += offset;
/* Try to match after the close paren if zero is allowed */
ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p);
if (ecma_is_value_true (match_value))
{
*res_p = sub_str_p;
@ -730,8 +741,8 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
const lit_utf8_byte_t *sub_str_p;
re_bytecode_t *old_bc_p;
re_bytecode_t *end_bc_p = NULL;
start_idx = re_get_value (&bc_p);
if (op != RE_OP_CAPTURE_GROUP_START
&& op != RE_OP_NON_CAPTURE_GROUP_START)
{
@ -752,9 +763,9 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
start_idx += re_ctx_p->num_of_captures;
}
old_start_p = re_ctx_p->saved_p[start_idx];
old_iteration_cnt = re_ctx_p->num_of_iterations[iter_idx];
old_iteration_cnt = re_ctx_p->num_of_iterations_p[iter_idx];
re_ctx_p->saved_p[start_idx] = str_p;
re_ctx_p->num_of_iterations[iter_idx] = 0;
re_ctx_p->num_of_iterations_p[iter_idx] = 0;
do
{
@ -775,7 +786,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
}
while (re_get_opcode (&bc_p) == RE_OP_ALTERNATIVE);
bc_p = old_bc_p;
re_ctx_p->num_of_iterations[iter_idx] = old_iteration_cnt;
re_ctx_p->num_of_iterations_p[iter_idx] = old_iteration_cnt;
/* Try to match after the close paren if zero is allowed. */
if (op == RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START
@ -783,6 +794,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
{
JERRY_ASSERT (end_bc_p);
ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, end_bc_p, str_p, &sub_str_p);
if (ecma_is_value_true (match_value))
{
*res_p = sub_str_p;
@ -829,9 +841,10 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
end_idx += re_ctx_p->num_of_captures;
}
re_ctx_p->num_of_iterations[iter_idx]++;
if (re_ctx_p->num_of_iterations[iter_idx] >= min
&& re_ctx_p->num_of_iterations[iter_idx] <= max)
re_ctx_p->num_of_iterations_p[iter_idx]++;
if (re_ctx_p->num_of_iterations_p[iter_idx] >= min
&& re_ctx_p->num_of_iterations_p[iter_idx] <= max)
{
old_end_p = re_ctx_p->saved_p[end_idx];
re_ctx_p->saved_p[end_idx] = str_p;
@ -851,7 +864,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
re_ctx_p->saved_p[end_idx] = old_end_p;
}
re_ctx_p->num_of_iterations[iter_idx]--;
re_ctx_p->num_of_iterations_p[iter_idx]--;
bc_p = old_bc_p;
/* If non-greedy fails and try to iterate... */
@ -887,19 +900,20 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
}
/* Check the empty iteration if the minimum number of iterations is reached. */
if (re_ctx_p->num_of_iterations[iter_idx] >= min
if (re_ctx_p->num_of_iterations_p[iter_idx] >= min
&& str_p == re_ctx_p->saved_p[start_idx])
{
re_ctx_p->recursion_depth--;
return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */
}
re_ctx_p->num_of_iterations[iter_idx]++;
re_ctx_p->num_of_iterations_p[iter_idx]++;
old_bc_p = bc_p; /* Save the bytecode end position of the END opcodes for matching after it. */
old_end_p = re_ctx_p->saved_p[end_idx];
re_ctx_p->saved_p[end_idx] = str_p;
if (re_ctx_p->num_of_iterations[iter_idx] < max)
if (re_ctx_p->num_of_iterations_p[iter_idx] < max)
{
bc_p -= offset;
offset = re_get_value (&bc_p);
@ -907,6 +921,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
old_start_p = re_ctx_p->saved_p[start_idx];
re_ctx_p->saved_p[start_idx] = str_p;
ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p);
if (ecma_is_value_true (match_value))
{
*res_p = sub_str_p;
@ -931,6 +946,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
re_ctx_p->saved_p[start_idx] = str_p;
ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p);
if (ecma_is_value_true (match_value))
{
*res_p = sub_str_p;
@ -947,11 +963,12 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
}
}
if (re_ctx_p->num_of_iterations[iter_idx] >= min
&& re_ctx_p->num_of_iterations[iter_idx] <= max)
if (re_ctx_p->num_of_iterations_p[iter_idx] >= min
&& re_ctx_p->num_of_iterations_p[iter_idx] <= max)
{
/* Try to match the rest of the bytecode. */
ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, old_bc_p, str_p, &sub_str_p);
if (ecma_is_value_true (match_value))
{
*res_p = sub_str_p;
@ -966,7 +983,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
/* restore if fails */
re_ctx_p->saved_p[end_idx] = old_end_p;
re_ctx_p->num_of_iterations[iter_idx]--;
re_ctx_p->num_of_iterations_p[iter_idx]--;
re_ctx_p->recursion_depth--;
return ecma_make_simple_completion_value (ECMA_SIMPLE_VALUE_FALSE); /* fail */
}
@ -988,6 +1005,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
if (num_of_iter >= min)
{
ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p + offset, str_p, &sub_str_p);
if (ecma_is_value_true (match_value))
{
*res_p = sub_str_p;
@ -1001,6 +1019,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
}
ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p);
if (!ecma_is_value_true (match_value))
{
if (ecma_is_completion_value_throw (match_value))
@ -1029,6 +1048,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
(unsigned long) min, (unsigned long) max, (long) offset);
num_of_iter = 0;
while (num_of_iter < max)
{
ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p, str_p, &sub_str_p);
@ -1048,6 +1068,7 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
while (num_of_iter >= min)
{
ecma_completion_value_t match_value = re_match_regexp (re_ctx_p, bc_p + offset, str_p, &sub_str_p);
if (ecma_is_value_true (match_value))
{
*res_p = sub_str_p;
@ -1058,12 +1079,13 @@ re_match_regexp (re_matcher_ctx_t *re_ctx_p, /**< RegExp matcher context */
{
return match_value;
}
if (num_of_iter == min)
{
break;
}
str_p = utf8_backtrack (str_p);
str_p = re_utf8_backtrack (str_p);
num_of_iter--;
}
re_ctx_p->recursion_depth--;
@ -1206,6 +1228,7 @@ ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */
re_ctx.num_of_non_captures = re_get_value (&bc_p);
MEM_DEFINE_LOCAL_ARRAY (saved_p, re_ctx.num_of_captures + re_ctx.num_of_non_captures, const lit_utf8_byte_t *);
for (uint32_t i = 0; i < re_ctx.num_of_captures + re_ctx.num_of_non_captures; i++)
{
saved_p[i] = NULL;
@ -1214,13 +1237,14 @@ ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */
uint32_t num_of_iter_length = (re_ctx.num_of_captures / 2) + (re_ctx.num_of_non_captures - 1);
MEM_DEFINE_LOCAL_ARRAY (num_of_iter_p, num_of_iter_length, uint32_t);
for (uint32_t i = 0; i < num_of_iter_length; i++)
{
num_of_iter_p[i] = 0u;
}
bool is_match = false;
re_ctx.num_of_iterations = num_of_iter_p;
re_ctx.num_of_iterations_p = num_of_iter_p;
int32_t index = 0;
if (re_ctx.flags & RE_FLAG_GLOBAL)
@ -1236,6 +1260,7 @@ ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */
/* 2. Try to match */
const lit_utf8_byte_t *sub_str_p;
while (str_p && str_p <= re_ctx.input_end_p && ecma_is_completion_value_empty (ret_value))
{
if (index < 0 || index > (int32_t) input_size)
@ -1254,11 +1279,13 @@ ecma_regexp_exec_helper (ecma_object_t *obj_p, /**< RegExp object */
{
sub_str_p = NULL;
ECMA_TRY_CATCH (match_value, re_match_regexp (&re_ctx, bc_p, str_p, &sub_str_p), ret_value);
if (ecma_is_value_true (match_value))
{
is_match = true;
break;
}
str_p++;
index++;
ECMA_FINALIZE (match_value);

View File

@ -29,26 +29,30 @@
* @{
*/
#define RE_EXECUTE_RECURSION_LIMIT 1000 /* Limit of RegExp executor recursion depth */
#define RE_EXECUTE_MATCH_LIMIT 10000 /* Limit of RegExp execetur matching steps */
/**
* Limit of RegExp executor recursion depth
*/
#define RE_EXECUTE_RECURSION_LIMIT 1000
/**
* Limit of RegExp executor matching steps
*/
#define RE_EXECUTE_MATCH_LIMIT 10000
/**
* RegExp executor context
*
* FIXME:
* Add comments with description of the structure members
*/
typedef struct
{
const lit_utf8_byte_t **saved_p;
const lit_utf8_byte_t *input_start_p;
const lit_utf8_byte_t *input_end_p;
uint32_t match_limit;
uint32_t recursion_depth;
uint32_t num_of_captures;
uint32_t num_of_non_captures;
uint32_t *num_of_iterations;
uint8_t flags;
const lit_utf8_byte_t **saved_p; /**< saved result string pointers (start/end pairs), ECMA 262 v5, 15.10.2.1, State */
const lit_utf8_byte_t *input_start_p; /**< start of input pattern string */
const lit_utf8_byte_t *input_end_p; /**< end of input pattern string */
uint32_t match_limit; /**< matching limit counter */
uint32_t recursion_depth; /**< recursion depth counter, decremented when re_match_regexp returns */
uint32_t num_of_captures; /**< number of capture groups */
uint32_t num_of_non_captures; /**< number of non-capture groups */
uint32_t *num_of_iterations_p; /**< array of iteration counters, indexed by the matcher's iter_idx */
uint8_t flags; /**< RegExp flags (RE_FLAG_* bits) */
} re_matcher_ctx_t;
extern ecma_completion_value_t

View File

@ -25,26 +25,29 @@
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN
/**
* FIXME:
* Add comments to macro definitions in the component
* Size of block of RegExp bytecode. Used for allocation
*/
#define REGEXP_BYTECODE_BLOCK_SIZE 256UL
#define BYTECODE_LEN(bc_ctx_p) ((uint32_t) (bc_ctx_p->current_p - bc_ctx_p->block_start_p))
void
regexp_dump_bytecode (re_bytecode_ctx_t *bc_ctx);
/**
* FIXME:
* Add missing 're' prefixes to the component's external and internal interfaces
* Get length of bytecode
*/
static uint32_t
re_get_bytecode_length (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
{
  /* The length is the distance from the start of the block to the current position. */
  re_bytecode_t *block_begin_p = bc_ctx_p->block_start_p;
  re_bytecode_t *write_pos_p = bc_ctx_p->current_p;

  return (uint32_t) (write_pos_p - block_begin_p);
} /* re_get_bytecode_length */
void
re_dump_bytecode (re_bytecode_ctx_t *bc_ctx);
/**
* Realloc the bytecode container
*
* @return current position in RegExp bytecode
*/
static re_bytecode_t*
realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
re_realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode context */
{
JERRY_ASSERT (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p >= 0);
size_t old_size = static_cast<size_t> (bc_ctx_p->block_end_p - bc_ctx_p->block_start_p);
@ -70,112 +73,113 @@ realloc_regexp_bytecode_block (re_bytecode_ctx_t *bc_ctx_p) /**< RegExp bytecode
bc_ctx_p->current_p = new_block_start_p + current_ptr_offset;
return bc_ctx_p->current_p;
} /* realloc_regexp_bytecode_block */
} /* re_realloc_regexp_bytecode_block */
/**
* Append a new bytecode to the end of the bytecode container
*
* Copies 'length' bytes from bytecode_p to the current position of the
* container and moves the current position forward by 'length'.
*/
static void
bytecode_list_append (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
re_bytecode_t *bytecode_p, /**< input bytecode */
size_t length) /**< length of input */
re_bytecode_list_append (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
re_bytecode_t *bytecode_p, /**< input bytecode */
size_t length) /**< length of input */
{
JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE);
re_bytecode_t *current_p = bc_ctx_p->current_p;
/* Not enough room left in the block: grow it and continue from the current position it returns. */
if (current_p + length > bc_ctx_p->block_end_p)
{
current_p = realloc_regexp_bytecode_block (bc_ctx_p);
current_p = re_realloc_regexp_bytecode_block (bc_ctx_p);
}
memcpy (current_p, bytecode_p, length);
bc_ctx_p->current_p += length;
} /* bytecode_list_append */
} /* re_bytecode_list_append */
/**
* Insert a new bytecode to the bytecode container
*
* The bytes from 'offset' up to the current end of the bytecode are moved
* 'length' bytes further, then 'length' bytes of bytecode_p are written at
* 'offset' and the current position is advanced by 'length'.
*/
static void
bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
size_t offset, /**< distance from the start of the container */
re_bytecode_t *bytecode_p, /**< input bytecode */
size_t length) /**< length of input */
re_bytecode_list_insert (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
size_t offset, /**< distance from the start of the container */
re_bytecode_t *bytecode_p, /**< input bytecode */
size_t length) /**< length of input */
{
JERRY_ASSERT (length <= REGEXP_BYTECODE_BLOCK_SIZE);
re_bytecode_t *current_p = bc_ctx_p->current_p;
/* Grow the block first if the extra 'length' bytes would not fit. */
if (current_p + length > bc_ctx_p->block_end_p)
{
realloc_regexp_bytecode_block (bc_ctx_p);
re_realloc_regexp_bytecode_block (bc_ctx_p);
}
/* Computed after the possible reallocation, so it is based on the block's current start pointer. */
re_bytecode_t *src_p = bc_ctx_p->block_start_p + offset;
/*
* NOTE(review): the subtraction is unsigned, so this test is only false when
* offset equals the bytecode length; assumes offset never exceeds it - confirm.
*/
if ((BYTECODE_LEN (bc_ctx_p) - offset) > 0)
if ((re_get_bytecode_length (bc_ctx_p) - offset) > 0)
{
re_bytecode_t *dest_p = src_p + length;
/*
* Source and destination ranges overlap, so the tail is staged in a
* temporary heap block before being copied to its new place.
* NOTE(review): the allocation result is used without a check - confirm
* that mem_heap_alloc_block cannot return NULL here.
*/
re_bytecode_t *tmp_block_start_p = (re_bytecode_t *) mem_heap_alloc_block ((BYTECODE_LEN (bc_ctx_p) - offset),
MEM_HEAP_ALLOC_SHORT_TERM);
memcpy (tmp_block_start_p, src_p, (size_t) (BYTECODE_LEN (bc_ctx_p) - offset));
memcpy (dest_p, tmp_block_start_p, (size_t) (BYTECODE_LEN (bc_ctx_p) - offset));
re_bytecode_t *tmp_block_start_p;
tmp_block_start_p = (re_bytecode_t *) mem_heap_alloc_block ((re_get_bytecode_length (bc_ctx_p) - offset),
MEM_HEAP_ALLOC_SHORT_TERM);
memcpy (tmp_block_start_p, src_p, (size_t) (re_get_bytecode_length (bc_ctx_p) - offset));
memcpy (dest_p, tmp_block_start_p, (size_t) (re_get_bytecode_length (bc_ctx_p) - offset));
mem_heap_free_block (tmp_block_start_p);
}
/* Write the new bytes into the gap that was opened at 'offset'. */
memcpy (src_p, bytecode_p, length);
bc_ctx_p->current_p += length;
} /* bytecode_list_insert */
} /* re_bytecode_list_insert */
/**
* Append a RegExp opcode
*
* Appends sizeof (re_bytecode_t) bytes, taken from the address of 'opcode',
* to the end of the bytecode container.
*/
static void
append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
re_opcode_t opcode) /**< input opcode */
re_append_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
re_opcode_t opcode) /**< input opcode */
{
/*
* NOTE(review): only the first sizeof (re_bytecode_t) bytes of 'opcode' are
* copied; presumably every opcode value fits into them - confirm.
*/
bytecode_list_append (bc_ctx_p, (re_bytecode_t*) &opcode, sizeof (re_bytecode_t));
} /* append_opcode */
re_bytecode_list_append (bc_ctx_p, (re_bytecode_t*) &opcode, sizeof (re_bytecode_t));
} /* re_append_opcode */
/**
* Append a parameter of a RegExp opcode
*
* Appends the sizeof (uint32_t) bytes of 'value' to the end of the bytecode
* container, exactly as they are laid out in memory.
*/
static void
append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t value) /**< input value */
re_append_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t value) /**< input value */
{
bytecode_list_append (bc_ctx_p, (re_bytecode_t*) &value, sizeof (uint32_t));
} /* append_u32 */
re_bytecode_list_append (bc_ctx_p, (re_bytecode_t*) &value, sizeof (uint32_t));
} /* re_append_u32 */
/**
* Append a jump offset parameter of a RegExp opcode
*
* The stored number is 'value' increased by sizeof (uint32_t).
*/
static void
append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t value) /**< input value */
re_append_jump_offset (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t value) /**< input value */
{
/*
* NOTE(review): presumably the extra sizeof (uint32_t) accounts for the
* offset field that is being appended itself - confirm against the matcher.
*/
value += (uint32_t) (sizeof (uint32_t));
append_u32 (bc_ctx_p, value);
} /* append_jump_offset */
re_append_u32 (bc_ctx_p, value);
} /* re_append_jump_offset */
/**
* Insert a RegExp opcode
*
* Inserts sizeof (re_bytecode_t) bytes, taken from the address of 'opcode',
* at 'offset'; the bytecode that was at 'offset' is moved behind it.
*/
static void
insert_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t offset, /**< distance from the start of the container */
re_opcode_t opcode) /**< input opcode */
re_insert_opcode (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t offset, /**< distance from the start of the container */
re_opcode_t opcode) /**< input opcode */
{
/*
* NOTE(review): only the first sizeof (re_bytecode_t) bytes of 'opcode' are
* copied; presumably every opcode value fits into them - confirm.
*/
bytecode_list_insert (bc_ctx_p, offset, (re_bytecode_t*) &opcode, sizeof (re_bytecode_t));
} /* insert_opcode */
re_bytecode_list_insert (bc_ctx_p, offset, (re_bytecode_t*) &opcode, sizeof (re_bytecode_t));
} /* re_insert_opcode */
/**
* Insert a parameter of a RegExp opcode
*
* Inserts the sizeof (uint32_t) bytes of 'value' at 'offset'; the bytecode
* that was at 'offset' is moved behind them.
*/
static void
insert_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t offset, /**< distance from the start of the container */
uint32_t value) /**< input value */
re_insert_u32 (re_bytecode_ctx_t *bc_ctx_p, /**< RegExp bytecode context */
uint32_t offset, /**< distance from the start of the container */
uint32_t value) /**< input value */
{
bytecode_list_insert (bc_ctx_p, offset, (re_bytecode_t*) &value, sizeof (uint32_t));
} /* insert_u32 */
re_bytecode_list_insert (bc_ctx_p, offset, (re_bytecode_t*) &value, sizeof (uint32_t));
} /* re_insert_u32 */
/**
* Get a RegExp opcode
@ -203,23 +207,23 @@ re_get_value (re_bytecode_t **bc_p) /**< pointer to bytecode start */
* Callback function of character class generation
*/
static void
append_char_class (void* re_ctx_p, /**< RegExp compiler context */
uint32_t start, /**< character class range from */
uint32_t end) /**< character class range to */
re_append_char_class (void* re_ctx_p, /**< RegExp compiler context */
uint32_t start, /**< character class range from */
uint32_t end) /**< character class range to */
{
/* FIXME: Handle ignore case flag and add unicode support. */
/* The context arrives as void* and is cast back to the compiler context here. */
re_compiler_ctx_t *ctx_p = (re_compiler_ctx_t*) re_ctx_p;
/* One range is stored as two consecutive u32 values: 'start' first, then 'end'. */
append_u32 (ctx_p->bytecode_ctx_p, start);
append_u32 (ctx_p->bytecode_ctx_p, end);
re_append_u32 (ctx_p->bytecode_ctx_p, start);
re_append_u32 (ctx_p->bytecode_ctx_p, end);
/* Count the appended range in the parser context. */
ctx_p->parser_ctx_p->num_of_classes++;
} /* append_char_class */
} /* re_append_char_class */
/**
* Insert simple atom iterator
*/
static void
insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
uint32_t new_atom_start_offset) /**< atom start offset */
re_insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
uint32_t new_atom_start_offset) /**< atom start offset */
{
uint32_t atom_code_length;
uint32_t offset;
@ -231,30 +235,30 @@ insert_simple_iterator (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler contex
/* FIXME: optimize bytecode length. Store 0 rather than INF */
append_opcode (re_ctx_p->bytecode_ctx_p, RE_OP_MATCH); /* complete 'sub atom' */
uint32_t bytecode_length = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
re_append_opcode (re_ctx_p->bytecode_ctx_p, RE_OP_MATCH); /* complete 'sub atom' */
uint32_t bytecode_length = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
atom_code_length = (uint32_t) (bytecode_length - new_atom_start_offset);
offset = new_atom_start_offset;
insert_u32 (re_ctx_p->bytecode_ctx_p, offset, atom_code_length);
insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmax);
insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmin);
re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, atom_code_length);
re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmax);
re_insert_u32 (re_ctx_p->bytecode_ctx_p, offset, qmin);
if (re_ctx_p->current_token.greedy)
{
insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_GREEDY_ITERATOR);
re_insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_GREEDY_ITERATOR);
}
else
{
insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_NON_GREEDY_ITERATOR);
re_insert_opcode (re_ctx_p->bytecode_ctx_p, offset, RE_OP_NON_GREEDY_ITERATOR);
}
} /* insert_simple_iterator */
} /* re_insert_simple_iterator */
/**
* Get the type of a group start
*/
static re_opcode_t
get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
bool is_capturable) /**< is capturabel group */
re_get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
bool is_capturable) /**< is capturable group */
{
if (is_capturable)
{
@ -282,17 +286,14 @@ get_start_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context
}
return RE_OP_NON_CAPTURE_GROUP_START;
JERRY_UNREACHABLE ();
return 0;
} /* get_start_opcode_type */
} /* re_get_start_opcode_type */
/**
* Get the type of a group end
*/
static re_opcode_t
get_end_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
bool is_capturable) /**< is capturabel group */
re_get_end_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
bool is_capturable) /**< is capturable group */
{
if (is_capturable)
{
@ -310,64 +311,61 @@ get_end_opcode_type (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context *
}
return RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END;
JERRY_UNREACHABLE ();
return 0;
} /* get_end_opcode_type */
} /* re_get_end_opcode_type */
/**
* Enclose the given bytecode in a group
*
* A start opcode followed by the group index is inserted at
* group_start_offset, and an end opcode followed by the index, qmin, qmax and
* a jump offset is appended after the existing bytecode. For start opcodes
* other than the two plain GROUP_START ones an additional u32 is inserted
* right after the start header.
*/
static void
insert_into_group (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
uint32_t group_start_offset, /**< offset of group start */
uint32_t idx, /**< index of group */
bool is_capturable) /**< is capturabel group */
re_insert_into_group (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
uint32_t group_start_offset, /**< offset of group start */
uint32_t idx, /**< index of group */
bool is_capturable) /**< is capturable group */
{
uint32_t qmin, qmax;
re_opcode_t start_opcode = get_start_opcode_type (re_ctx_p, is_capturable);
re_opcode_t end_opcode = get_end_opcode_type (re_ctx_p, is_capturable);
re_opcode_t start_opcode = re_get_start_opcode_type (re_ctx_p, is_capturable);
re_opcode_t end_opcode = re_get_end_opcode_type (re_ctx_p, is_capturable);
uint32_t start_head_offset_len;
/* The quantifier bounds come from the token that is currently being compiled. */
qmin = re_ctx_p->current_token.qmin;
qmax = re_ctx_p->current_token.qmax;
JERRY_ASSERT (qmin <= qmax);
/*
* Both inserts below use the same offset, so the opcode ends up in front of
* idx. The bytecode length is sampled before and after the inserts, which
* leaves the size of the inserted start header in start_head_offset_len.
* After that the end opcode and its idx, qmin and qmax parameters are
* appended.
*/
start_head_offset_len = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
insert_u32 (re_ctx_p->bytecode_ctx_p, group_start_offset, idx);
insert_opcode (re_ctx_p->bytecode_ctx_p, group_start_offset, start_opcode);
start_head_offset_len = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - start_head_offset_len;
append_opcode (re_ctx_p->bytecode_ctx_p, end_opcode);
append_u32 (re_ctx_p->bytecode_ctx_p, idx);
append_u32 (re_ctx_p->bytecode_ctx_p, qmin);
append_u32 (re_ctx_p->bytecode_ctx_p, qmax);
start_head_offset_len = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
re_insert_u32 (re_ctx_p->bytecode_ctx_p, group_start_offset, idx);
re_insert_opcode (re_ctx_p->bytecode_ctx_p, group_start_offset, start_opcode);
start_head_offset_len = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - start_head_offset_len;
re_append_opcode (re_ctx_p->bytecode_ctx_p, end_opcode);
re_append_u32 (re_ctx_p->bytecode_ctx_p, idx);
re_append_u32 (re_ctx_p->bytecode_ctx_p, qmin);
re_append_u32 (re_ctx_p->bytecode_ctx_p, qmax);
/* From here on group_start_offset points just past the inserted start header. */
group_start_offset += start_head_offset_len;
/* The appended jump offset is based on the distance from that point to the current end of the bytecode. */
append_jump_offset (re_ctx_p->bytecode_ctx_p,
BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - group_start_offset);
re_append_jump_offset (re_ctx_p->bytecode_ctx_p,
re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset);
/* Every start opcode except the two plain GROUP_START ones gets an extra u32 right after its header. */
if (start_opcode != RE_OP_CAPTURE_GROUP_START && start_opcode != RE_OP_NON_CAPTURE_GROUP_START)
{
insert_u32 (re_ctx_p->bytecode_ctx_p,
group_start_offset,
BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - group_start_offset);
re_insert_u32 (re_ctx_p->bytecode_ctx_p,
group_start_offset,
re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset);
}
} /* insert_into_group */
} /* re_insert_into_group */
/**
* Enclose the given bytecode to a group and insert jump value
*
* First inserts a u32 at group_start_offset holding the distance from
* group_start_offset to the current end of the bytecode, then wraps the
* result into a group starting at the same offset.
*/
static void
insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
uint32_t group_start_offset, /**< offset of group start */
uint32_t idx, /**< index of group */
bool is_capturable) /**< is capturabel group */
re_insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
uint32_t group_start_offset, /**< offset of group start */
uint32_t idx, /**< index of group */
bool is_capturable) /**< is capturable group */
{
/* The jump value is computed before the insert, so it does not include the inserted u32 itself. */
insert_u32 (re_ctx_p->bytecode_ctx_p,
group_start_offset,
BYTECODE_LEN (re_ctx_p->bytecode_ctx_p) - group_start_offset);
insert_into_group (re_ctx_p, group_start_offset, idx, is_capturable);
} /* insert_into_group_with_jump */
re_insert_u32 (re_ctx_p->bytecode_ctx_p,
group_start_offset,
re_get_bytecode_length (re_ctx_p->bytecode_ctx_p) - group_start_offset);
re_insert_into_group (re_ctx_p, group_start_offset, idx, is_capturable);
} /* re_insert_into_group_with_jump */
/**
* Parse alternatives
@ -376,14 +374,14 @@ insert_into_group_with_jump (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler c
* Returned value must be freed with ecma_free_completion_value
*/
static ecma_completion_value_t
parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
bool expect_eof) /**< expect end of file */
re_parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
bool expect_eof) /**< expect end of file */
{
uint32_t idx;
re_bytecode_ctx_t *bc_ctx_p = re_ctx_p->bytecode_ctx_p;
ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();
uint32_t alterantive_offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
uint32_t alterantive_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
if (re_ctx_p->recursion_depth >= RE_COMPILE_RECURSION_LIMIT)
{
@ -399,11 +397,12 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
&(re_ctx_p->current_token)),
ret_value);
ECMA_FINALIZE (empty);
if (!ecma_is_completion_value_empty (ret_value))
{
return ret_value; /* error */
}
uint32_t new_atom_start_offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
uint32_t new_atom_start_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
switch (re_ctx_p->current_token.type)
{
@ -412,10 +411,11 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
idx = re_ctx_p->num_of_captures++;
JERRY_DDLOG ("Compile a capture group start (idx: %d)\n", idx);
ret_value = parse_alternative (re_ctx_p, false);
ret_value = re_parse_alternative (re_ctx_p, false);
if (ecma_is_completion_value_empty (ret_value))
{
insert_into_group (re_ctx_p, new_atom_start_offset, idx, true);
re_insert_into_group (re_ctx_p, new_atom_start_offset, idx, true);
}
else
{
@ -428,10 +428,11 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
idx = re_ctx_p->num_of_non_captures++;
JERRY_DDLOG ("Compile a non-capture group start (idx: %d)\n", idx);
ret_value = parse_alternative (re_ctx_p, false);
ret_value = re_parse_alternative (re_ctx_p, false);
if (ecma_is_completion_value_empty (ret_value))
{
insert_into_group (re_ctx_p, new_atom_start_offset, idx, false);
re_insert_into_group (re_ctx_p, new_atom_start_offset, idx, false);
}
else
{
@ -444,70 +445,71 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
JERRY_DDLOG ("Compile character token: %c, qmin: %d, qmax: %d\n",
re_ctx_p->current_token.value, re_ctx_p->current_token.qmin, re_ctx_p->current_token.qmax);
append_opcode (bc_ctx_p, RE_OP_CHAR);
append_u32 (bc_ctx_p, re_ctx_p->current_token.value);
re_append_opcode (bc_ctx_p, RE_OP_CHAR);
re_append_u32 (bc_ctx_p, re_ctx_p->current_token.value);
if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1))
{
insert_simple_iterator (re_ctx_p, new_atom_start_offset);
re_insert_simple_iterator (re_ctx_p, new_atom_start_offset);
}
break;
}
case RE_TOK_PERIOD:
{
JERRY_DDLOG ("Compile a period\n");
append_opcode (bc_ctx_p, RE_OP_PERIOD);
re_append_opcode (bc_ctx_p, RE_OP_PERIOD);
if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1))
{
insert_simple_iterator (re_ctx_p, new_atom_start_offset);
re_insert_simple_iterator (re_ctx_p, new_atom_start_offset);
}
break;
}
case RE_TOK_ALTERNATIVE:
{
JERRY_DDLOG ("Compile an alternative\n");
insert_u32 (bc_ctx_p, alterantive_offset, BYTECODE_LEN (bc_ctx_p) - alterantive_offset);
append_opcode (bc_ctx_p, RE_OP_ALTERNATIVE);
alterantive_offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
re_insert_u32 (bc_ctx_p, alterantive_offset, re_get_bytecode_length (bc_ctx_p) - alterantive_offset);
re_append_opcode (bc_ctx_p, RE_OP_ALTERNATIVE);
alterantive_offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
break;
}
case RE_TOK_ASSERT_START:
{
JERRY_DDLOG ("Compile a start assertion\n");
append_opcode (bc_ctx_p, RE_OP_ASSERT_START);
re_append_opcode (bc_ctx_p, RE_OP_ASSERT_START);
break;
}
case RE_TOK_ASSERT_END:
{
JERRY_DDLOG ("Compile an end assertion\n");
append_opcode (bc_ctx_p, RE_OP_ASSERT_END);
re_append_opcode (bc_ctx_p, RE_OP_ASSERT_END);
break;
}
case RE_TOK_ASSERT_WORD_BOUNDARY:
{
JERRY_DDLOG ("Compile a word boundary assertion\n");
append_opcode (bc_ctx_p, RE_OP_ASSERT_WORD_BOUNDARY);
re_append_opcode (bc_ctx_p, RE_OP_ASSERT_WORD_BOUNDARY);
break;
}
case RE_TOK_ASSERT_NOT_WORD_BOUNDARY:
{
JERRY_DDLOG ("Compile a not word boundary assertion\n");
append_opcode (bc_ctx_p, RE_OP_ASSERT_NOT_WORD_BOUNDARY);
re_append_opcode (bc_ctx_p, RE_OP_ASSERT_NOT_WORD_BOUNDARY);
break;
}
case RE_TOK_ASSERT_START_POS_LOOKAHEAD:
{
JERRY_DDLOG ("Compile a positive lookahead assertion\n");
idx = re_ctx_p->num_of_non_captures++;
append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_POS);
re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_POS);
ret_value = re_parse_alternative (re_ctx_p, false);
ret_value = parse_alternative (re_ctx_p, false);
if (ecma_is_completion_value_empty (ret_value))
{
append_opcode (bc_ctx_p, RE_OP_MATCH);
re_append_opcode (bc_ctx_p, RE_OP_MATCH);
insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
}
else
{
@ -519,14 +521,15 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
{
JERRY_DDLOG ("Compile a negative lookahead assertion\n");
idx = re_ctx_p->num_of_non_captures++;
append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_NEG);
re_append_opcode (bc_ctx_p, RE_OP_LOOKAHEAD_NEG);
ret_value = re_parse_alternative (re_ctx_p, false);
ret_value = parse_alternative (re_ctx_p, false);
if (ecma_is_completion_value_empty (ret_value))
{
append_opcode (bc_ctx_p, RE_OP_MATCH);
re_append_opcode (bc_ctx_p, RE_OP_MATCH);
insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
}
else
{
@ -538,39 +541,42 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
{
uint32_t backref = (uint32_t) re_ctx_p->current_token.value;
idx = re_ctx_p->num_of_non_captures++;
if (backref > re_ctx_p->highest_backref)
{
re_ctx_p->highest_backref = backref;
}
JERRY_DDLOG ("Compile a backreference: %d\n", backref);
append_opcode (bc_ctx_p, RE_OP_BACKREFERENCE);
append_u32 (bc_ctx_p, backref);
insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
JERRY_DDLOG ("Compile a backreference: %d\n", backref);
re_append_opcode (bc_ctx_p, RE_OP_BACKREFERENCE);
re_append_u32 (bc_ctx_p, backref);
re_insert_into_group_with_jump (re_ctx_p, new_atom_start_offset, idx, false);
break;
}
case RE_TOK_START_CHAR_CLASS:
case RE_TOK_START_INV_CHAR_CLASS:
{
JERRY_DDLOG ("Compile a character class\n");
append_opcode (bc_ctx_p,
re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS
? RE_OP_CHAR_CLASS
: RE_OP_INV_CHAR_CLASS);
uint32_t offset = BYTECODE_LEN (re_ctx_p->bytecode_ctx_p);
re_append_opcode (bc_ctx_p,
re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS
? RE_OP_CHAR_CLASS
: RE_OP_INV_CHAR_CLASS);
uint32_t offset = re_get_bytecode_length (re_ctx_p->bytecode_ctx_p);
ECMA_TRY_CATCH (empty,
re_parse_char_class (re_ctx_p->parser_ctx_p,
append_char_class,
re_append_char_class,
re_ctx_p,
&(re_ctx_p->current_token)),
ret_value);
insert_u32 (bc_ctx_p, offset, re_ctx_p->parser_ctx_p->num_of_classes);
re_insert_u32 (bc_ctx_p, offset, re_ctx_p->parser_ctx_p->num_of_classes);
if ((re_ctx_p->current_token.qmin != 1) || (re_ctx_p->current_token.qmax != 1))
{
insert_simple_iterator (re_ctx_p, new_atom_start_offset);
re_insert_simple_iterator (re_ctx_p, new_atom_start_offset);
}
ECMA_FINALIZE (empty);
break;
}
@ -584,7 +590,7 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
}
else
{
insert_u32 (bc_ctx_p, alterantive_offset, BYTECODE_LEN (bc_ctx_p) - alterantive_offset);
re_insert_u32 (bc_ctx_p, alterantive_offset, re_get_bytecode_length (bc_ctx_p) - alterantive_offset);
re_ctx_p->recursion_depth--;
}
@ -598,7 +604,7 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
}
else
{
insert_u32 (bc_ctx_p, alterantive_offset, BYTECODE_LEN (bc_ctx_p) - alterantive_offset);
re_insert_u32 (bc_ctx_p, alterantive_offset, re_get_bytecode_length (bc_ctx_p) - alterantive_offset);
re_ctx_p->recursion_depth--;
}
@ -614,7 +620,7 @@ parse_alternative (re_compiler_ctx_t *re_ctx_p, /**< RegExp compiler context */
JERRY_UNREACHABLE ();
return ret_value;
} /* parse_alternative */
} /* re_parse_alternative */
/**
* Compilation of RegExp bytecode
@ -656,9 +662,9 @@ re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */
/* 1. Parse RegExp pattern */
re_ctx.num_of_captures = 1;
append_opcode (&bc_ctx, RE_OP_SAVE_AT_START);
re_append_opcode (&bc_ctx, RE_OP_SAVE_AT_START);
ECMA_TRY_CATCH (empty, parse_alternative (&re_ctx, true), ret_value);
ECMA_TRY_CATCH (empty, re_parse_alternative (&re_ctx, true), ret_value);
/* 2. Check for invalid backreference */
if (re_ctx.highest_backref >= re_ctx.num_of_captures)
@ -667,13 +673,13 @@ re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */
}
else
{
append_opcode (&bc_ctx, RE_OP_SAVE_AND_MATCH);
append_opcode (&bc_ctx, RE_OP_EOF);
re_append_opcode (&bc_ctx, RE_OP_SAVE_AND_MATCH);
re_append_opcode (&bc_ctx, RE_OP_EOF);
/* 3. Insert extra information for bytecode header */
insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.num_of_non_captures);
insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.num_of_captures * 2);
insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.flags);
re_insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.num_of_non_captures);
re_insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.num_of_captures * 2);
re_insert_u32 (&bc_ctx, 0, (uint32_t) re_ctx.flags);
}
ECMA_FINALIZE (empty);
@ -684,7 +690,7 @@ re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */
MEM_FINALIZE_LOCAL_ARRAY (pattern_start_p);
#ifdef JERRY_ENABLE_LOG
regexp_dump_bytecode (&bc_ctx);
re_dump_bytecode (&bc_ctx);
#endif
return ret_value;
@ -695,7 +701,7 @@ re_compile_bytecode (ecma_property_t *bytecode_p, /**< bytecode */
* RegExp bytecode dumper
*/
void
regexp_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p)
re_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p)
{
re_bytecode_t *bytecode_p = bc_ctx_p->block_start_p;
JERRY_DLOG ("%d ", re_get_value (&bytecode_p));
@ -889,7 +895,7 @@ regexp_dump_bytecode (re_bytecode_ctx_t *bc_ctx_p)
}
}
JERRY_DLOG ("EOF\n");
} /* regexp_dump_bytecode */
} /* re_dump_bytecode */
#endif /* JERRY_ENABLE_LOG */
#endif /* CONFIG_ECMA_COMPACT_PROFILE_DISABLE_REGEXP_BUILTIN */

View File

@ -22,77 +22,83 @@
#include "ecma-globals.h"
#include "re-parser.h"
/* RegExp opcodes
* Group opcode order is important, because RE_IS_CAPTURE_GROUP is based on it.
* Change it carfully. Capture opcodes should be at first.
/**
* RegExp opcodes
*/
#define RE_OP_EOF 0
typedef enum
{
RE_OP_EOF,
/* Group opcode order is important, because RE_IS_CAPTURE_GROUP is based on it.
* Change it carefully. Capture opcodes should be at first.
*/
RE_OP_CAPTURE_GROUP_START,
RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START,
RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START,
RE_OP_CAPTURE_GREEDY_GROUP_END,
RE_OP_CAPTURE_NON_GREEDY_GROUP_END,
RE_OP_NON_CAPTURE_GROUP_START,
RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START,
RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START,
RE_OP_NON_CAPTURE_GREEDY_GROUP_END,
RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END,
#define RE_OP_CAPTURE_GROUP_START 1
#define RE_OP_CAPTURE_GREEDY_ZERO_GROUP_START 2
#define RE_OP_CAPTURE_NON_GREEDY_ZERO_GROUP_START 3
#define RE_OP_CAPTURE_GREEDY_GROUP_END 4
#define RE_OP_CAPTURE_NON_GREEDY_GROUP_END 5
#define RE_OP_NON_CAPTURE_GROUP_START 6
#define RE_OP_NON_CAPTURE_GREEDY_ZERO_GROUP_START 7
#define RE_OP_NON_CAPTURE_NON_GREEDY_ZERO_GROUP_START 8
#define RE_OP_NON_CAPTURE_GREEDY_GROUP_END 9
#define RE_OP_NON_CAPTURE_NON_GREEDY_GROUP_END 10
#define RE_OP_MATCH 11
#define RE_OP_CHAR 12
#define RE_OP_SAVE_AT_START 13
#define RE_OP_SAVE_AND_MATCH 14
#define RE_OP_PERIOD 15
#define RE_OP_ALTERNATIVE 16
#define RE_OP_GREEDY_ITERATOR 17
#define RE_OP_NON_GREEDY_ITERATOR 18
#define RE_OP_ASSERT_START 19
#define RE_OP_ASSERT_END 20
#define RE_OP_ASSERT_WORD_BOUNDARY 21
#define RE_OP_ASSERT_NOT_WORD_BOUNDARY 22
#define RE_OP_LOOKAHEAD_POS 23
#define RE_OP_LOOKAHEAD_NEG 24
#define RE_OP_BACKREFERENCE 25
#define RE_OP_CHAR_CLASS 26
#define RE_OP_INV_CHAR_CLASS 27
RE_OP_MATCH,
RE_OP_CHAR,
RE_OP_SAVE_AT_START,
RE_OP_SAVE_AND_MATCH,
RE_OP_PERIOD,
RE_OP_ALTERNATIVE,
RE_OP_GREEDY_ITERATOR,
RE_OP_NON_GREEDY_ITERATOR,
RE_OP_ASSERT_START,
RE_OP_ASSERT_END,
RE_OP_ASSERT_WORD_BOUNDARY,
RE_OP_ASSERT_NOT_WORD_BOUNDARY,
RE_OP_LOOKAHEAD_POS,
RE_OP_LOOKAHEAD_NEG,
RE_OP_BACKREFERENCE,
RE_OP_CHAR_CLASS,
RE_OP_INV_CHAR_CLASS
} re_opcode_t;
/**
* Recursion limit of RegExp compiler
*/
#define RE_COMPILE_RECURSION_LIMIT 100
/**
* Check if a RegExp opcode is a capture group or not
*/
#define RE_IS_CAPTURE_GROUP(x) (((x) < RE_OP_NON_CAPTURE_GROUP_START) ? 1 : 0)
typedef uint8_t re_opcode_t; /* type of RegExp opcodes */
typedef uint8_t re_bytecode_t; /* type of standard bytecode elements (ex.: opcode parameters) */
/**
* Type of bytecode elements
*/
typedef uint8_t re_bytecode_t;
/**
* Context of RegExp bytecode container
*
* FIXME:
* Add comments with description of the structure members
*/
typedef struct
{
re_bytecode_t *block_start_p;
re_bytecode_t *block_end_p;
re_bytecode_t *current_p;
re_bytecode_t *block_start_p; /**< start of bytecode block */
re_bytecode_t *block_end_p; /**< end of bytecode block */
re_bytecode_t *current_p; /**< current position in bytecode */
} re_bytecode_ctx_t;
/**
* Context of RegExp compiler
*
* FIXME:
* Add comments with description of the structure members
*/
typedef struct
{
uint8_t flags;
uint32_t recursion_depth;
uint32_t num_of_captures;
uint32_t num_of_non_captures;
uint32_t highest_backref;
re_bytecode_ctx_t *bytecode_ctx_p;
re_token_t current_token;
re_parser_ctx_t *parser_ctx_p;
uint8_t flags; /**< RegExp flags */
uint32_t recursion_depth; /**< recursion depth */
uint32_t num_of_captures; /**< number of capture groups */
uint32_t num_of_non_captures; /**< number of non-capture groups */
uint32_t highest_backref; /**< highest backreference */
re_bytecode_ctx_t *bytecode_ctx_p; /**< pointer to RegExp bytecode context */
re_token_t current_token; /**< current token */
re_parser_ctx_t *parser_ctx_p; /**< pointer to RegExp parser context */
} re_compiler_ctx_t;
ecma_completion_value_t

View File

@ -32,14 +32,19 @@
/* FIXME: change it when unicode support is implemented */
#define RE_ADVANCE(str_p, advance) do { str_p += advance; } while (0)
/**
* Get next input character
*
* @return ecma_char_t
*/
static ecma_char_t
get_ecma_char (lit_utf8_byte_t **char_p)
re_get_ecma_char (lit_utf8_byte_t **char_p) /**< pointer to the input string pointer */
{
/* FIXME: change to string iterator with unicode support when it is implemented */
ecma_char_t ch = **char_p;
RE_ADVANCE (*char_p, 1);
return ch;
} /* get_ecma_char */
} /* re_get_ecma_char */
/**
* Parse RegExp iterators
@ -48,7 +53,7 @@ get_ecma_char (lit_utf8_byte_t **char_p)
* Returned value must be freed with ecma_free_completion_value
*/
static ecma_completion_value_t
parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */
re_parse_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */
re_token_t *re_token_p, /**< output token */
uint32_t lookup, /**< size of lookup */
uint32_t *advance_p) /**< output length of current advance */
@ -64,6 +69,7 @@ parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */
{
re_token_p->qmin = 0;
re_token_p->qmax = 1;
if (ch1 == '?')
{
*advance_p = 2;
@ -80,6 +86,7 @@ parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */
{
re_token_p->qmin = 0;
re_token_p->qmax = RE_ITERATOR_INFINITE;
if (ch1 == '?')
{
*advance_p = 2;
@ -96,6 +103,7 @@ parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */
{
re_token_p->qmin = 1;
re_token_p->qmax = RE_ITERATOR_INFINITE;
if (ch1 == '?')
{
*advance_p = 2;
@ -113,6 +121,7 @@ parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */
uint32_t qmin = 0;
uint32_t qmax = RE_ITERATOR_INFINITE;
uint32_t digits = 0;
while (true)
{
(*advance_p)++;
@ -212,7 +221,7 @@ parse_re_iterator (lit_utf8_byte_t *pattern_p, /**< RegExp pattern */
}
return ret_value;
} /* parse_re_iterator */
} /* re_parse_iterator */
/**
* Count the number of groups in pattern
@ -224,17 +233,17 @@ re_count_num_of_groups (re_parser_ctx_t *parser_ctx_p) /**< RegExp parser contex
ecma_char_t ch1;
int char_class_in = 0;
parser_ctx_p->num_of_groups = 0;
ch1 = re_get_ecma_char (&pattern_p);
ch1 = get_ecma_char (&pattern_p);
while (ch1 != LIT_CHAR_NULL)
{
ecma_char_t ch0 = ch1;
ch1 = get_ecma_char (&pattern_p);
ch1 = re_get_ecma_char (&pattern_p);
switch (ch0)
{
case '\\':
{
ch1 = get_ecma_char (&pattern_p);
ch1 = re_get_ecma_char (&pattern_p);
break;
}
case '[':
@ -286,7 +295,8 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
do
{
ecma_char_t ch = get_ecma_char (pattern_p);
ecma_char_t ch = re_get_ecma_char (pattern_p);
if (ch == ']')
{
if (start != RE_CHAR_UNDEF)
@ -305,7 +315,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
}
else if (ch == '\\')
{
ch = get_ecma_char (pattern_p);
ch = re_get_ecma_char (pattern_p);
if (ch == 'b')
{
@ -333,7 +343,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
}
else if (ch == 'c')
{
ch = get_ecma_char (pattern_p);
ch = re_get_ecma_char (pattern_p);
if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
{
ch = (ch % 32);
@ -479,7 +489,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
uint32_t advance = 0;
ECMA_TRY_CATCH (empty,
parse_re_iterator (parser_ctx_p->current_char_p,
re_parse_iterator (parser_ctx_p->current_char_p,
out_token_p,
0,
&advance),
@ -502,8 +512,8 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
{
ecma_completion_value_t ret_value = ecma_make_empty_completion_value ();
uint32_t advance = 0;
ecma_char_t ch0 = *(parser_ctx_p->current_char_p);
switch (ch0)
{
case '|':
@ -527,7 +537,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
case '.':
{
ECMA_TRY_CATCH (empty,
parse_re_iterator (parser_ctx_p->current_char_p,
re_parse_iterator (parser_ctx_p->current_char_p,
out_token_p,
1,
&advance),
@ -574,6 +584,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
else if (ch1 == 'c')
{
ecma_char_t ch2 = RE_LOOKUP (parser_ctx_p->current_char_p, 2);
if ((ch2 >= 'A' && ch2 <= 'Z') || (ch2 >= 'a' && ch2 <= 'z'))
{
advance = 3;
@ -702,7 +713,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
uint32_t iter_adv = 0;
ECMA_TRY_CATCH (empty,
parse_re_iterator (parser_ctx_p->current_char_p,
re_parse_iterator (parser_ctx_p->current_char_p,
out_token_p,
advance,
&iter_adv),
@ -716,6 +727,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
if (RE_LOOKUP (parser_ctx_p->current_char_p, 1) == '?')
{
ecma_char_t ch2 = RE_LOOKUP (parser_ctx_p->current_char_p, 2);
if (ch2 == '=')
{
/* (?= */
@ -746,7 +758,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
case ')':
{
ECMA_TRY_CATCH (empty,
parse_re_iterator (parser_ctx_p->current_char_p,
re_parse_iterator (parser_ctx_p->current_char_p,
out_token_p,
1,
&advance),
@ -786,7 +798,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
default:
{
ECMA_TRY_CATCH (empty,
parse_re_iterator (parser_ctx_p->current_char_p,
re_parse_iterator (parser_ctx_p->current_char_p,
out_token_p,
1,
&advance),

View File

@ -21,32 +21,42 @@
#include "opcodes-dumper.h"
typedef uint8_t token_type_t;
#define RE_TOK_EOF 0 /* EOF */
#define RE_TOK_BACKREFERENCE 1 /* \[0..9] */
#define RE_TOK_CHAR 2 /* any character */
#define RE_TOK_ALTERNATIVE 3 /* | */
#define RE_TOK_ASSERT_START 4 /* ^ */
#define RE_TOK_ASSERT_END 5 /* $ */
#define RE_TOK_PERIOD 6 /* . */
#define RE_TOK_START_CAPTURE_GROUP 7 /* ( */
#define RE_TOK_START_NON_CAPTURE_GROUP 8 /* (?: */
#define RE_TOK_END_GROUP 9 /* ')' */
#define RE_TOK_ASSERT_START_POS_LOOKAHEAD 10 /* (?= */
#define RE_TOK_ASSERT_START_NEG_LOOKAHEAD 11 /* (?! */
#define RE_TOK_ASSERT_WORD_BOUNDARY 12 /* \b */
#define RE_TOK_ASSERT_NOT_WORD_BOUNDARY 13 /* \B */
#define RE_TOK_DIGIT 14 /* \d */
#define RE_TOK_NOT_DIGIT 15 /* \D */
#define RE_TOK_WHITE 16 /* \s */
#define RE_TOK_NOT_WHITE 17 /* \S */
#define RE_TOK_WORD_CHAR 18 /* \w */
#define RE_TOK_NOT_WORD_CHAR 19 /* \W */
#define RE_TOK_START_CHAR_CLASS 20 /* [ ] */
#define RE_TOK_START_INV_CHAR_CLASS 21 /* [^ ] */
/**
* RegExp token type definitions
*
* The comment after each enumerator shows the pattern syntax (or kind of
* input) that the token stands for.
*/
typedef enum
{
RE_TOK_EOF, /**< EOF */
RE_TOK_BACKREFERENCE, /**< \[0..9] */
RE_TOK_CHAR, /**< any character */
RE_TOK_ALTERNATIVE, /**< | */
RE_TOK_ASSERT_START, /**< ^ */
RE_TOK_ASSERT_END, /**< $ */
RE_TOK_PERIOD, /**< . */
RE_TOK_START_CAPTURE_GROUP, /**< ( */
RE_TOK_START_NON_CAPTURE_GROUP, /**< (?: */
RE_TOK_END_GROUP, /**< ')' */
RE_TOK_ASSERT_START_POS_LOOKAHEAD, /**< (?= */
RE_TOK_ASSERT_START_NEG_LOOKAHEAD, /**< (?! */
RE_TOK_ASSERT_WORD_BOUNDARY, /**< \b */
RE_TOK_ASSERT_NOT_WORD_BOUNDARY, /**< \B */
RE_TOK_DIGIT, /**< \d */
RE_TOK_NOT_DIGIT, /**< \D */
RE_TOK_WHITE, /**< \s */
RE_TOK_NOT_WHITE, /**< \S */
RE_TOK_WORD_CHAR, /**< \w */
RE_TOK_NOT_WORD_CHAR, /**< \W */
RE_TOK_START_CHAR_CLASS, /**< [ ] */
RE_TOK_START_INV_CHAR_CLASS, /**< [^ ] */
} re_token_type_t;
/**
* RegExp constant for an infinite (unbounded) iterator count
*/
#define RE_ITERATOR_INFINITE ((uint32_t)-1)
/**
* Maximum number of decimal escape digits
*/
#define RE_MAX_RE_DECESC_DIGITS 9
/* FIXME: depends on unicode support */
@ -60,21 +70,27 @@ typedef uint8_t token_type_t;
#define RE_CONTROL_CHAR_FF 0x000c /* \f */
#define RE_CONTROL_CHAR_CR 0x000d /* \r */
/**
* RegExp token type
*/
typedef struct
{
token_type_t type;
uint32_t value;
uint32_t qmin;
uint32_t qmax;
bool greedy;
re_token_type_t type; /**< type of the token */
uint32_t value; /**< value of the token */
uint32_t qmin; /**< minimum number of token iterations */
uint32_t qmax; /**< maximum number of token iterations */
bool greedy; /**< type of iteration */
} re_token_t;
/**
* RegExp parser context
*/
typedef struct
{
lit_utf8_byte_t *pattern_start_p;
lit_utf8_byte_t *current_char_p;
int num_of_groups;
uint32_t num_of_classes;
lit_utf8_byte_t *pattern_start_p; /**< start of input pattern string */
lit_utf8_byte_t *current_char_p; /**< current character in input pattern */
int num_of_groups; /**< number of groups */
uint32_t num_of_classes; /**< number of character classes */
} re_parser_ctx_t;
typedef void (*re_char_class_callback) (void *re_ctx_p, uint32_t start, uint32_t end);