Replace convert_string_to_token_transform_escape_seq with lexer_transform_escape_sequences that outputs transformed charset to specified buffer, instead of creating token from the charset.

Also the commit introduces lexer_convert_escape_sequence_digits_to_char, adds 'lexer' prefix to convert_single_escape_character and replaces character constants with corresponding LIT_CHAR_* definitions in the function.

JerryScript-DCO-1.0-Signed-off-by: Ruben Ayrapetyan r.ayrapetyan@samsung.com
This commit is contained in:
Ruben Ayrapetyan 2015-07-05 23:19:38 +03:00
parent b5d30de4d7
commit 6432e4d7cf

View File

@ -132,30 +132,316 @@ create_token (token_type type, /**< type of token */
} /* create_token */
/**
* Convert string to token of specified type
* Create token of specified type from charset
*
* @return token descriptor
*/
static token
convert_string_to_token (token_type tt, /**< token type */
const lit_utf8_byte_t *str_p, /**< characters buffer */
lit_utf8_size_t length) /**< string's length */
lexer_create_token_for_charset (token_type tt, /**< token type */
const lit_utf8_byte_t *charset_p, /**< charset buffer */
lit_utf8_size_t size) /**< size of the charset */
{
JERRY_ASSERT (str_p != NULL);
JERRY_ASSERT (charset_p != NULL);
literal_t lit = lit_find_literal_by_utf8_string (str_p, length);
literal_t lit = lit_find_literal_by_utf8_string (charset_p, size);
if (lit != NULL)
{
return create_token_from_lit (tt, lit);
}
lit = lit_create_literal_from_utf8_string (str_p, length);
lit = lit_create_literal_from_utf8_string (charset_p, size);
JERRY_ASSERT (lit->get_type () == LIT_STR_T
|| lit->get_type () == LIT_MAGIC_STR_T
|| lit->get_type () == LIT_MAGIC_STR_EX_T);
return create_token_from_lit (tt, lit);
}
} /* lexer_create_token_for_charset */
/**
* Try to decode specified character as SingleEscapeCharacter (ECMA-262, v5, 7.8.4)
*
* If specified character is a SingleEscapeCharacter, convert it according to ECMA-262 v5, Table 4.
* Otherwise, output it as is.
*
* @return true - if specified character is a SingleEscapeCharacter,
* false - otherwise.
*/
static bool
lexer_convert_single_escape_character (ecma_char_t c, /**< character to decode */
ecma_char_t *out_converted_char_p) /**< out: decoded character */
{
ecma_char_t converted_char;
bool is_single_escape_character = true;
switch (c)
{
case LIT_CHAR_LOWERCASE_B:
{
converted_char = LIT_CHAR_BS;
break;
}
case LIT_CHAR_LOWERCASE_T:
{
converted_char = LIT_CHAR_TAB;
break;
}
case LIT_CHAR_LOWERCASE_N:
{
converted_char = LIT_CHAR_LF;
break;
}
case LIT_CHAR_LOWERCASE_V:
{
converted_char = LIT_CHAR_VTAB;
break;
}
case LIT_CHAR_LOWERCASE_F:
{
converted_char = LIT_CHAR_FF;
break;
}
case LIT_CHAR_LOWERCASE_R:
{
converted_char = LIT_CHAR_CR;
break;
}
case LIT_CHAR_DOUBLE_QUOTE:
case LIT_CHAR_SINGLE_QUOTE:
case LIT_CHAR_BACKSLASH:
{
converted_char = c;
break;
}
default:
{
converted_char = c;
is_single_escape_character = false;
break;
}
}
if (out_converted_char_p != NULL)
{
*out_converted_char_p = converted_char;
}
return is_single_escape_character;
} /* lexer_convert_single_escape_character */
/**
* Transform specified number of hexadecimal digits pointed by string iterator to character code
*
* @return true - upon successful conversion,
* false - otherwise (characters, pointed by iterator, are not hexadecimal digits,
* or number of characters until end of string is less than specified).
*/
static bool
lexer_convert_escape_sequence_digits_to_char (lit_utf8_iterator_t *src_iter_p, /**< string iterator */
bool is_unicode_escape_sequence, /**< UnicodeEscapeSequence (true)
* or HexEscapeSequence (false) */
ecma_char_t *out_converted_char_p) /**< out: converted character */
{
uint16_t char_code = 0;
const uint32_t digits_num = is_unicode_escape_sequence ? 4 : 2;
for (uint32_t i = 0; i < digits_num; i++)
{
if (lit_utf8_iterator_is_eos (src_iter_p))
{
return false;
}
const ecma_char_t next_char = lit_utf8_iterator_read_next (src_iter_p);
if (!lit_char_is_hex_digit (next_char))
{
return false;
}
else
{
/*
* Check that highest 4 bits are zero, so the value would not overflow.
*/
JERRY_ASSERT ((char_code & 0xF000u) == 0);
char_code = (uint16_t) (char_code << 4u);
char_code = (uint16_t) (char_code + lit_char_hex_to_int (next_char));
}
}
*out_converted_char_p = (ecma_char_t) char_code;
return true;
} /* lexer_convert_escape_sequence_digits_to_char */
/**
* Transforming escape sequences in the charset, outputting converted string to specified buffer
*
* Note:
* Size of string with transformed escape sequences is always
* less or equal to size of corresponding source string.
*
* @return size of converted string
*/
static lit_utf8_size_t
lexer_transform_escape_sequences (const jerry_api_char_t *source_str_p, /**< string to convert,
* located in source buffer */
lit_utf8_size_t source_str_size, /**< size of the string and of the output buffer */
jerry_api_char_t *output_str_buf_p) /**< output buffer for converted string */
{
if (source_str_size == 0)
{
return 0;
}
else
{
JERRY_ASSERT (source_str_p != NULL);
}
lit_utf8_byte_t *output_str_buf_iter_p = output_str_buf_p;
const size_t output_str_buf_size = source_str_size;
bool is_correct_sequence = true;
lit_utf8_iterator_t source_str_iter = lit_utf8_iterator_create (source_str_p,
source_str_size);
ecma_char_t prev_converted_char = LIT_CHAR_NULL;
while (!lit_utf8_iterator_is_eos (&source_str_iter))
{
ecma_char_t converted_char;
const ecma_char_t next_char = lit_utf8_iterator_read_next (&source_str_iter);
if (next_char == LIT_CHAR_BACKSLASH)
{
if (lit_utf8_iterator_is_eos (&source_str_iter))
{
is_correct_sequence = false;
break;
}
const ecma_char_t char_after_next = lit_utf8_iterator_read_next (&source_str_iter);
if (lit_char_is_decimal_digit (char_after_next))
{
if (lit_char_is_octal_digit (char_after_next))
{
if (char_after_next == LIT_CHAR_0
&& (lit_utf8_iterator_is_eos (&source_str_iter)
|| !lit_char_is_octal_digit (lit_utf8_iterator_peek_next (&source_str_iter))))
{
lit_utf8_iterator_incr (&source_str_iter);
converted_char = LIT_CHAR_NULL;
}
else
{
/* Implementation-defined (ECMA-262 v5, B.1.2): octal escape sequences are not implemented */
is_correct_sequence = false;
break;
}
}
else
{
converted_char = char_after_next;
}
}
else if (char_after_next == LIT_CHAR_LOWERCASE_U
|| char_after_next == LIT_CHAR_LOWERCASE_X)
{
if (!lexer_convert_escape_sequence_digits_to_char (&source_str_iter,
char_after_next == LIT_CHAR_LOWERCASE_U,
&converted_char))
{
is_correct_sequence = false;
break;
}
}
else if (lit_char_is_line_terminator (char_after_next))
{
/* Skip \, followed by a LineTerminatorSequence (ECMA-262, v5, 7.3) */
if (char_after_next == LIT_CHAR_CR
&& !lit_utf8_iterator_is_eos (&source_str_iter))
{
if (lit_utf8_iterator_peek_next (&source_str_iter) == LIT_CHAR_LF)
{
lit_utf8_iterator_incr (&source_str_iter);
}
}
continue;
}
else
{
lexer_convert_single_escape_character (char_after_next, &converted_char);
}
}
else
{
converted_char = next_char;
}
if (lit_is_code_unit_high_surrogate (prev_converted_char)
&& lit_is_code_unit_low_surrogate (converted_char))
{
output_str_buf_iter_p -= LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (prev_converted_char,
converted_char);
output_str_buf_iter_p += lit_code_point_to_utf8 (code_point, output_str_buf_iter_p);
}
else
{
output_str_buf_iter_p += lit_code_unit_to_utf8 (converted_char, output_str_buf_iter_p);
JERRY_ASSERT (output_str_buf_iter_p <= output_str_buf_p + output_str_buf_size);
}
prev_converted_char = converted_char;
}
if (is_correct_sequence)
{
return (lit_utf8_size_t) (output_str_buf_iter_p - output_str_buf_p);
}
else
{
PARSE_ERROR ("Illegal escape sequence", source_str_p - buffer_start);
}
} /* lexer_transform_escape_sequences */
/**
* Create token of specified type from charset, transforming escape sequences.
*
* @return token descriptor
*/
static token
lexer_create_token_for_charset_transform_escape_sequences (token_type tt, /**< token type */
const lit_utf8_byte_t *charset_p, /**< charset buffer */
lit_utf8_size_t size) /**< size of the charset */
{
lit_utf8_byte_t *converted_str_p = (lit_utf8_byte_t *) jsp_mm_alloc (size);
lit_utf8_size_t converted_size = lexer_transform_escape_sequences (charset_p,
size,
converted_str_p);
token ret = lexer_create_token_for_charset (tt,
converted_str_p,
converted_size);
jsp_mm_free (converted_str_p);
return ret;
} /* lexer_create_token_for_charset_transform_escape_sequences */
/**
* Try to decode specified string as ReservedWord (ECMA-262 v5, 7.6.1)
@ -358,339 +644,116 @@ consume_char (void)
while (0)
/**
* Try to decode specified character as SingleEscapeCharacter (ECMA-262, v5, 7.8.4)
* Parse Identifier (ECMA-262 v5, 7.6) or ReservedWord (7.6.1; 7.8.1; 7.8.2).
*
* If specified character is a SingleEscapeCharacter, convert it according to ECMA-262 v5, Table 4.
* Otherwise, output it as is.
*
* @return true - if specified character is a SingleEscapeCharacter,
* false - otherwise.
*/
static bool
convert_single_escape_character (ecma_char_t c, /**< character to decode */
ecma_char_t *out_converted_char_p) /**< out: decoded character */
{
ecma_char_t converted_char;
bool is_single_escape_character = true;
switch (c)
{
case 'b':
{
converted_char = (ecma_char_t) '\b';
break;
}
case 't':
{
converted_char = (ecma_char_t) '\t';
break;
}
case 'n':
{
converted_char = (ecma_char_t) '\n';
break;
}
case 'v':
{
converted_char = (ecma_char_t) '\v';
break;
}
case 'f':
{
converted_char = (ecma_char_t) '\f';
break;
}
case 'r':
{
converted_char = (ecma_char_t) '\r';
break;
}
case '"':
case '\'':
case '\\':
{
converted_char = (ecma_char_t) c;
break;
}
default:
{
converted_char = (ecma_char_t) c;
is_single_escape_character = false;
break;
}
}
if (out_converted_char_p != NULL)
{
*out_converted_char_p = converted_char;
}
return is_single_escape_character;
} /* convert_single_escape_character */
/**
* Convert specified string to token of specified type, transforming escape sequences
*
* @return token descriptor
*/
static token
convert_string_to_token_transform_escape_seq (token_type tok_type, /**< type of token to produce */
const jerry_api_char_t *source_str_p, /**< string to convert,
* located in source buffer */
size_t source_str_size) /**< size of the string */
{
token ret;
if (source_str_size == 0)
{
return convert_string_to_token (tok_type,
lit_get_magic_string_utf8 (LIT_MAGIC_STRING__EMPTY),
0);
}
else
{
JERRY_ASSERT (source_str_p != NULL);
}
lit_utf8_byte_t *str_buf_p = (lit_utf8_byte_t*) jsp_mm_alloc (source_str_size);
lit_utf8_byte_t *str_buf_iter_p = str_buf_p;
/* const lit_utf8_byte_t *source_str_iter_p = source_str_p; */
lit_utf8_iterator_t source_str_iter = lit_utf8_iterator_create (source_str_p, (lit_utf8_size_t) source_str_size);
bool is_correct_sequence = true;
bool every_char_islower = true;
bool every_char_allowed_in_identifier = true;
ecma_char_t prev_converted_char = LIT_CHAR_NULL;
while (!lit_utf8_iterator_is_eos (&source_str_iter))
{
ecma_char_t converted_char = lit_utf8_iterator_read_next (&source_str_iter);
if (converted_char == '\\')
{
const ecma_char_t escape_character = lit_utf8_iterator_read_next (&source_str_iter);
if (isdigit (escape_character))
{
if (escape_character == '0')
{
converted_char = LIT_CHAR_NULL;
}
else
{
/* Implementation-defined (ECMA-262 v5, B.1.2): octal escape sequences are not implemented */
is_correct_sequence = false;
break;
}
}
else if (escape_character == 'u'
|| escape_character == 'x')
{
const uint32_t hex_chars_num = (escape_character == 'u' ? 4u : 2u);
if (lit_utf8_iterator_get_offset (&source_str_iter) + hex_chars_num > source_str_size)
{
is_correct_sequence = false;
break;
}
bool chars_are_hex = true;
uint16_t char_code = 0;
for (uint32_t i = 0; i < hex_chars_num; i++)
{
const ecma_char_t next_char = lit_utf8_iterator_read_next (&source_str_iter);
if (!isxdigit (next_char))
{
chars_are_hex = false;
break;
}
else
{
/*
* Check that highest 4 bits are zero, so the value would not overflow.
*/
JERRY_ASSERT ((char_code & 0xF000u) == 0);
char_code = (uint16_t) (char_code << 4u);
char_code = (uint16_t) (char_code + lit_char_hex_to_int (next_char));
}
}
JERRY_ASSERT (str_buf_iter_p <= str_buf_p + source_str_size);
JERRY_ASSERT (lit_utf8_iterator_get_offset (&source_str_iter) <= source_str_size);
if (!chars_are_hex)
{
is_correct_sequence = false;
break;
}
/*
* In CONFIG_ECMA_CHAR_ASCII mode size of ecma_char_t is 1 byte, so the conversion
* would ignore highest part of 2-byte value, and in CONFIG_ECMA_CHAR_UTF16 mode this
* would be just an assignment of 2-byte value.
*/
converted_char = (ecma_char_t) char_code;
}
else if (lit_char_is_line_terminator (escape_character))
{
if (str_buf_iter_p + 1 <= source_str_p + source_str_size)
{
ecma_char_t next_char = lit_utf8_iterator_peek_next (&source_str_iter);
if (escape_character == '\x0D'
&& next_char == '\x0A')
{
lit_utf8_iterator_incr (&source_str_iter);
}
}
continue;
}
else
{
convert_single_escape_character ((ecma_char_t) escape_character, &converted_char);
}
}
if (lit_is_code_unit_high_surrogate (prev_converted_char)
&& lit_is_code_unit_low_surrogate (converted_char))
{
str_buf_iter_p -= LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (prev_converted_char, converted_char);
str_buf_iter_p += lit_code_point_to_utf8 (code_point, str_buf_iter_p);
}
else
{
str_buf_iter_p += lit_code_unit_to_utf8 (converted_char, str_buf_iter_p);
JERRY_ASSERT (str_buf_iter_p <= str_buf_p + source_str_size);
}
prev_converted_char = converted_char;
if (!islower (converted_char))
{
every_char_islower = false;
if (!isalpha (converted_char)
&& !isdigit (converted_char)
&& converted_char != '$'
&& converted_char != '_')
{
every_char_allowed_in_identifier = false;
}
}
}
if (is_correct_sequence)
{
lit_utf8_size_t length = (lit_utf8_size_t) (str_buf_iter_p - str_buf_p);
ret = empty_token;
if (tok_type == TOK_NAME)
{
if (every_char_islower)
{
ret = lexer_parse_reserved_word (str_buf_p, length);
}
else if (!every_char_allowed_in_identifier)
{
PARSE_ERROR ("Malformed identifier name", source_str_p - buffer_start);
}
}
if (is_empty (ret))
{
ret = convert_string_to_token (tok_type, str_buf_p, length);
}
}
else
{
PARSE_ERROR ("Malformed escape sequence", source_str_p - buffer_start);
}
jsp_mm_free (str_buf_p);
return ret;
} /* convert_string_to_token_transform_escape_seq */
/**
* Parse identifier (ECMA-262 v5, 7.6) or keyword (7.6.1.1)
* @return TOK_NAME - for Identifier,
* TOK_KEYWORD - for Keyword or FutureReservedWord,
* TOK_NULL - for NullLiteral,
* TOK_BOOL - for BooleanLiteral
*/
static token
parse_name (void)
{
ecma_char_t c = (ecma_char_t) LA (0);
token known_token = empty_token;
JERRY_ASSERT (isalpha (c) || c == '$' || c == '_' || c == '\\');
new_token ();
bool is_escape_sequence_occured = false;
bool is_all_chars_were_lowercase_ascii = true;
while (true)
{
c = (ecma_char_t) LA (0);
if (!isalpha (c)
&& !isdigit (c)
&& c != '$'
&& c != '_'
&& c != '\\')
if (c == '\\')
{
consume_char ();
is_escape_sequence_occured = true;
bool is_unicode_escape_sequence = (LA (0) == 'u');
consume_char ();
if (is_unicode_escape_sequence)
{
/* UnicodeEscapeSequence */
if (!lexer_convert_escape_sequence_digits_to_char (&src_iter,
true,
&c))
{
PARSE_ERROR ("Malformed escape sequence", token_start - buffer_start);
}
else
{
/* c now contains character, encoded in the UnicodeEscapeSequence */
if (!isalpha (c)
&& !isdigit (c)
&& c != '$'
&& c != '_')
{
PARSE_ERROR ("Invalid character in identifier", token_start - buffer_start);
}
}
}
else
{
PARSE_ERROR ("Only unicode escape sequences are allowed in identifiers",
token_start - buffer_start);
}
}
else if (!isalpha (c)
&& !isdigit (c)
&& c != '$'
&& c != '_')
{
break;
}
else
{
consume_char ();
if (c == '\\')
if (!islower (c))
{
bool is_correct_sequence = (LA (0) == 'u');
if (is_correct_sequence)
{
consume_char ();
}
for (uint32_t i = 0;
is_correct_sequence && i < 4;
i++)
{
if (!isxdigit (LA (0)))
{
is_correct_sequence = false;
break;
}
consume_char ();
}
if (!is_correct_sequence)
{
PARSE_ERROR ("Malformed escape sequence", token_start - buffer_start);
}
is_all_chars_were_lowercase_ascii = false;
}
consume_char ();
}
}
const lit_utf8_size_t seq_size = (lit_utf8_size_t) (lit_utf8_iterator_get_ptr (&src_iter) - token_start);
known_token = convert_string_to_token_transform_escape_seq (TOK_NAME,
token_start,
seq_size);
const lit_utf8_size_t charset_size = (lit_utf8_size_t) (lit_utf8_iterator_get_ptr (&src_iter) - token_start);
token ret = empty_token;
if (!is_escape_sequence_occured
&& is_all_chars_were_lowercase_ascii)
{
/* Keyword or FutureReservedWord (TOK_KEYWORD), or boolean literal (TOK_BOOL), or null literal (TOK_NULL) */
ret = lexer_parse_reserved_word (token_start, charset_size);
}
if (is_empty (ret))
{
/* Identifier (TOK_NAME) */
if (!is_escape_sequence_occured)
{
ret = lexer_create_token_for_charset (TOK_NAME,
token_start,
charset_size);
}
else
{
ret = lexer_create_token_for_charset_transform_escape_sequences (TOK_NAME,
token_start,
charset_size);
}
}
token_start = NULL;
return known_token;
return ret;
} /* parse_name */
/* In this function we cannot use strtol function
@ -932,6 +995,8 @@ parse_string (void)
const bool is_double_quoted = (c == '"');
const char end_char = (is_double_quoted ? '"' : '\'');
bool is_escape_sequence_occured = false;
do
{
c = (ecma_char_t) LA (0);
@ -947,9 +1012,11 @@ parse_string (void)
}
else if (c == '\\')
{
is_escape_sequence_occured = true;
ecma_char_t nc = (ecma_char_t) LA (0);
if (convert_single_escape_character (nc, NULL))
if (lexer_convert_single_escape_character (nc, NULL))
{
consume_char ();
}
@ -971,12 +1038,23 @@ parse_string (void)
}
while (c != end_char);
const lit_utf8_size_t esc_seq_size = (lit_utf8_size_t) (lit_utf8_iterator_get_ptr (&src_iter) -
const lit_utf8_size_t charset_size = (lit_utf8_size_t) (lit_utf8_iterator_get_ptr (&src_iter) -
token_start) - 1;
token ret = convert_string_to_token_transform_escape_seq (TOK_STRING,
token_start,
esc_seq_size);
token ret;
if (!is_escape_sequence_occured)
{
ret = lexer_create_token_for_charset (TOK_STRING,
token_start,
charset_size);
}
else
{
ret = lexer_create_token_for_charset_transform_escape_sequences (TOK_STRING,
token_start,
charset_size);
}
token_start = NULL;
@ -1045,10 +1123,10 @@ parse_regexp (void)
consume_char ();
}
result = convert_string_to_token (TOK_REGEXP,
(const lit_utf8_byte_t *) token_start,
(lit_utf8_size_t) (lit_utf8_iterator_get_ptr (&src_iter) -
token_start));
result = lexer_create_token_for_charset (TOK_REGEXP,
(const lit_utf8_byte_t *) token_start,
(lit_utf8_size_t) (lit_utf8_iterator_get_ptr (&src_iter) -
token_start));
token_start = NULL;
return result;