diff --git a/jerry-core/api/jerry-snapshot.c b/jerry-core/api/jerry-snapshot.c index f547b161d..cff61a822 100644 --- a/jerry-core/api/jerry-snapshot.c +++ b/jerry-core/api/jerry-snapshot.c @@ -1553,31 +1553,59 @@ jerry_append_number_to_buffer (uint8_t *buffer_p, /**< buffer */ static bool ecma_string_is_valid_identifier (const ecma_string_t *string_p) { - bool result = false; - ECMA_STRING_TO_UTF8_STRING (string_p, str_buffer_p, str_buffer_size); - if (lit_char_is_identifier_start (str_buffer_p)) + const uint8_t *str_p = str_buffer_p; + const uint8_t *str_end_p = str_buffer_p + str_buffer_size; + + while (str_p < str_end_p) { - const uint8_t *str_start_p = str_buffer_p; - const uint8_t *str_end_p = str_buffer_p + str_buffer_size; + lit_code_point_t code_point = *str_p; + lit_utf8_size_t utf8_length = 1; - result = true; - - while (str_start_p < str_end_p) + if (JERRY_UNLIKELY (code_point >= LIT_UTF8_2_BYTE_MARKER)) { - if (!lit_char_is_identifier_part (str_start_p)) + utf8_length = lit_read_code_point_from_utf8 (str_p, + (lit_utf8_size_t) (str_end_p - str_p), + &code_point); + +#if ENABLED (JERRY_ES2015) + if ((code_point >= LIT_UTF16_HIGH_SURROGATE_MIN && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX) + && str_p + 3 < str_end_p) + { + lit_code_point_t low_surrogate; + lit_read_code_point_from_utf8 (str_p + 3, + (lit_utf8_size_t) (str_end_p - (str_p + 3)), + &low_surrogate); + + if (low_surrogate >= LIT_UTF16_LOW_SURROGATE_MIN && low_surrogate <= LIT_UTF16_LOW_SURROGATE_MAX) + { + code_point = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) code_point, + (ecma_char_t) low_surrogate); + utf8_length = 2 * 3; + } + } +#endif /* ENABLED (JERRY_ES2015) */ + } + + if (str_p == str_buffer_p) + { + if (!lit_code_point_is_identifier_start (code_point)) { - result = false; break; } - lit_utf8_incr (&str_start_p); } + else if (!lit_code_point_is_identifier_part (code_point)) + { + break; + } + + str_p += utf8_length; } ECMA_FINALIZE_UTF8_STRING (str_buffer_p, 
str_buffer_size); - return result; + return str_p == str_end_p; } /* ecma_string_is_valid_identifier */ #endif /* ENABLED (JERRY_SNAPSHOT_SAVE) */ diff --git a/jerry-core/ecma/base/ecma-helpers-string.c b/jerry-core/ecma/base/ecma-helpers-string.c index 4d5a9eacd..f08975690 100644 --- a/jerry-core/ecma/base/ecma-helpers-string.c +++ b/jerry-core/ecma/base/ecma-helpers-string.c @@ -461,16 +461,9 @@ ecma_new_ecma_string_from_utf8_converted_to_cesu8 (const lit_utf8_byte_t *string if ((string_p[pos] & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER) { /* Processing 4 byte unicode sequence. Always converted to two 3 byte long sequence. */ - uint32_t character = ((((uint32_t) string_p[pos++]) & 0x7) << 18); - character |= ((((uint32_t) string_p[pos++]) & LIT_UTF8_LAST_6_BITS_MASK) << 12); - character |= ((((uint32_t) string_p[pos++]) & LIT_UTF8_LAST_6_BITS_MASK) << 6); - character |= (((uint32_t) string_p[pos++]) & LIT_UTF8_LAST_6_BITS_MASK); - - JERRY_ASSERT (character >= 0x10000); - character -= 0x10000; - - data_p += lit_char_to_utf8_bytes (data_p, (ecma_char_t) (0xd800 | (character >> 10))); - data_p += lit_char_to_utf8_bytes (data_p, (ecma_char_t) (0xdc00 | (character & LIT_UTF16_LAST_10_BITS_MASK))); + lit_four_byte_utf8_char_to_cesu8 (data_p, string_p + pos); + data_p += 3 * 2; + pos += 4; } else { @@ -2683,10 +2676,10 @@ void ecma_stringbuilder_append_char (ecma_stringbuilder_t *builder_p, /**< string builder */ const ecma_char_t c) /**< ecma char */ { - const lit_utf8_size_t size = (lit_utf8_size_t) lit_char_get_utf8_length (c); + const lit_utf8_size_t size = (lit_utf8_size_t) lit_code_point_get_cesu8_length (c); lit_utf8_byte_t *dest_p = ecma_stringbuilder_grow (builder_p, size); - lit_char_to_utf8_bytes (dest_p, c); + lit_code_point_to_cesu8_bytes (dest_p, c); } /* ecma_stringbuilder_append_char */ /** diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-date.c b/jerry-core/ecma/builtin-objects/ecma-builtin-date.c index c6c99580c..b4b839ef7 100644 --- 
a/jerry-core/ecma/builtin-objects/ecma-builtin-date.c +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-date.c @@ -61,7 +61,7 @@ ecma_date_parse_date_chars (const lit_utf8_byte_t **str_p, /**< pointer to the c while (num_of_chars--) { - if (*str_p >= str_end_p || !lit_char_is_decimal_digit (lit_utf8_read_next (str_p))) + if (*str_p >= str_end_p || !lit_char_is_decimal_digit (lit_cesu8_read_next (str_p))) { return ecma_number_make_nan (); } diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-global.c b/jerry-core/ecma/builtin-objects/ecma-builtin-global.c index f4aad0c98..717aa7383 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-global.c +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-global.c @@ -150,7 +150,7 @@ ecma_builtin_global_object_parse_int (const lit_utf8_byte_t *string_buff, /**< r int sign = 1; /* 4. */ - ecma_char_t current = lit_utf8_read_next (&string_curr_p); + ecma_char_t current = lit_cesu8_read_next (&string_curr_p); if (current == LIT_CHAR_MINUS) { sign = -1; @@ -162,7 +162,7 @@ ecma_builtin_global_object_parse_int (const lit_utf8_byte_t *string_buff, /**< r start_p = string_curr_p; if (string_curr_p < string_end_p) { - current = lit_utf8_read_next (&string_curr_p); + current = lit_cesu8_read_next (&string_curr_p); } } @@ -970,7 +970,7 @@ ecma_builtin_global_object_escape (lit_utf8_byte_t *input_start_p, /**< routine' while (input_curr_p < input_end_p) { - ecma_char_t chr = lit_utf8_read_next (&input_curr_p); + ecma_char_t chr = lit_cesu8_read_next (&input_curr_p); if (chr <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) { @@ -1005,7 +1005,7 @@ ecma_builtin_global_object_escape (lit_utf8_byte_t *input_start_p, /**< routine' while (input_curr_p < input_end_p) { - ecma_char_t chr = lit_utf8_read_next (&input_curr_p); + ecma_char_t chr = lit_cesu8_read_next (&input_curr_p); if (chr <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) { @@ -1091,7 +1091,7 @@ ecma_builtin_global_object_unescape (lit_utf8_byte_t *input_start_p, /**< routin while (input_curr_p < 
input_end_p) { /* 6. */ - ecma_char_t chr = lit_utf8_read_next (&input_curr_p); + ecma_char_t chr = lit_cesu8_read_next (&input_curr_p); /* 7-8. */ if (status == 0 && chr == LIT_CHAR_PERCENT) diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-helpers.c b/jerry-core/ecma/builtin-objects/ecma-builtin-helpers.c index ed389220b..cb184d923 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-helpers.c +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-helpers.c @@ -713,7 +713,7 @@ ecma_builtin_helper_string_find_index (ecma_string_t *original_str_p, /**< index /* iterate original string and try to match at each position */ bool searching = true; - ecma_char_t first_char = lit_utf8_read_next (&search_str_curr_p); + ecma_char_t first_char = lit_cesu8_read_next (&search_str_curr_p); while (searching) { /* match as long as possible */ @@ -722,14 +722,14 @@ ecma_builtin_helper_string_find_index (ecma_string_t *original_str_p, /**< index if (match_len < search_len && index + match_len < original_len && - lit_utf8_read_next (&original_str_curr_p) == first_char) + lit_cesu8_read_next (&original_str_curr_p) == first_char) { const lit_utf8_byte_t *nested_search_str_curr_p = search_str_curr_p; match_len++; while (match_len < search_len && index + match_len < original_len && - lit_utf8_read_next (&original_str_curr_p) == lit_utf8_read_next (&nested_search_str_curr_p)) + lit_cesu8_read_next (&original_str_curr_p) == lit_cesu8_read_next (&nested_search_str_curr_p)) { match_len++; } diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.c b/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.c index 10231d4df..d230a5a58 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.c +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.c @@ -1155,7 +1155,7 @@ ecma_builtin_string_prototype_object_conversion_helper (ecma_string_t *input_str while (input_str_curr_p < input_str_end_p) { - ecma_char_t character = 
lit_utf8_read_next (&input_str_curr_p); + ecma_char_t character = lit_cesu8_read_next (&input_str_curr_p); ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH]; ecma_length_t character_length; lit_utf8_byte_t utf8_byte_buffer[LIT_CESU8_MAX_BYTES_IN_CODE_POINT]; @@ -1194,7 +1194,7 @@ ecma_builtin_string_prototype_object_conversion_helper (ecma_string_t *input_str while (input_str_curr_p < input_str_end_p) { - ecma_char_t character = lit_utf8_read_next (&input_str_curr_p); + ecma_char_t character = lit_cesu8_read_next (&input_str_curr_p); ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH]; ecma_length_t character_length; diff --git a/jerry-core/ecma/operations/ecma-regexp-object.c b/jerry-core/ecma/operations/ecma-regexp-object.c index a0b4ba221..40eaa740a 100644 --- a/jerry-core/ecma/operations/ecma-regexp-object.c +++ b/jerry-core/ecma/operations/ecma-regexp-object.c @@ -220,11 +220,11 @@ ecma_regexp_unicode_advance (const lit_utf8_byte_t **str_p, /**< reference to st JERRY_ASSERT (str_p != NULL); const lit_utf8_byte_t *current_p = *str_p; - lit_code_point_t ch = lit_utf8_read_next (¤t_p); + lit_code_point_t ch = lit_cesu8_read_next (¤t_p); if (lit_is_code_point_utf16_high_surrogate ((ecma_char_t) ch) && current_p < end_p) { - const ecma_char_t next_ch = lit_utf8_peek_next (current_p); + const ecma_char_t next_ch = lit_cesu8_peek_next (current_p); if (lit_is_code_point_utf16_low_surrogate (next_ch)) { lit_utf8_incr (¤t_p); @@ -425,14 +425,14 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ const bool is_ignorecase = re_ctx_p->flags & RE_FLAG_IGNORE_CASE; lit_code_point_t ch1 = re_get_char (&bc_p); /* Already canonicalized. 
*/ - lit_code_point_t ch2 = lit_utf8_read_next (&str_curr_p); + lit_code_point_t ch2 = lit_cesu8_read_next (&str_curr_p); #if ENABLED (JERRY_ES2015) if (re_ctx_p->flags & RE_FLAG_UNICODE && lit_is_code_point_utf16_high_surrogate (ch2) && str_curr_p < re_ctx_p->input_end_p) { - const ecma_char_t next_ch = lit_utf8_peek_next (str_curr_p); + const ecma_char_t next_ch = lit_cesu8_peek_next (str_curr_p); if (lit_is_code_point_utf16_low_surrogate (next_ch)) { lit_utf8_incr (&str_curr_p); @@ -460,7 +460,7 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ return NULL; /* fail */ } - const ecma_char_t ch = lit_utf8_read_next (&str_curr_p); + const ecma_char_t ch = lit_cesu8_read_next (&str_curr_p); JERRY_TRACE_MSG ("Period matching '.' to %u: ", (unsigned int) ch); if (lit_char_is_line_terminator (ch)) @@ -474,7 +474,7 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ && lit_is_code_point_utf16_high_surrogate (ch) && str_curr_p < re_ctx_p->input_end_p) { - const ecma_char_t next_ch = lit_utf8_peek_next (str_curr_p); + const ecma_char_t next_ch = lit_cesu8_peek_next (str_curr_p); if (lit_is_code_point_utf16_low_surrogate (next_ch)) { lit_utf8_incr (&str_curr_p); @@ -501,7 +501,7 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ return NULL; /* fail */ } - if (lit_char_is_line_terminator (lit_utf8_peek_prev (str_curr_p))) + if (lit_char_is_line_terminator (lit_cesu8_peek_prev (str_curr_p))) { JERRY_TRACE_MSG ("match\n"); break; /* tail merge */ @@ -526,7 +526,7 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ return NULL; /* fail */ } - if (lit_char_is_line_terminator (lit_utf8_peek_next (str_curr_p))) + if (lit_char_is_line_terminator (lit_cesu8_peek_next (str_curr_p))) { JERRY_TRACE_MSG ("match\n"); break; /* tail merge */ @@ -539,10 +539,10 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ case 
RE_OP_ASSERT_NOT_WORD_BOUNDARY: { const bool is_wordchar_left = ((str_curr_p > re_ctx_p->input_start_p) - && lit_char_is_word_char (lit_utf8_peek_prev (str_curr_p))); + && lit_char_is_word_char (lit_cesu8_peek_prev (str_curr_p))); const bool is_wordchar_right = ((str_curr_p < re_ctx_p->input_end_p) - && lit_char_is_word_char (lit_utf8_peek_next (str_curr_p))); + && lit_char_is_word_char (lit_cesu8_peek_next (str_curr_p))); if (op == RE_OP_ASSERT_WORD_BOUNDARY) { @@ -659,7 +659,7 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ else { #endif /* ENABLED (JERRY_ES2015) */ - const ecma_char_t curr_ch = (ecma_char_t) ecma_regexp_canonicalize (lit_utf8_read_next (&str_curr_p), + const ecma_char_t curr_ch = (ecma_char_t) ecma_regexp_canonicalize (lit_cesu8_read_next (&str_curr_p), is_ignorecase); while (range_count-- > 0) @@ -1115,7 +1115,7 @@ ecma_regexp_match (ecma_regexp_ctx_t *re_ctx_p, /**< RegExp matcher context */ break; } - lit_utf8_read_prev (&str_curr_p); + lit_cesu8_read_prev (&str_curr_p); iter_count--; } } diff --git a/jerry-core/lit/lit-char-helpers.c b/jerry-core/lit/lit-char-helpers.c index 29ec4faed..c87a8185a 100644 --- a/jerry-core/lit/lit-char-helpers.c +++ b/jerry-core/lit/lit-char-helpers.c @@ -200,73 +200,33 @@ lit_char_is_unicode_non_letter_ident_part (ecma_char_t c) /**< code unit */ NUM_OF_ELEMENTS (lit_unicode_non_letter_ident_part_chars))); } /* lit_char_is_unicode_non_letter_ident_part */ -/** - * Checks whether the next UTF8 character is a valid identifier start. - * - * @return true if it is. - */ -bool -lit_char_is_identifier_start (const uint8_t *src_p) /**< pointer to a vaild UTF8 character */ -{ - if (*src_p <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) - { - return lit_char_is_identifier_start_character (*src_p); - } - - /* ECMAScript 2015 specification allows some code points in supplementary plane. - * However, we don't permit characters in supplementary characters as start of identifier. 
- */ - if ((*src_p & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER) - { - return false; - } - - return lit_char_is_identifier_start_character (lit_utf8_peek_next (src_p)); -} /* lit_char_is_identifier_start */ - /** * Checks whether the character is a valid identifier start. * * @return true if it is. */ bool -lit_char_is_identifier_start_character (uint16_t chr) /**< EcmaScript character */ +lit_code_point_is_identifier_start (lit_code_point_t code_point) /**< code point */ { /* Fast path for ASCII-defined letters. */ - if (chr <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) + if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) { - return ((LEXER_TO_ASCII_LOWERCASE (chr) >= LIT_CHAR_LOWERCASE_A - && LEXER_TO_ASCII_LOWERCASE (chr) <= LIT_CHAR_LOWERCASE_Z) - || chr == LIT_CHAR_DOLLAR_SIGN - || chr == LIT_CHAR_UNDERSCORE); + return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A + && LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z) + || code_point == LIT_CHAR_DOLLAR_SIGN + || code_point == LIT_CHAR_UNDERSCORE); } - return lit_char_is_unicode_letter (chr); -} /* lit_char_is_identifier_start_character */ - -/** - * Checks whether the next UTF8 character is a valid identifier part. - * - * @return true if it is. - */ -bool -lit_char_is_identifier_part (const uint8_t *src_p) /**< pointer to a vaild UTF8 character */ -{ - if (*src_p <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) +#if ENABLED (JERRY_ES2015) + if (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN) { - return lit_char_is_identifier_part_character (*src_p); + /* TODO: detect these ranges correctly. */ + return (code_point >= 0x10C80 && code_point <= 0x10CF2); } +#endif /* ENABLED (JERRY_ES2015) */ - /* ECMAScript 2015 specification allows some code points in supplementary plane. - * However, we don't permit characters in supplementary characters as part of identifier. 
- */ - if ((*src_p & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER) - { - return false; - } - - return lit_char_is_identifier_part_character (lit_utf8_peek_next (src_p)); -} /* lit_char_is_identifier_part */ + return lit_char_is_unicode_letter ((ecma_char_t) code_point); +} /* lit_code_point_is_identifier_start */ /** * Checks whether the character is a valid identifier part. @@ -274,21 +234,29 @@ lit_char_is_identifier_part (const uint8_t *src_p) /**< pointer to a vaild UTF8 * @return true if it is. */ bool -lit_char_is_identifier_part_character (uint16_t chr) /**< EcmaScript character */ +lit_code_point_is_identifier_part (lit_code_point_t code_point) /**< code point */ { /* Fast path for ASCII-defined letters. */ - if (chr <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) + if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) { - return ((LEXER_TO_ASCII_LOWERCASE (chr) >= LIT_CHAR_LOWERCASE_A - && LEXER_TO_ASCII_LOWERCASE (chr) <= LIT_CHAR_LOWERCASE_Z) - || (chr >= LIT_CHAR_0 && chr <= LIT_CHAR_9) - || chr == LIT_CHAR_DOLLAR_SIGN - || chr == LIT_CHAR_UNDERSCORE); + return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A + && LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z) + || (code_point >= LIT_CHAR_0 && code_point <= LIT_CHAR_9) + || code_point == LIT_CHAR_DOLLAR_SIGN + || code_point == LIT_CHAR_UNDERSCORE); } - return (lit_char_is_unicode_letter (chr) - || lit_char_is_unicode_non_letter_ident_part (chr)); -} /* lit_char_is_identifier_part_character */ +#if ENABLED (JERRY_ES2015) + if (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN) + { + /* TODO: detect these ranges correctly. 
*/ + return (code_point >= 0x10C80 && code_point <= 0x10CF2); + } +#endif /* ENABLED (JERRY_ES2015) */ + + return (lit_char_is_unicode_letter ((ecma_char_t) code_point) + || lit_char_is_unicode_non_letter_ident_part ((ecma_char_t) code_point)); +} /* lit_code_point_is_identifier_part */ /** * Check if specified character is one of OctalDigit characters (ECMA-262 v5, B.1.2) @@ -356,30 +324,47 @@ lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to * @return length of the UTF8 representation. */ size_t -lit_char_to_utf8_bytes (uint8_t *dst_p, /**< destination buffer */ - ecma_char_t chr) /**< EcmaScript character */ +lit_code_point_to_cesu8_bytes (uint8_t *dst_p, /**< destination buffer */ + lit_code_point_t code_point) /**< code point */ { - if (!(chr & ~LIT_UTF8_1_BYTE_CODE_POINT_MAX)) + if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN) { /* 00000000 0xxxxxxx -> 0xxxxxxx */ - *dst_p = (uint8_t) chr; + dst_p[0] = (uint8_t) code_point; return 1; } - if (!(chr & ~LIT_UTF8_2_BYTE_CODE_POINT_MAX)) + if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN) { /* 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx */ - *(dst_p++) = (uint8_t) (LIT_UTF8_2_BYTE_MARKER | ((chr >> 6) & LIT_UTF8_LAST_5_BITS_MASK)); - *dst_p = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (chr & LIT_UTF8_LAST_6_BITS_MASK)); + dst_p[0] = (uint8_t) (LIT_UTF8_2_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_5_BITS_MASK)); + dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK)); return 2; } - /* zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx */ - *(dst_p++) = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | ((chr >> 12) & LIT_UTF8_LAST_4_BITS_MASK)); - *(dst_p++) = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((chr >> 6) & LIT_UTF8_LAST_6_BITS_MASK)); - *dst_p = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (chr & LIT_UTF8_LAST_6_BITS_MASK)); - return 3; -} /* lit_char_to_utf8_bytes */ + if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN) + { + /* zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx */ 
+ dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | ((code_point >> 12) & LIT_UTF8_LAST_4_BITS_MASK)); + dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_6_BITS_MASK)); + dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK)); + return 3; + } + + JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX); + + code_point -= LIT_UTF8_4_BYTE_CODE_POINT_MIN; + + dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd); + dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x20 | ((code_point >> 16) & LIT_UTF8_LAST_4_BITS_MASK)); + dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 10) & LIT_UTF8_LAST_6_BITS_MASK)); + + dst_p[3] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd); + dst_p[4] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x30 | ((code_point >> 6) & LIT_UTF8_LAST_4_BITS_MASK)); + dst_p[5] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK)); + + return 3 * 2; +} /* lit_code_point_to_cesu8_bytes */ /** * Returns the length of the UTF8 representation of a character. @@ -387,23 +372,44 @@ lit_char_to_utf8_bytes (uint8_t *dst_p, /**< destination buffer */ * @return length of the UTF8 representation. 
*/ size_t -lit_char_get_utf8_length (ecma_char_t chr) /**< EcmaScript character */ +lit_code_point_get_cesu8_length (lit_code_point_t code_point) /**< code point */ { - if (!(chr & ~LIT_UTF8_1_BYTE_CODE_POINT_MAX)) + if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN) { /* 00000000 0xxxxxxx */ return 1; } - if (!(chr & ~LIT_UTF8_2_BYTE_CODE_POINT_MAX)) + if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN) { /* 00000yyy yyxxxxxx */ return 2; } - /* zzzzyyyy yyxxxxxx */ - return 3; -} /* lit_char_get_utf8_length */ + if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN) + { + /* zzzzyyyy yyxxxxxx */ + return 3; + } + + /* high + low surrogate */ + return 2 * 3; +} /* lit_code_point_get_cesu8_length */ + +/** + * Convert a four byte long utf8 character to two three byte long cesu8 characters + */ +void +lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, /**< destination buffer */ + const uint8_t *source_p) /**< source buffer */ +{ + lit_code_point_t code_point = ((((uint32_t) source_p[0]) & LIT_UTF8_LAST_3_BITS_MASK) << 18); + code_point |= ((((uint32_t) source_p[1]) & LIT_UTF8_LAST_6_BITS_MASK) << 12); + code_point |= ((((uint32_t) source_p[2]) & LIT_UTF8_LAST_6_BITS_MASK) << 6); + code_point |= (((uint32_t) source_p[3]) & LIT_UTF8_LAST_6_BITS_MASK); + + lit_code_point_to_cesu8_bytes (dst_p, code_point); +} /* lit_four_byte_utf8_char_to_cesu8 */ /** * Parse the next number_of_characters hexadecimal character, diff --git a/jerry-core/lit/lit-char-helpers.h b/jerry-core/lit/lit-char-helpers.h index 08c0439a0..c1d511091 100644 --- a/jerry-core/lit/lit-char-helpers.h +++ b/jerry-core/lit/lit-char-helpers.h @@ -75,10 +75,8 @@ bool lit_char_is_line_terminator (ecma_char_t c); #define LIT_CHAR_UNDERSCORE ((ecma_char_t) '_') /* low line (underscore) */ /* LIT_CHAR_BACKSLASH defined above */ -bool lit_char_is_identifier_start (const uint8_t *src_p); -bool lit_char_is_identifier_part (const uint8_t *src_p); -bool lit_char_is_identifier_start_character (ecma_char_t chr); -bool 
lit_char_is_identifier_part_character (ecma_char_t chr); +bool lit_code_point_is_identifier_start (lit_code_point_t code_point); +bool lit_code_point_is_identifier_part (lit_code_point_t code_point); /* * Punctuator characters (ECMA-262 v5, 7.7) @@ -215,8 +213,9 @@ bool lit_char_is_octal_digit (ecma_char_t c); bool lit_char_is_decimal_digit (ecma_char_t c); bool lit_char_is_hex_digit (ecma_char_t c); uint32_t lit_char_hex_to_int (ecma_char_t c); -size_t lit_char_to_utf8_bytes (uint8_t *dst_p, ecma_char_t chr); -size_t lit_char_get_utf8_length (ecma_char_t chr); +size_t lit_code_point_to_cesu8_bytes (uint8_t *dst_p, lit_code_point_t code_point); +size_t lit_code_point_get_cesu8_length (lit_code_point_t code_point); +void lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, const uint8_t *source_p); /* read a hex encoded code point from a zero terminated buffer */ bool lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, lit_utf8_size_t number_of_characters, diff --git a/jerry-core/lit/lit-strings.c b/jerry-core/lit/lit-strings.c index 5b0027252..6f3b2ca09 100644 --- a/jerry-core/lit/lit-strings.c +++ b/jerry-core/lit/lit-strings.c @@ -481,7 +481,7 @@ lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer wit * @return next code unit */ ecma_char_t -lit_utf8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ +lit_cesu8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ { JERRY_ASSERT (*buf_p); ecma_char_t ch; @@ -489,7 +489,7 @@ lit_utf8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with cha *buf_p += lit_read_code_unit_from_utf8 (*buf_p, &ch); return ch; -} /* lit_utf8_read_next */ +} /* lit_cesu8_read_next */ /** * Decodes a unicode code unit from non-empty cesu-8-encoded buffer @@ -497,7 +497,7 @@ lit_utf8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with cha * @return previous code unit */ ecma_char_t -lit_utf8_read_prev (const lit_utf8_byte_t 
**buf_p) /**< [in,out] buffer with characters */ +lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */ { JERRY_ASSERT (*buf_p); ecma_char_t ch; @@ -506,7 +506,7 @@ lit_utf8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with cha lit_read_code_unit_from_utf8 (*buf_p, &ch); return ch; -} /* lit_utf8_read_prev */ +} /* lit_cesu8_read_prev */ /** * Decodes a unicode code unit from non-empty cesu-8-encoded buffer @@ -514,15 +514,15 @@ lit_utf8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with cha * @return next code unit */ ecma_char_t -lit_utf8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */ +lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */ { - JERRY_ASSERT (buf_p); + JERRY_ASSERT (buf_p != NULL); ecma_char_t ch; lit_read_code_unit_from_utf8 (buf_p, &ch); return ch; -} /* lit_utf8_peek_next */ +} /* lit_cesu8_peek_next */ /** * Decodes a unicode code unit from non-empty cesu-8-encoded buffer @@ -530,15 +530,15 @@ lit_utf8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with char * @return previous code unit */ ecma_char_t -lit_utf8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */ +lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */ { - JERRY_ASSERT (buf_p); + JERRY_ASSERT (buf_p != NULL); ecma_char_t ch; lit_read_prev_code_unit_from_utf8 (buf_p, &ch); return ch; -} /* lit_utf8_peek_prev */ +} /* lit_cesu8_peek_prev */ /** * Increase cesu-8 encoded string pointer by one code unit. 
diff --git a/jerry-core/lit/lit-strings.h b/jerry-core/lit/lit-strings.h index 304fe7257..458ca0a00 100644 --- a/jerry-core/lit/lit-strings.h +++ b/jerry-core/lit/lit-strings.h @@ -46,7 +46,6 @@ #define LIT_UTF8_2_BYTE_MARKER (0xC0) #define LIT_UTF8_3_BYTE_MARKER (0xE0) #define LIT_UTF8_4_BYTE_MARKER (0xF0) -#define LIT_UTF8_5_BYTE_MARKER (0xF8) #define LIT_UTF8_EXTRA_BYTE_MARKER (0x80) #define LIT_UTF8_1_BYTE_MASK (0x80) @@ -82,7 +81,7 @@ /** * Byte values >= LIT_UTF8_FIRST_BYTE_MAX are not allowed in internal strings */ -#define LIT_UTF8_FIRST_BYTE_MAX LIT_UTF8_5_BYTE_MARKER +#define LIT_UTF8_FIRST_BYTE_MAX (0xF8) /* validation */ bool lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, lit_utf8_size_t buf_size); @@ -135,10 +134,10 @@ lit_utf8_size_t lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, lit_utf8_size_t lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, ecma_char_t *code_point); -ecma_char_t lit_utf8_read_next (const lit_utf8_byte_t **buf_p); -ecma_char_t lit_utf8_read_prev (const lit_utf8_byte_t **buf_p); -ecma_char_t lit_utf8_peek_next (const lit_utf8_byte_t *buf_p); -ecma_char_t lit_utf8_peek_prev (const lit_utf8_byte_t *buf_p); +ecma_char_t lit_cesu8_read_next (const lit_utf8_byte_t **buf_p); +ecma_char_t lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p); +ecma_char_t lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p); +ecma_char_t lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p); void lit_utf8_incr (const lit_utf8_byte_t **buf_p); void lit_utf8_decr (const lit_utf8_byte_t **buf_p); diff --git a/jerry-core/parser/js/js-lexer.c b/jerry-core/parser/js/js-lexer.c index d46bf3664..a770c0d7f 100644 --- a/jerry-core/parser/js/js-lexer.c +++ b/jerry-core/parser/js/js-lexer.c @@ -53,14 +53,13 @@ align_column_to_tab (parser_line_counter_t column) /**< current column */ /** * Parse hexadecimal character sequence * - * @return character value + * @return character value or UINT32_MAX on error */ -ecma_char_t 
-lexer_hex_to_character (parser_context_t *context_p, /**< context */ - const uint8_t *source_p, /**< current source position */ - int length) /**< source length */ +static lit_code_point_t +lexer_hex_to_code_point (const uint8_t *source_p, /**< current source position */ + parser_line_counter_t length) /**< source length */ { - uint32_t result = 0; + lit_code_point_t result = 0; do { @@ -81,29 +80,94 @@ lexer_hex_to_character (parser_context_t *context_p, /**< context */ } else { - parser_raise_error (context_p, PARSER_ERR_INVALID_ESCAPE_SEQUENCE); + return UINT32_MAX; } } } while (--length > 0); - return (ecma_char_t) result; -} /* lexer_hex_to_character */ + return result; +} /* lexer_hex_to_code_point */ + +#if ENABLED (JERRY_ES2015) + +/** + * Parse hexadecimal character sequence enclosed in braces + * + * @return character value or UINT32_MAX on error + */ +static lit_code_point_t +lexer_hex_in_braces_to_code_point (const uint8_t *source_p, /**< current source position */ + const uint8_t *source_end_p, /**< source end */ + uint32_t *length_p) /**< [out] length of the sequence */ +{ + lit_code_point_t result = 0; + /* Four is the size of \u{} sequence. 
*/ + uint32_t length = 4; + + JERRY_ASSERT (source_p[-1] == LIT_CHAR_LEFT_BRACE); + JERRY_ASSERT (source_p < source_end_p); + + do + { + uint32_t byte = *source_p++; + + result <<= 4; + + if (byte >= LIT_CHAR_0 && byte <= LIT_CHAR_9) + { + result += byte - LIT_CHAR_0; + } + else + { + byte = LEXER_TO_ASCII_LOWERCASE (byte); + if (byte >= LIT_CHAR_LOWERCASE_A && byte <= LIT_CHAR_LOWERCASE_F) + { + result += byte - (LIT_CHAR_LOWERCASE_A - 10); + } + else + { + return UINT32_MAX; + } + } + + if (result >= (LIT_UNICODE_CODE_POINT_MAX + 1) || source_p >= source_end_p) + { + return UINT32_MAX; + } + length++; + } + while (*source_p != LIT_CHAR_RIGHT_BRACE); + + *length_p = length; + return result; +} /* lexer_hex_in_braces_to_code_point */ + +#endif /* ENABLED (JERRY_ES2015) */ /** * Parse hexadecimal character sequence * * @return character value */ -static ecma_char_t -lexer_unchecked_hex_to_character (const uint8_t *source_p, /**< current source position */ - int length) /**< source length */ +static lit_code_point_t +lexer_unchecked_hex_to_character (const uint8_t **source_p) /**< [in, out] current source position */ { - uint32_t result = 0; + lit_code_point_t result = 0; + const uint8_t *char_p = *source_p; + uint32_t length = (char_p[-1] == LIT_CHAR_LOWERCASE_U) ? 
4 : 2; - do +#if ENABLED (JERRY_ES2015) + if (char_p[0] == LIT_CHAR_LEFT_BRACE) { - uint32_t byte = *source_p++; + length = 0; + char_p++; + } +#endif /* ENABLED (JERRY_ES2015) */ + + while (true) + { + uint32_t byte = *char_p++; result <<= 4; @@ -118,10 +182,27 @@ lexer_unchecked_hex_to_character (const uint8_t *source_p, /**< current source p result += LEXER_TO_ASCII_LOWERCASE (byte) - (LIT_CHAR_LOWERCASE_A - 10); } - } - while (--length > 0); - return (ecma_char_t) result; + JERRY_ASSERT (result <= LIT_UNICODE_CODE_POINT_MAX); + +#if ENABLED (JERRY_ES2015) + if (length == 0) + { + if (*char_p != LIT_CHAR_RIGHT_BRACE) + { + continue; + } + *source_p = char_p + 1; + return result; + } +#endif /* ENABLED (JERRY_ES2015) */ + + if (--length == 0) + { + *source_p = char_p; + return result; + } + } } /* lexer_unchecked_hex_to_character */ /** @@ -509,102 +590,188 @@ static const uint8_t keyword_lengths_list[] = #undef LEXER_KEYWORD_LIST_LENGTH /** - * Parse identifier. + * Flags for lexer_parse_identifier. */ -static void +typedef enum +{ + LEXER_PARSE_NO_OPTS = 0, /**< no options */ + LEXER_PARSE_CHECK_KEYWORDS = (1 << 0), /**< check keywords */ + LEXER_PARSE_CHECK_START_AND_RETURN = (1 << 1), /**< check identifier start and return */ + LEXER_PARSE_CHECK_PART_AND_RETURN = (1 << 2), /**< check identifier part and return */ +} lexer_parse_options_t; + +/** + * Parse identifier. + * + * @return true, if an identifier is parsed, false otherwise + */ +static bool lexer_parse_identifier (parser_context_t *context_p, /**< context */ - bool check_keywords) /**< check keywords */ + lexer_parse_options_t options) /**< check keywords */ { /* Only very few identifiers contains \u escape sequences. */ const uint8_t *source_p = context_p->source_p; - const uint8_t *ident_start_p = context_p->source_p; /* Note: newline or tab cannot be part of an identifier. 
*/ parser_line_counter_t column = context_p->column; const uint8_t *source_end_p = context_p->source_end_p; size_t length = 0; - - context_p->token.type = LEXER_LITERAL; - context_p->token.ident_is_strict_keyword = false; - context_p->token.lit_location.type = LEXER_IDENT_LITERAL; - context_p->token.lit_location.has_escape = false; + uint8_t has_escape = false; do { if (*source_p == LIT_CHAR_BACKSLASH) { - uint16_t character; + /* After a backslash an identifier must start. */ + lit_code_point_t code_point = UINT32_MAX; + uint32_t escape_length = 6; - context_p->token.lit_location.has_escape = true; - context_p->source_p = source_p; - context_p->token.column = column; - - if ((source_p + 6 > source_end_p) || (source_p[1] != LIT_CHAR_LOWERCASE_U)) + if (options & (LEXER_PARSE_CHECK_START_AND_RETURN | LEXER_PARSE_CHECK_PART_AND_RETURN)) { + return true; + } + + has_escape = true; + +#if ENABLED (JERRY_ES2015) + if (source_p + 5 <= source_end_p && source_p[1] == LIT_CHAR_LOWERCASE_U) + { + if (source_p[2] == LIT_CHAR_LEFT_BRACE) + { + code_point = lexer_hex_in_braces_to_code_point (source_p + 3, source_end_p, &escape_length); + } + else if (source_p + 6 <= source_end_p) + { + code_point = lexer_hex_to_code_point (source_p + 2, 4); + } + } +#else /* !ENABLED (JERRY_ES2015) */ + if (source_p + 6 <= source_end_p && source_p[1] == LIT_CHAR_LOWERCASE_U) + { + code_point = lexer_hex_to_code_point (source_p + 2, 4); + } +#endif /* ENABLED (JERRY_ES2015) */ + + if (code_point == UINT32_MAX) + { + context_p->source_p = source_p; + context_p->token.column = column; parser_raise_error (context_p, PARSER_ERR_INVALID_UNICODE_ESCAPE_SEQUENCE); } - character = lexer_hex_to_character (context_p, source_p + 2, 4); - if (length == 0) { - if (!lit_char_is_identifier_start_character (character)) + if (!lit_code_point_is_identifier_start (code_point)) { parser_raise_error (context_p, PARSER_ERR_INVALID_IDENTIFIER_START); } } else { - if (!lit_char_is_identifier_part_character (character)) 
+ if (!lit_code_point_is_identifier_part (code_point)) { parser_raise_error (context_p, PARSER_ERR_INVALID_IDENTIFIER_PART); } } - length += lit_char_get_utf8_length (character); - source_p += 6; - PARSER_PLUS_EQUAL_LC (column, 6); + length += lit_code_point_get_cesu8_length (code_point); + source_p += escape_length; + PARSER_PLUS_EQUAL_LC (column, escape_length); continue; } - /* Valid identifiers cannot contain 4 byte long utf-8 - * characters, since those characters are represented - * by 2 ecmascript (UTF-16) characters, and those - * characters cannot be literal characters. */ - JERRY_ASSERT (source_p[0] < LEXER_UTF8_4BYTE_START); + lit_code_point_t code_point = *source_p; + lit_utf8_size_t utf8_length = 1, decoded_length = 1, char_count = 1; - source_p++; - length++; - column++; - - while (source_p < source_end_p - && IS_UTF8_INTERMEDIATE_OCTET (source_p[0])) + if (JERRY_UNLIKELY (code_point >= LIT_UTF8_2_BYTE_MARKER)) { - source_p++; - length++; - } - } - while (source_p < source_end_p - && (lit_char_is_identifier_part (source_p) || *source_p == LIT_CHAR_BACKSLASH)); + utf8_length = lit_read_code_point_from_utf8 (source_p, + (lit_utf8_size_t) (source_end_p - source_p), + &code_point); + decoded_length = utf8_length; + +#if ENABLED (JERRY_ES2015) + /* Only ES2015 supports code points outside of the basic plane which can be part of an identifier. 
*/ + if ((code_point >= LIT_UTF16_HIGH_SURROGATE_MIN && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX) + && source_p + 3 < source_end_p) + { + lit_code_point_t low_surrogate; + lit_read_code_point_from_utf8 (source_p + 3, + (lit_utf8_size_t) (source_end_p - (source_p + 3)), + &low_surrogate); + + if (low_surrogate >= LIT_UTF16_LOW_SURROGATE_MIN && low_surrogate <= LIT_UTF16_LOW_SURROGATE_MAX) + { + code_point = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) code_point, + (ecma_char_t) low_surrogate); + utf8_length = 2 * 3; + decoded_length = 2 * 3; + char_count = 2; + } + } + else if (source_p[0] >= LEXER_UTF8_4BYTE_START) + { + decoded_length = 2 * 3; + has_escape = true; + } +#endif /* ENABLED (JERRY_ES2015) */ + } + + if (length == 0) + { + if (JERRY_UNLIKELY (options & (LEXER_PARSE_CHECK_START_AND_RETURN | LEXER_PARSE_CHECK_PART_AND_RETURN))) + { + if (options & LEXER_PARSE_CHECK_START_AND_RETURN) + { + return lit_code_point_is_identifier_start (code_point); + } + else + { + return lit_code_point_is_identifier_part (code_point); + } + } + + if (!lit_code_point_is_identifier_start (code_point)) + { + return false; + } + } + else if (!lit_code_point_is_identifier_part (code_point)) + { + break; + } + + source_p += utf8_length; + length += decoded_length; + PARSER_PLUS_EQUAL_LC (column, char_count); + } + while (source_p < source_end_p); + + JERRY_ASSERT (length > 0); + + context_p->token.type = LEXER_LITERAL; + context_p->token.ident_is_strict_keyword = false; + context_p->token.lit_location.type = LEXER_IDENT_LITERAL; + context_p->token.lit_location.has_escape = has_escape; - context_p->source_p = ident_start_p; context_p->token.column = context_p->column; - context_p->token.lit_location.char_p = ident_start_p; + context_p->token.lit_location.char_p = context_p->source_p; context_p->token.lit_location.length = (prop_length_t) length; - if (length > PARSER_MAXIMUM_IDENT_LENGTH) + if (JERRY_UNLIKELY (length > PARSER_MAXIMUM_IDENT_LENGTH)) { 
parser_raise_error (context_p, PARSER_ERR_IDENTIFIER_TOO_LONG); } /* Check keywords. */ - if (check_keywords + if ((options & LEXER_PARSE_CHECK_KEYWORDS) && (length >= LEXER_KEYWORD_MIN_LENGTH && length <= LEXER_KEYWORD_MAX_LENGTH)) { + const uint8_t *ident_start_p = context_p->source_p; uint8_t buffer_p[LEXER_KEYWORD_MAX_LENGTH]; if (JERRY_UNLIKELY (context_p->token.lit_location.has_escape)) { - lexer_convert_ident_to_cesu8 (ident_start_p, buffer_p, (prop_length_t) length); + lexer_convert_ident_to_cesu8 (buffer_p, ident_start_p, (prop_length_t) length); ident_start_p = buffer_p; } @@ -690,6 +857,7 @@ lexer_parse_identifier (parser_context_t *context_p, /**< context */ context_p->source_p = source_p; context_p->column = column; + return true; } /* lexer_parse_identifier */ /** @@ -840,20 +1008,40 @@ lexer_parse_string (parser_context_t *context_p) /**< context */ if (*source_p == LIT_CHAR_LOWERCASE_X || *source_p == LIT_CHAR_LOWERCASE_U) { - uint8_t hex_part_length = (*source_p == LIT_CHAR_LOWERCASE_X) ? 2 : 4; + uint32_t escape_length = (*source_p == LIT_CHAR_LOWERCASE_X) ? 
3 : 5; + lit_code_point_t code_point = UINT32_MAX; context_p->token.line = line; context_p->token.column = (parser_line_counter_t) (column - 1); - if (source_p + 1 + hex_part_length > source_end_p) + +#if ENABLED (JERRY_ES2015) + if (source_p + 4 <= source_end_p + && source_p[0] == LIT_CHAR_LOWERCASE_U + && source_p[1] == LIT_CHAR_LEFT_BRACE) { - parser_raise_error (context_p, PARSER_ERR_INVALID_ESCAPE_SEQUENCE); + code_point = lexer_hex_in_braces_to_code_point (source_p + 2, source_end_p, &escape_length); + escape_length--; + } + else + { +#endif /* ENABLED (JERRY_ES2015) */ + if (source_p + escape_length <= source_end_p) + { + code_point = lexer_hex_to_code_point (source_p + 1, escape_length - 1); + } +#if ENABLED (JERRY_ES2015) + } +#endif /* ENABLED (JERRY_ES2015) */ + + if (code_point == UINT32_MAX) + { + parser_raise_error (context_p, PARSER_ERR_INVALID_UNICODE_ESCAPE_SEQUENCE); } - length += lit_char_get_utf8_length (lexer_hex_to_character (context_p, - source_p + 1, - hex_part_length)); - source_p += hex_part_length + 1; - PARSER_PLUS_EQUAL_LC (column, hex_part_length + 1u); + length += lit_code_point_get_cesu8_length (code_point); + + source_p += escape_length; + PARSER_PLUS_EQUAL_LC (column, escape_length); continue; } } @@ -1120,12 +1308,6 @@ lexer_parse_number (parser_context_t *context_p) /**< context */ } } - if (source_p < source_end_p - && (lit_char_is_identifier_start (source_p) || source_p[0] == LIT_CHAR_BACKSLASH)) - { - parser_raise_error (context_p, PARSER_ERR_IDENTIFIER_AFTER_NUMBER); - } - length = (size_t) (source_p - context_p->source_p); if (length > PARSER_MAXIMUM_IDENT_LENGTH) { @@ -1135,6 +1317,11 @@ lexer_parse_number (parser_context_t *context_p) /**< context */ context_p->token.lit_location.length = (prop_length_t) length; PARSER_PLUS_EQUAL_LC (context_p->column, length); context_p->source_p = source_p; + + if (source_p < source_end_p && lexer_parse_identifier (context_p, LEXER_PARSE_CHECK_START_AND_RETURN)) + { + parser_raise_error 
(context_p, PARSER_ERR_IDENTIFIER_AFTER_NUMBER); + } } /* lexer_parse_number */ /** @@ -1229,10 +1416,8 @@ lexer_next_token (parser_context_t *context_p) /**< context */ return; } - if (lit_char_is_identifier_start (context_p->source_p) - || context_p->source_p[0] == LIT_CHAR_BACKSLASH) + if (lexer_parse_identifier (context_p, LEXER_PARSE_CHECK_KEYWORDS)) { - lexer_parse_identifier (context_p, true); return; } @@ -1723,8 +1908,8 @@ lexer_process_char_literal (parser_context_t *context_p, /**< context */ * Convert an ident with escapes to a utf8 string. */ void -lexer_convert_ident_to_cesu8 (const uint8_t *source_p, /**< source string */ - uint8_t *destination_p, /**< destination string */ +lexer_convert_ident_to_cesu8 (uint8_t *destination_p, /**< destination string */ + const uint8_t *source_p, /**< source string */ prop_length_t length) /**< length of destination string */ { const uint8_t *destination_end_p = destination_p + length; @@ -1735,14 +1920,22 @@ lexer_convert_ident_to_cesu8 (const uint8_t *source_p, /**< source string */ { if (*source_p == LIT_CHAR_BACKSLASH) { - destination_p += lit_char_to_utf8_bytes (destination_p, - lexer_unchecked_hex_to_character (source_p + 2, 4)); - source_p += 6; + source_p += 2; + destination_p += lit_code_point_to_cesu8_bytes (destination_p, + lexer_unchecked_hex_to_character (&source_p)); continue; } - JERRY_ASSERT (IS_UTF8_INTERMEDIATE_OCTET (*source_p) - || lit_char_is_identifier_part (source_p)); +#if ENABLED (JERRY_ES2015) + if (*source_p >= LEXER_UTF8_4BYTE_START) + { + lit_four_byte_utf8_char_to_cesu8 (destination_p, source_p); + + destination_p += 6; + source_p += 4; + continue; + } +#endif /* ENABLED (JERRY_ES2015) */ *destination_p++ = *source_p++; } @@ -1783,7 +1976,7 @@ lexer_construct_literal_object (parser_context_t *context_p, /**< context */ if (literal_p->type == LEXER_IDENT_LITERAL) { - lexer_convert_ident_to_cesu8 (source_p, destination_start_p, literal_p->length); + lexer_convert_ident_to_cesu8 
(destination_start_p, source_p, literal_p->length); } else { @@ -1835,7 +2028,7 @@ lexer_construct_literal_object (parser_context_t *context_p, /**< context */ if (*source_p >= LIT_CHAR_0 && *source_p <= LIT_CHAR_3) { - uint32_t octal_number = (uint32_t) (*source_p - LIT_CHAR_0); + lit_code_point_t octal_number = (uint32_t) (*source_p - LIT_CHAR_0); source_p++; JERRY_ASSERT (source_p < context_p->source_end_p); @@ -1854,7 +2047,7 @@ lexer_construct_literal_object (parser_context_t *context_p, /**< context */ } } - destination_p += lit_char_to_utf8_bytes (destination_p, (uint16_t) octal_number); + destination_p += lit_code_point_to_cesu8_bytes (destination_p, octal_number); continue; } @@ -1878,13 +2071,9 @@ lexer_construct_literal_object (parser_context_t *context_p, /**< context */ if (*source_p == LIT_CHAR_LOWERCASE_X || *source_p == LIT_CHAR_LOWERCASE_U) { - int hex_part_length = (*source_p == LIT_CHAR_LOWERCASE_X) ? 2 : 4; - JERRY_ASSERT (source_p + 1 + hex_part_length <= context_p->source_end_p); - - destination_p += lit_char_to_utf8_bytes (destination_p, - lexer_unchecked_hex_to_character (source_p + 1, - hex_part_length)); - source_p += hex_part_length + 1; + source_p++; + destination_p += lit_code_point_to_cesu8_bytes (destination_p, + lexer_unchecked_hex_to_character (&source_p)); continue; } @@ -1946,18 +2135,9 @@ lexer_construct_literal_object (parser_context_t *context_p, /**< context */ /* Processing 4 byte unicode sequence (even if it is * after a backslash). Always converted to two 3 byte * long sequence. 
*/ + lit_four_byte_utf8_char_to_cesu8 (destination_p, source_p); - uint32_t character = ((((uint32_t) source_p[0]) & 0x7) << 18); - character |= ((((uint32_t) source_p[1]) & LIT_UTF8_LAST_6_BITS_MASK) << 12); - character |= ((((uint32_t) source_p[2]) & LIT_UTF8_LAST_6_BITS_MASK) << 6); - character |= (((uint32_t) source_p[3]) & LIT_UTF8_LAST_6_BITS_MASK); - - JERRY_ASSERT (character >= 0x10000); - character -= 0x10000; - destination_p += lit_char_to_utf8_bytes (destination_p, - (ecma_char_t) (0xd800 | (character >> 10))); - destination_p += lit_char_to_utf8_bytes (destination_p, - (ecma_char_t) (0xdc00 | (character & LIT_UTF16_LAST_10_BITS_MASK))); + destination_p += 6; source_p += 4; continue; } @@ -2376,15 +2556,14 @@ lexer_construct_regexp_object (parser_context_t *context_p, /**< context */ column++; } - if (source_p < source_end_p - && lit_char_is_identifier_part (source_p)) + context_p->source_p = source_p; + context_p->column = column; + + if (source_p < source_end_p && lexer_parse_identifier (context_p, LEXER_PARSE_CHECK_PART_AND_RETURN)) { parser_raise_error (context_p, PARSER_ERR_UNKNOWN_REGEXP_FLAG); } - context_p->source_p = source_p; - context_p->column = column; - length = (lit_utf8_size_t) (regex_end_p - regex_start_p); if (length > PARSER_MAXIMUM_STRING_LENGTH) { @@ -2473,10 +2652,9 @@ lexer_expect_identifier (parser_context_t *context_p, /**< context */ context_p->token.column = context_p->column; if (context_p->source_p < context_p->source_end_p - && (lit_char_is_identifier_start (context_p->source_p) || context_p->source_p[0] == LIT_CHAR_BACKSLASH)) + && lexer_parse_identifier (context_p, (literal_type != LEXER_STRING_LITERAL ? 
LEXER_PARSE_CHECK_KEYWORDS + : LEXER_PARSE_NO_OPTS))) { - lexer_parse_identifier (context_p, literal_type != LEXER_STRING_LITERAL); - if (context_p->token.type == LEXER_LITERAL) { JERRY_ASSERT (context_p->token.lit_location.type == LEXER_IDENT_LITERAL); @@ -2548,10 +2726,8 @@ lexer_expect_object_literal_id (parser_context_t *context_p, /**< context */ context_p->token.column = context_p->column; bool create_literal_object = false; - if (lit_char_is_identifier_start (context_p->source_p) || context_p->source_p[0] == LIT_CHAR_BACKSLASH) + if (lexer_parse_identifier (context_p, LEXER_PARSE_NO_OPTS)) { - lexer_parse_identifier (context_p, false); - if (!(ident_opts & (LEXER_OBJ_IDENT_ONLY_IDENTIFIERS | LEXER_OBJ_IDENT_OBJECT_PATTERN)) && context_p->token.lit_location.length == 3) { @@ -2687,10 +2863,8 @@ lexer_scan_identifier (parser_context_t *context_p, /**< context */ context_p->token.column = context_p->column; if (context_p->source_p < context_p->source_end_p - && (lit_char_is_identifier_start (context_p->source_p) || context_p->source_p[0] == LIT_CHAR_BACKSLASH)) + && lexer_parse_identifier (context_p, LEXER_PARSE_NO_OPTS)) { - lexer_parse_identifier (context_p, false); - if ((ident_opts & LEXER_SCAN_IDENT_PROPERTY) && context_p->token.lit_location.length == 3) { @@ -2726,75 +2900,135 @@ lexer_scan_identifier (parser_context_t *context_p, /**< context */ * Compares two identifiers. * * Note: - * Escape sequences are allowed, size must be the same. 
+ * Escape sequences are allowed in the left identifier, but not in the right * * @return true if the two identifiers are the same */ -bool -lexer_compare_identifiers (const uint8_t *left_p, /**< left identifier */ - const uint8_t *right_p, /**< right identifier */ - size_t size) /**< byte size of the two identifiers */ +static bool +lexer_compare_identifier_to_chars (const uint8_t *left_p, /**< left identifier */ + const uint8_t *right_p, /**< right identifier string */ + size_t size) /**< byte size of the two identifiers */ { - uint8_t utf8_buf[3]; - size_t utf8_len, offset; + uint8_t utf8_buf[6]; do { - /* Backslash cannot be part of a multibyte UTF-8 character. */ - if (*left_p != LIT_CHAR_BACKSLASH && *right_p != LIT_CHAR_BACKSLASH) + if (*left_p == *right_p) { - if (*left_p++ != *right_p++) - { - return false; - } + left_p++; + right_p++; size--; continue; } - if (*left_p == LIT_CHAR_BACKSLASH && *right_p == LIT_CHAR_BACKSLASH) + size_t escape_size; + + if (*left_p == LIT_CHAR_BACKSLASH) { - uint16_t left_chr = lexer_unchecked_hex_to_character (left_p + 2, 4); + left_p += 2; + lit_code_point_t code_point = lexer_unchecked_hex_to_character (&left_p); - if (left_chr != lexer_unchecked_hex_to_character (right_p + 2, 4)) - { - return false; - } - - left_p += 6; - right_p += 6; - size -= lit_char_get_utf8_length (left_chr); - continue; + escape_size = lit_code_point_to_cesu8_bytes (utf8_buf, code_point); + } + else if (*left_p >= LEXER_UTF8_4BYTE_START) + { + lit_four_byte_utf8_char_to_cesu8 (utf8_buf, left_p); + escape_size = 3 * 2; + left_p += 4; + } + else + { + return false; } - /* One character is encoded as unicode sequence. */ - if (*right_p == LIT_CHAR_BACKSLASH) - { - /* The pointers can be swapped. 
*/ - const uint8_t *swap_p = left_p; - left_p = right_p; - right_p = swap_p; - } - - utf8_len = lit_char_to_utf8_bytes (utf8_buf, lexer_unchecked_hex_to_character (left_p + 2, 4)); - JERRY_ASSERT (utf8_len > 0); - size -= utf8_len; - offset = 0; + size -= escape_size; + uint8_t *utf8_p = utf8_buf; do { - if (utf8_buf[offset] != *right_p++) + if (*right_p++ != *utf8_p++) { return false; } - offset++; } - while (offset < utf8_len); - - left_p += 6; + while (--escape_size > 0); } while (size > 0); return true; +} /* lexer_compare_identifier_to_chars */ + +/** + * Compares an identifier to a string. + * + * Note: + * Escape sequences are allowed in the left identifier, but not in the right + * + * @return true if the identifier equals the string + */ +bool +lexer_compare_identifier_to_string (const lexer_lit_location_t *left_p, /**< left literal */ + const uint8_t *right_p, /**< right identifier string */ + size_t size) /**< byte size of the right identifier */ +{ + if (left_p->length != size) + { + return false; + } + + if (!left_p->has_escape) + { + return memcmp (left_p->char_p, right_p, size) == 0; + } + + return lexer_compare_identifier_to_chars (left_p->char_p, right_p, size); +} /* lexer_compare_identifier_to_string */ + +/** + * Compares two identifiers. 
+ * + * Note: + * Escape sequences are allowed in both identifiers + * + * @return true if the two identifiers are the same + */ +bool +lexer_compare_identifiers (parser_context_t *context_p, /**< context */ + const lexer_lit_location_t *left_p, /**< left literal */ + const lexer_lit_location_t *right_p) /**< right literal */ +{ + prop_length_t length = left_p->length; + + if (length != right_p->length) + { + return false; + } + + if (!left_p->has_escape) + { + return lexer_compare_identifier_to_chars (right_p->char_p, left_p->char_p, length); + } + + if (!right_p->has_escape) + { + return lexer_compare_identifier_to_chars (left_p->char_p, right_p->char_p, length); + } + + uint8_t buf_p[64]; + + if (length <= 64) + { + lexer_convert_ident_to_cesu8 (buf_p, left_p->char_p, length); + return lexer_compare_identifier_to_chars (right_p->char_p, buf_p, length); + } + + uint8_t *dynamic_buf_p = parser_malloc (context_p, length); + + lexer_convert_ident_to_cesu8 (dynamic_buf_p, left_p->char_p, length); + bool result = lexer_compare_identifier_to_chars (right_p->char_p, dynamic_buf_p, length); + parser_free (dynamic_buf_p, length); + + return result; } /* lexer_compare_identifiers */ /** @@ -2818,7 +3052,7 @@ lexer_current_is_literal (parser_context_t *context_p, /**< context */ if (left_ident_p->length != right_ident_p->length) { - return 0; + return false; } if (!left_ident_p->has_escape && !right_ident_p->has_escape) @@ -2826,7 +3060,7 @@ lexer_current_is_literal (parser_context_t *context_p, /**< context */ return memcmp (left_ident_p->char_p, right_ident_p->char_p, left_ident_p->length) == 0; } - return lexer_compare_identifiers (left_ident_p->char_p, right_ident_p->char_p, left_ident_p->length); + return lexer_compare_identifiers (context_p, left_ident_p, right_ident_p); } /* lexer_current_is_literal */ #if ENABLED (JERRY_ES2015) diff --git a/jerry-core/parser/js/js-parser-internal.h b/jerry-core/parser/js/js-parser-internal.h index 00346df07..84ef10d21 100644 --- 
a/jerry-core/parser/js/js-parser-internal.h +++ b/jerry-core/parser/js/js-parser-internal.h @@ -637,8 +637,7 @@ bool lexer_check_yield_no_arg (parser_context_t *context_p); void lexer_parse_string (parser_context_t *context_p); void lexer_expect_identifier (parser_context_t *context_p, uint8_t literal_type); void lexer_scan_identifier (parser_context_t *context_p, uint32_t ident_opts); -ecma_char_t lexer_hex_to_character (parser_context_t *context_p, const uint8_t *source_p, int length); -void lexer_convert_ident_to_cesu8 (const uint8_t *source_p, uint8_t *destination_p, prop_length_t length); +void lexer_convert_ident_to_cesu8 (uint8_t *destination_p, const uint8_t *source_p, prop_length_t length); void lexer_expect_object_literal_id (parser_context_t *context_p, uint32_t ident_opts); void lexer_construct_literal_object (parser_context_t *context_p, const lexer_lit_location_t *literal_p, uint8_t literal_type); @@ -646,7 +645,9 @@ bool lexer_construct_number_object (parser_context_t *context_p, bool is_expr, b void lexer_convert_push_number_to_push_literal (parser_context_t *context_p); uint16_t lexer_construct_function_object (parser_context_t *context_p, uint32_t extra_status_flags); void lexer_construct_regexp_object (parser_context_t *context_p, bool parse_only); -bool lexer_compare_identifiers (const uint8_t *left_p, const uint8_t *right_p, size_t size); +bool lexer_compare_identifier_to_string (const lexer_lit_location_t *left_p, const uint8_t *right_p, size_t size); +bool lexer_compare_identifiers (parser_context_t *context_p, const lexer_lit_location_t *left_p, + const lexer_lit_location_t *right_p); bool lexer_current_is_literal (parser_context_t *context_p, const lexer_lit_location_t *right_ident_p); #if ENABLED (JERRY_ES2015) bool lexer_token_is_identifier (parser_context_t *context_p, const char *identifier_p, diff --git a/jerry-core/parser/js/js-scanner-util.c b/jerry-core/parser/js/js-scanner-util.c index 5404d46af..9573e6ac6 100644 --- 
a/jerry-core/parser/js/js-scanner-util.c +++ b/jerry-core/parser/js/js-scanner-util.c @@ -434,8 +434,7 @@ JERRY_STATIC_ASSERT (PARSER_MAXIMUM_IDENT_LENGTH <= UINT8_MAX, static inline bool JERRY_ATTR_ALWAYS_INLINE scanner_literal_is_arguments (lexer_lit_location_t *literal_p) /**< literal */ { - return (literal_p->length == 9 - && lexer_compare_identifiers (literal_p->char_p, (const uint8_t *) "arguments", 9)); + return lexer_compare_identifier_to_string (literal_p, (const uint8_t *) "arguments", 9); } /* scanner_literal_is_arguments */ /** @@ -986,7 +985,7 @@ scanner_add_custom_literal (parser_context_t *context_p, /**< context */ return literal_p; } } - else if (lexer_compare_identifiers (literal_p->char_p, char_p, length)) + else if (lexer_compare_identifier_to_string (literal_p, char_p, length)) { /* The non-escaped version is preferred. */ literal_p->char_p = char_p; @@ -1000,8 +999,7 @@ scanner_add_custom_literal (parser_context_t *context_p, /**< context */ { while ((literal_p = (lexer_lit_location_t *) parser_list_iterator_next (&literal_iterator)) != NULL) { - if (literal_p->length == length - && lexer_compare_identifiers (literal_p->char_p, char_p, length)) + if (lexer_compare_identifiers (context_p, literal_p, literal_location_p)) { return literal_p; } @@ -1065,10 +1063,11 @@ scanner_append_argument (parser_context_t *context_p, /**< context */ scanner_literal_pool_t *literal_pool_p = scanner_context_p->active_literal_pool_p; parser_list_iterator_t literal_iterator; parser_list_iterator_init (&literal_pool_p->literal_pool, &literal_iterator); + lexer_lit_location_t *literal_location_p = &context_p->token.lit_location; lexer_lit_location_t *literal_p; - const uint8_t *char_p = context_p->token.lit_location.char_p; - prop_length_t length = context_p->token.lit_location.length; + const uint8_t *char_p = literal_location_p->char_p; + prop_length_t length = literal_location_p->length; if (JERRY_LIKELY (!context_p->token.lit_location.has_escape)) { @@ -1084,7 
+1083,7 @@ scanner_append_argument (parser_context_t *context_p, /**< context */ break; } } - else if (lexer_compare_identifiers (literal_p->char_p, char_p, length)) + else if (lexer_compare_identifier_to_string (literal_p, char_p, length)) { literal_p->length = 0; break; @@ -1096,8 +1095,7 @@ scanner_append_argument (parser_context_t *context_p, /**< context */ { while ((literal_p = (lexer_lit_location_t *) parser_list_iterator_next (&literal_iterator)) != NULL) { - if (literal_p->length == length - && lexer_compare_identifiers (literal_p->char_p, char_p, length)) + if (lexer_compare_identifiers (context_p, literal_p, literal_location_p)) { literal_p->length = 0; break; @@ -1118,8 +1116,7 @@ void scanner_detect_eval_call (parser_context_t *context_p, /**< context */ scanner_context_t *scanner_context_p) /**< scanner context */ { - if (context_p->token.lit_location.length == 4 - && lexer_compare_identifiers (context_p->token.lit_location.char_p, (const uint8_t *) "eval", 4) + if (lexer_compare_identifier_to_string (&context_p->token.lit_location, (const uint8_t *) "eval", 4) && lexer_check_next_character (context_p, LIT_CHAR_LEFT_PAREN)) { scanner_context_p->active_literal_pool_p->status_flags |= SCANNER_LITERAL_POOL_NO_REG; @@ -1147,7 +1144,7 @@ scanner_scope_find_let_declaration (parser_context_t *context_p, /**< context */ { uint8_t *destination_p = (uint8_t *) scanner_malloc (context_p, literal_p->length); - lexer_convert_ident_to_cesu8 (literal_p->char_p, destination_p, literal_p->length); + lexer_convert_ident_to_cesu8 (destination_p, literal_p->char_p, literal_p->length); name_p = ecma_new_ecma_string_from_utf8 (destination_p, literal_p->length); scanner_free (destination_p, literal_p->length); @@ -1231,7 +1228,7 @@ scanner_detect_invalid_var (parser_context_t *context_p, /**< context */ return; } } - else if (lexer_compare_identifiers (literal_p->char_p, char_p, length)) + else if (lexer_compare_identifier_to_string (literal_p, char_p, length)) { 
scanner_raise_redeclaration_error (context_p); return; @@ -1246,8 +1243,7 @@ scanner_detect_invalid_var (parser_context_t *context_p, /**< context */ if (literal_p->type & SCANNER_LITERAL_IS_LOCAL && !(literal_p->type & SCANNER_LITERAL_IS_ARG) && (literal_p->type & SCANNER_LITERAL_IS_LOCAL) != SCANNER_LITERAL_IS_LOCAL - && literal_p->length == length - && lexer_compare_identifiers (literal_p->char_p, char_p, length)) + && lexer_compare_identifiers (context_p, literal_p, var_literal_p)) { scanner_raise_redeclaration_error (context_p); return; diff --git a/jerry-core/parser/js/js-scanner.c b/jerry-core/parser/js/js-scanner.c index 0b77aa134..2b1b40e38 100644 --- a/jerry-core/parser/js/js-scanner.c +++ b/jerry-core/parser/js/js-scanner.c @@ -376,8 +376,7 @@ scanner_handle_bracket (parser_context_t *context_p, /**< context */ arrow_source_p = NULL; #endif /* ENABLED (JERRY_ES2015) */ - if (context_p->token.lit_location.length == 4 - && lexer_compare_identifiers (context_p->token.lit_location.char_p, (const uint8_t *) "eval", 4)) + if (lexer_compare_identifier_to_string (&context_p->token.lit_location, (const uint8_t *) "eval", 4)) { scanner_context_p->active_literal_pool_p->status_flags |= SCANNER_LITERAL_POOL_NO_REG; } diff --git a/jerry-core/parser/regexp/re-compiler.c b/jerry-core/parser/regexp/re-compiler.c index b2134ac54..f82f89092 100644 --- a/jerry-core/parser/regexp/re-compiler.c +++ b/jerry-core/parser/regexp/re-compiler.c @@ -272,7 +272,7 @@ re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */ const bool is_char_class = (re_ctx_p->current_token.type == RE_TOK_START_CHAR_CLASS || re_ctx_p->current_token.type == RE_TOK_START_INV_CHAR_CLASS); - const ecma_char_t prev_char = lit_utf8_peek_prev (parser_ctx_p->input_curr_p); + const ecma_char_t prev_char = lit_cesu8_peek_prev (parser_ctx_p->input_curr_p); if (prev_char != LIT_CHAR_LEFT_SQUARE && prev_char != LIT_CHAR_CIRCUMFLEX) { lit_utf8_decr (&parser_ctx_p->input_curr_p); @@ -286,7 +286,7 
@@ re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */ return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string")); } - lit_code_point_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p); + lit_code_point_t ch = lit_cesu8_read_next (&parser_ctx_p->input_curr_p); if (ch == LIT_CHAR_RIGHT_SQUARE) { @@ -318,7 +318,7 @@ re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */ return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\'")); } - ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p); + ch = lit_cesu8_read_next (&parser_ctx_p->input_curr_p); if (ch == LIT_CHAR_LOWERCASE_B) { @@ -376,7 +376,7 @@ re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */ parser_ctx_p->input_curr_p += 2; if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p && is_range == false - && lit_utf8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS) + && lit_cesu8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS) { start = code_unit; continue; @@ -396,7 +396,7 @@ re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */ parser_ctx_p->input_curr_p += 4; if (parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p && is_range == false - && lit_utf8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS) + && lit_cesu8_peek_next (parser_ctx_p->input_curr_p) == LIT_CHAR_MINUS) { start = code_unit; continue; @@ -481,7 +481,7 @@ re_parse_char_class (re_compiler_ctx_t *re_ctx_p, /**< number of classes */ && lit_is_code_point_utf16_high_surrogate (ch) && parser_ctx_p->input_curr_p < parser_ctx_p->input_end_p) { - const ecma_char_t next_ch = lit_utf8_peek_next (parser_ctx_p->input_curr_p); + const ecma_char_t next_ch = lit_cesu8_peek_next (parser_ctx_p->input_curr_p); if (lit_is_code_point_utf16_low_surrogate (next_ch)) { ch = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) ch, next_ch); diff --git 
a/jerry-core/parser/regexp/re-parser.c b/jerry-core/parser/regexp/re-parser.c index 981aa0c0d..01f305e1b 100644 --- a/jerry-core/parser/regexp/re-parser.c +++ b/jerry-core/parser/regexp/re-parser.c @@ -315,7 +315,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * return ret_value; } - ecma_char_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p); + ecma_char_t ch = lit_cesu8_read_next (&parser_ctx_p->input_curr_p); switch (ch) { @@ -348,7 +348,7 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context * } out_token_p->type = RE_TOK_CHAR; - ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p); + ch = lit_cesu8_read_next (&parser_ctx_p->input_curr_p); if (ch == LIT_CHAR_LOWERCASE_B) { diff --git a/tests/jerry/es2015/identifier-escape.js b/tests/jerry/es2015/identifier-escape.js new file mode 100644 index 000000000..99d63dd41 --- /dev/null +++ b/tests/jerry/es2015/identifier-escape.js @@ -0,0 +1,36 @@ +/* Copyright JS Foundation and other contributors, http://js.foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +function check_syntax_error (code) { + try { + eval(code) + assert (false) + } catch (e) { + assert (e instanceof SyntaxError) + } +} + +eval("\u{000010C80}: break \ud803\udc80") +eval("\\u{10C80}: break \ud803\udc80") +eval("$\u{000010C80}$: break $\ud803\udc80$") +eval("$\\u{10C82}$: break $\ud803\udc82$") + +assert("\u{000010C80}".length === 2) +assert("x\u{010C80}y".length === 4) +assert("\u{10C80}" === "\ud803\u{dc80}") +assert("\u{0}\x01" === "\u0000\u0001") + +/* Surrogate pairs are not combined if they are passed as \u sequences. */ +check_syntax_error("\\u{10C80}: break \\ud803\\udc80"); diff --git a/tests/unit-core/test-lit-char-helpers.c b/tests/unit-core/test-lit-char-helpers.c index c203e5c3e..7374a7e18 100644 --- a/tests/unit-core/test-lit-char-helpers.c +++ b/tests/unit-core/test-lit-char-helpers.c @@ -21,6 +21,39 @@ #include "test-common.h" +static lit_code_point_t +lexer_hex_to_character (const uint8_t *source_p) /**< NUL-terminated string of hex digits */ +{ + lit_code_point_t result = 0; + + do + { + uint32_t byte = *source_p++; + + result <<= 4; + + if (byte >= LIT_CHAR_0 && byte <= LIT_CHAR_9) + { + result += byte - LIT_CHAR_0; + } + else + { + byte = LEXER_TO_ASCII_LOWERCASE (byte); + if (byte >= LIT_CHAR_LOWERCASE_A && byte <= LIT_CHAR_LOWERCASE_F) + { + result += byte - (LIT_CHAR_LOWERCASE_A - 10); + } + else + { + return UINT32_MAX; + } + } + } + while (*source_p); + + return result; +} /* lexer_hex_to_character */ + int main (void) { @@ -29,50 +62,59 @@ main (void) jmem_init (); ecma_init (); - const uint8_t _1_byte_long1[] = "\\u007F"; - const uint8_t _1_byte_long2[] = "\\u0000"; - const uint8_t _1_byte_long3[] = "\\u0065"; + const uint8_t _1_byte_long1[] = "007F"; + const uint8_t _1_byte_long2[] = "0000"; + const uint8_t _1_byte_long3[] = "0065"; - const uint8_t _2_byte_long1[] = "\\u008F"; - const uint8_t _2_byte_long2[] = "\\u00FF"; - const uint8_t _2_byte_long3[] = "\\u07FF"; + const uint8_t _2_byte_long1[] = "008F"; + const uint8_t
_2_byte_long2[] = "00FF"; + const uint8_t _2_byte_long3[] = "07FF"; - const uint8_t _3_byte_long1[] = "\\u08FF"; - const uint8_t _3_byte_long2[] = "\\u0FFF"; - const uint8_t _3_byte_long3[] = "\\uFFFF"; + const uint8_t _3_byte_long1[] = "08FF"; + const uint8_t _3_byte_long2[] = "0FFF"; + const uint8_t _3_byte_long3[] = "FFFF"; + + const uint8_t _6_byte_long1[] = "10000"; + const uint8_t _6_byte_long2[] = "10FFFF"; size_t length; /* Test 1-byte-long unicode sequences. */ - length = lit_char_get_utf8_length (lexer_hex_to_character (0, _1_byte_long1 + 2, 4)); + length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_1_byte_long1)); TEST_ASSERT (length == 1); - length = lit_char_get_utf8_length (lexer_hex_to_character (0, _1_byte_long2 + 2, 4)); + length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_1_byte_long2)); TEST_ASSERT (length == 1); - length = lit_char_get_utf8_length (lexer_hex_to_character (0, _1_byte_long3 + 2, 4)); + length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_1_byte_long3)); TEST_ASSERT (length == 1); /* Test 2-byte-long unicode sequences. */ - length = lit_char_get_utf8_length (lexer_hex_to_character (0, _2_byte_long1 + 2, 4)); + length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_2_byte_long1)); TEST_ASSERT (length == 2); - length = lit_char_get_utf8_length (lexer_hex_to_character (0, _2_byte_long2 + 2, 4)); + length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_2_byte_long2)); TEST_ASSERT (length == 2); - length = lit_char_get_utf8_length (lexer_hex_to_character (0, _2_byte_long3 + 2, 4)); + length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_2_byte_long3)); TEST_ASSERT (length == 2); /* Test 3-byte-long unicode sequences. 
*/ - length = lit_char_get_utf8_length (lexer_hex_to_character (0, _3_byte_long1 + 2, 4)); - TEST_ASSERT (length != 2); - - length = lit_char_get_utf8_length (lexer_hex_to_character (0, _3_byte_long2 + 2, 4)); + length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_3_byte_long1)); TEST_ASSERT (length == 3); - length = lit_char_get_utf8_length (lexer_hex_to_character (0, _3_byte_long3 + 2, 4)); + length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_3_byte_long2)); TEST_ASSERT (length == 3); + length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_3_byte_long3)); + TEST_ASSERT (length == 3); + + length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_6_byte_long1)); + TEST_ASSERT (length == 6); + + length = lit_code_point_get_cesu8_length (lexer_hex_to_character (_6_byte_long2)); + TEST_ASSERT (length == 6); + ecma_finalize (); jmem_finalize (); diff --git a/tests/unit-core/test-strings.c b/tests/unit-core/test-strings.c index a3b8b6b03..7896d7a03 100644 --- a/tests/unit-core/test-strings.c +++ b/tests/unit-core/test-strings.c @@ -131,7 +131,7 @@ main (void) while (curr_p < end_p) { - code_units[code_units_count] = lit_utf8_peek_next (curr_p); + code_units[code_units_count] = lit_cesu8_peek_next (curr_p); saved_positions[code_units_count] = curr_p; code_units_count++; calculated_length++; @@ -147,7 +147,7 @@ main (void) { ecma_length_t index = (ecma_length_t) rand () % code_units_count; curr_p = saved_positions[index]; - TEST_ASSERT (lit_utf8_peek_next (curr_p) == code_units[index]); + TEST_ASSERT (lit_cesu8_peek_next (curr_p) == code_units[index]); } } @@ -156,7 +156,7 @@ main (void) { TEST_ASSERT (code_units_count > 0); calculated_length--; - TEST_ASSERT (code_units[calculated_length] == lit_utf8_peek_prev (curr_p)); + TEST_ASSERT (code_units[calculated_length] == lit_cesu8_peek_prev (curr_p)); lit_utf8_decr (&curr_p); } @@ -164,7 +164,7 @@ main (void) while (curr_p < end_p) { - ecma_char_t code_unit = 
lit_utf8_read_next (&curr_p); + ecma_char_t code_unit = lit_cesu8_read_next (&curr_p); TEST_ASSERT (code_unit == code_units[calculated_length]); calculated_length++; } @@ -175,7 +175,7 @@ main (void) { TEST_ASSERT (code_units_count > 0); calculated_length--; - TEST_ASSERT (code_units[calculated_length] == lit_utf8_read_prev (&curr_p)); + TEST_ASSERT (code_units[calculated_length] == lit_cesu8_read_prev (&curr_p)); } TEST_ASSERT (calculated_length == 0); diff --git a/tests/unit-core/test-unicode.c b/tests/unit-core/test-unicode.c new file mode 100644 index 000000000..0e6a25a15 --- /dev/null +++ b/tests/unit-core/test-unicode.c @@ -0,0 +1,61 @@ +/* Copyright JS Foundation and other contributors, http://js.foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "jerryscript.h" +#include "test-common.h" + +static bool +test_syntax_error (const char *script_p) /**< NUL-terminated script source */ +{ + jerry_value_t parse_result = jerry_parse (NULL, + 0, + (const jerry_char_t *) script_p, + strlen (script_p), + JERRY_PARSE_NO_OPTS); + + bool result = false; + + if (jerry_value_is_error (parse_result)) + { + result = true; + TEST_ASSERT (jerry_get_error_type (parse_result) == JERRY_ERROR_SYNTAX); + } + + jerry_release_value (parse_result); + return result; +} /* test_syntax_error */ + +int +main (void) +{ + jerry_init (JERRY_INIT_EMPTY); + + if (!test_syntax_error ("\\u{61}")) + { + TEST_ASSERT (!test_syntax_error ("\xF0\x90\xB2\x80: break \\u{10C80}")); + /* The \u surrogate pairs are ignored. The \u{hex} form must be used. */ + TEST_ASSERT (test_syntax_error ("\xF0\x90\xB2\x80: break \\ud803\\udc80")); + /* The utf8 code point and the cesu8 surrogate pair must match. */ + TEST_ASSERT (!test_syntax_error ("\xF0\x90\xB2\x80: break \xed\xa0\x83\xed\xb2\x80")); + + TEST_ASSERT (!test_syntax_error ("$\xF0\x90\xB2\x80$: break $\\u{10C80}$")); + TEST_ASSERT (test_syntax_error ("$\xF0\x90\xB2\x80$: break $\\ud803\\udc80$")); + TEST_ASSERT (!test_syntax_error ("$\xF0\x90\xB2\x80$: break $\xed\xa0\x83\xed\xb2\x80$")); + } + + jerry_cleanup (); + + return 0; +} /* main */ diff --git a/tools/run-tests.py b/tools/run-tests.py index a175b2997..a51c2c32d 100755 --- a/tools/run-tests.py +++ b/tools/run-tests.py @@ -284,6 +284,7 @@ def create_binary(job, options): subprocess.check_output(build_cmd) ret = 0 except subprocess.CalledProcessError as err: + print(err.output) ret = err.returncode BINARY_CACHE[binary_key] = (ret, build_dir_path)