mirror of
https://github.com/jerryscript-project/jerryscript.git
synced 2025-12-15 16:29:21 +00:00
- remove all '#JERRY_ESNEXT' macro - remove 5.1 build profile, update test runner accordingly (Note: all builtins are turn on by default) - move tests from tests/jerry/esnext into tests/jerry, concatenate files with same names - add skiplist to some snapshot tests that were supported only in 5.1 - fix doxygen issues that were hidden before (bc. of es.next macro) Co-authored-by: Martin Negyokru negyokru@inf.u-szeged.hu JerryScript-DCO-1.0-Signed-off-by: Adam Szilagyi aszilagy@inf.u-szeged.hu
996 lines
34 KiB
C
996 lines
34 KiB
C
/* Copyright JS Foundation and other contributors, http://js.foundation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "lit-char-helpers.h"
|
|
|
|
#include "ecma-helpers.h"
|
|
|
|
#include "config.h"
|
|
#include "lit-strings.h"
|
|
#include "lit-unicode-ranges-sup.inc.h"
|
|
#include "lit-unicode-ranges.inc.h"
|
|
|
|
#if JERRY_UNICODE_CASE_CONVERSION
|
|
#include "lit-unicode-conversions-sup.inc.h"
|
|
#include "lit-unicode-conversions.inc.h"
|
|
#include "lit-unicode-folding.inc.h"
|
|
#endif /* JERRY_UNICODE_CASE_CONVERSION */
|
|
|
|
#define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof ((array)[0]))
|
|
|
|
/**
|
|
* Binary search algorithm that searches the a
|
|
* character in the given char array.
|
|
*
|
|
* @return true - if the character is in the given array
|
|
* false - otherwise
|
|
*/
|
|
#define LIT_SEARCH_CHAR_IN_ARRAY_FN(function_name, char_type, array_type) \
|
|
static bool function_name (char_type c, /**< code unit */ \
|
|
const array_type *array, /**< array */ \
|
|
int size_of_array) /**< length of the array */ \
|
|
{ \
|
|
int bottom = 0; \
|
|
int top = size_of_array - 1; \
|
|
\
|
|
while (bottom <= top) \
|
|
{ \
|
|
int middle = (bottom + top) / 2; \
|
|
char_type current = array[middle]; \
|
|
\
|
|
if (current == c) \
|
|
{ \
|
|
return true; \
|
|
} \
|
|
\
|
|
if (c < current) \
|
|
{ \
|
|
top = middle - 1; \
|
|
} \
|
|
else \
|
|
{ \
|
|
bottom = middle + 1; \
|
|
} \
|
|
} \
|
|
\
|
|
return false; \
|
|
} /* __function_name */
|
|
|
|
LIT_SEARCH_CHAR_IN_ARRAY_FN (lit_search_char_in_array, ecma_char_t, uint16_t)
|
|
|
|
LIT_SEARCH_CHAR_IN_ARRAY_FN (lit_search_codepoint_in_array, lit_code_point_t, uint32_t)
|
|
|
|
/**
|
|
* Binary search algorithm that searches a character in the given intervals.
|
|
* Intervals specifed by two arrays. The first one contains the starting points
|
|
* of the intervals, the second one contains the length of them.
|
|
*
|
|
* @return true - if the the character is included (inclusively) in one of the intervals in the given array
|
|
* false - otherwise
|
|
*/
|
|
#define LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN(function_name, char_type, array_type, interval_type) \
|
|
static bool function_name (char_type c, /**< code unit */ \
|
|
const array_type *array_sp, /**< array of interval starting points */ \
|
|
const interval_type *lengths, /**< array of interval lengths */ \
|
|
int size_of_array) /**< length of the array */ \
|
|
{ \
|
|
int bottom = 0; \
|
|
int top = size_of_array - 1; \
|
|
\
|
|
while (bottom <= top) \
|
|
{ \
|
|
int middle = (bottom + top) / 2; \
|
|
char_type current_sp = array_sp[middle]; \
|
|
\
|
|
if (current_sp <= c && c <= current_sp + lengths[middle]) \
|
|
{ \
|
|
return true; \
|
|
} \
|
|
\
|
|
if (c > current_sp) \
|
|
{ \
|
|
bottom = middle + 1; \
|
|
} \
|
|
else \
|
|
{ \
|
|
top = middle - 1; \
|
|
} \
|
|
} \
|
|
\
|
|
return false; \
|
|
} /* function_name */
|
|
|
|
LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN (lit_search_char_in_interval_array, ecma_char_t, uint16_t, uint8_t)
|
|
|
|
LIT_SEARCH_CHAR_IN_INTERVAL_ARRAY_FN (lit_search_codepoint_in_interval_array, lit_code_point_t, uint32_t, uint16_t)
|
|
|
|
/**
|
|
* Check if specified character is one of the Whitespace characters including those that fall into
|
|
* "Space, Separator" ("Zs") Unicode character category or one of the Line Terminator characters.
|
|
*
|
|
* @return true - if the character is one of characters, listed in ECMA-262 v5, Table 2,
|
|
* false - otherwise
|
|
*/
|
|
bool
|
|
lit_char_is_white_space (lit_code_point_t c) /**< code point */
|
|
{
|
|
if (c <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
|
{
|
|
return (c == LIT_CHAR_SP || (c >= LIT_CHAR_TAB && c <= LIT_CHAR_CR));
|
|
}
|
|
|
|
if (c == LIT_CHAR_BOM || c == LIT_CHAR_LS || c == LIT_CHAR_PS)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
return (c <= LIT_UTF16_CODE_UNIT_MAX
|
|
&& ((c >= lit_unicode_white_space_interval_starts[0]
|
|
&& c <= lit_unicode_white_space_interval_starts[0] + lit_unicode_white_space_interval_lengths[0])
|
|
|| lit_search_char_in_array ((ecma_char_t) c,
|
|
lit_unicode_white_space_chars,
|
|
NUM_OF_ELEMENTS (lit_unicode_white_space_chars))));
|
|
} /* lit_char_is_white_space */
|
|
|
|
/**
|
|
* Check if specified character is one of LineTerminator characters
|
|
*
|
|
* @return true - if the character is one of characters, listed in ECMA-262 v5, Table 3,
|
|
* false - otherwise
|
|
*/
|
|
bool
|
|
lit_char_is_line_terminator (ecma_char_t c) /**< code unit */
|
|
{
|
|
return (c == LIT_CHAR_LF || c == LIT_CHAR_CR || c == LIT_CHAR_LS || c == LIT_CHAR_PS);
|
|
} /* lit_char_is_line_terminator */
|
|
|
|
/**
|
|
* Check if specified character is a Unicode ID_Start
|
|
*
|
|
* See also:
|
|
* ECMA-262 v1, 11.6: UnicodeIDStart
|
|
*
|
|
* @return true - if the codepoint has Unicode property "ID_Start"
|
|
* false - otherwise
|
|
*/
|
|
static bool
|
|
lit_char_is_unicode_id_start (lit_code_point_t code_point) /**< code unit */
|
|
{
|
|
if (JERRY_UNLIKELY (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN))
|
|
{
|
|
return (lit_search_codepoint_in_interval_array (code_point,
|
|
lit_unicode_id_start_interval_starts_sup,
|
|
lit_unicode_id_start_interval_lengths_sup,
|
|
NUM_OF_ELEMENTS (lit_unicode_id_start_interval_starts_sup))
|
|
|| lit_search_codepoint_in_array (code_point,
|
|
lit_unicode_id_start_chars_sup,
|
|
NUM_OF_ELEMENTS (lit_unicode_id_start_chars_sup)));
|
|
}
|
|
ecma_char_t c = (ecma_char_t) code_point;
|
|
|
|
return (lit_search_char_in_interval_array (c,
|
|
lit_unicode_id_start_interval_starts,
|
|
lit_unicode_id_start_interval_lengths,
|
|
NUM_OF_ELEMENTS (lit_unicode_id_start_interval_starts))
|
|
|| lit_search_char_in_array (c, lit_unicode_id_start_chars, NUM_OF_ELEMENTS (lit_unicode_id_start_chars)));
|
|
} /* lit_char_is_unicode_id_start */
|
|
|
|
/**
|
|
* Check if specified character is a Unicode ID_Continue
|
|
*
|
|
* See also:
|
|
* ECMA-262 v1, 11.6: UnicodeIDContinue
|
|
*
|
|
* @return true - if the codepoint has Unicode property "ID_Continue"
|
|
* false - otherwise
|
|
*/
|
|
static bool
|
|
lit_char_is_unicode_id_continue (lit_code_point_t code_point) /**< code unit */
|
|
{
|
|
/* Each ID_Start codepoint is ID_Continue as well. */
|
|
if (lit_char_is_unicode_id_start (code_point))
|
|
{
|
|
return true;
|
|
}
|
|
|
|
if (JERRY_UNLIKELY (code_point >= LIT_UTF8_4_BYTE_CODE_POINT_MIN))
|
|
{
|
|
return (lit_search_codepoint_in_interval_array (code_point,
|
|
lit_unicode_id_continue_interval_starts_sup,
|
|
lit_unicode_id_continue_interval_lengths_sup,
|
|
NUM_OF_ELEMENTS (lit_unicode_id_continue_interval_starts_sup))
|
|
|| lit_search_codepoint_in_array (code_point,
|
|
lit_unicode_id_continue_chars_sup,
|
|
NUM_OF_ELEMENTS (lit_unicode_id_continue_chars_sup)));
|
|
}
|
|
ecma_char_t c = (ecma_char_t) code_point;
|
|
|
|
return (
|
|
lit_search_char_in_interval_array (c,
|
|
lit_unicode_id_continue_interval_starts,
|
|
lit_unicode_id_continue_interval_lengths,
|
|
NUM_OF_ELEMENTS (lit_unicode_id_continue_interval_starts))
|
|
|| lit_search_char_in_array (c, lit_unicode_id_continue_chars, NUM_OF_ELEMENTS (lit_unicode_id_continue_chars)));
|
|
} /* lit_char_is_unicode_id_continue */
|
|
|
|
/**
|
|
* Checks whether the character is a valid identifier start.
|
|
*
|
|
* @return true if it is.
|
|
*/
|
|
bool
|
|
lit_code_point_is_identifier_start (lit_code_point_t code_point) /**< code point */
|
|
{
|
|
/* Fast path for ASCII-defined letters. */
|
|
if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
|
{
|
|
return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A
|
|
&& LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z)
|
|
|| code_point == LIT_CHAR_DOLLAR_SIGN || code_point == LIT_CHAR_UNDERSCORE);
|
|
}
|
|
|
|
return lit_char_is_unicode_id_start (code_point);
|
|
} /* lit_code_point_is_identifier_start */
|
|
|
|
/**
|
|
* Checks whether the character is a valid identifier part.
|
|
*
|
|
* @return true if it is.
|
|
*/
|
|
bool
|
|
lit_code_point_is_identifier_part (lit_code_point_t code_point) /**< code point */
|
|
{
|
|
/* Fast path for ASCII-defined letters. */
|
|
if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
|
{
|
|
return ((LEXER_TO_ASCII_LOWERCASE (code_point) >= LIT_CHAR_LOWERCASE_A
|
|
&& LEXER_TO_ASCII_LOWERCASE (code_point) <= LIT_CHAR_LOWERCASE_Z)
|
|
|| (code_point >= LIT_CHAR_0 && code_point <= LIT_CHAR_9) || code_point == LIT_CHAR_DOLLAR_SIGN
|
|
|| code_point == LIT_CHAR_UNDERSCORE);
|
|
}
|
|
|
|
return lit_char_is_unicode_id_continue (code_point);
|
|
} /* lit_code_point_is_identifier_part */
|
|
|
|
/**
|
|
* Check if specified character is one of OctalDigit characters (ECMA-262 v5, B.1.2)
|
|
*
|
|
* @return true / false
|
|
*/
|
|
bool
|
|
lit_char_is_octal_digit (ecma_char_t c) /**< code unit */
|
|
{
|
|
return (c >= LIT_CHAR_ASCII_OCTAL_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_OCTAL_DIGITS_END);
|
|
} /* lit_char_is_octal_digit */
|
|
|
|
/**
|
|
* Check if specified character is one of DecimalDigit characters (ECMA-262 v5, 7.8.3)
|
|
*
|
|
* @return true / false
|
|
*/
|
|
bool
|
|
lit_char_is_decimal_digit (ecma_char_t c) /**< code unit */
|
|
{
|
|
return (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END);
|
|
} /* lit_char_is_decimal_digit */
|
|
|
|
/**
|
|
* Check if specified character is one of HexDigit characters (ECMA-262 v5, 7.8.3)
|
|
*
|
|
* @return true / false
|
|
*/
|
|
bool
|
|
lit_char_is_hex_digit (ecma_char_t c) /**< code unit */
|
|
{
|
|
return ((c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
|
|
|| (LEXER_TO_ASCII_LOWERCASE (c) >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
|
|
&& LEXER_TO_ASCII_LOWERCASE (c) <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END));
|
|
} /* lit_char_is_hex_digit */
|
|
|
|
/**
|
|
* Check if specified character is one of BinaryDigits characters (ECMA-262 v6, 11.8.3)
|
|
*
|
|
* @return true / false
|
|
*/
|
|
bool
|
|
lit_char_is_binary_digit (ecma_char_t c) /** code unit */
|
|
{
|
|
return (c == LIT_CHAR_0 || c == LIT_CHAR_1);
|
|
} /* lit_char_is_binary_digit */
|
|
|
|
/**
|
|
* @return radix value
|
|
*/
|
|
uint8_t
|
|
lit_char_to_radix (lit_utf8_byte_t c) /** code unit */
|
|
{
|
|
switch (LEXER_TO_ASCII_LOWERCASE (c))
|
|
{
|
|
case LIT_CHAR_LOWERCASE_X:
|
|
{
|
|
return 16;
|
|
}
|
|
case LIT_CHAR_LOWERCASE_O:
|
|
{
|
|
return 8;
|
|
}
|
|
case LIT_CHAR_LOWERCASE_B:
|
|
{
|
|
return 2;
|
|
}
|
|
default:
|
|
{
|
|
return 10;
|
|
}
|
|
}
|
|
} /* lit_char_to_radix */
|
|
|
|
/**
|
|
* UnicodeEscape abstract method
|
|
*
|
|
* See also: ECMA-262 v10, 24.5.2.3
|
|
*/
|
|
void
|
|
lit_char_unicode_escape (ecma_stringbuilder_t *builder_p, /**< stringbuilder to append */
|
|
ecma_char_t c) /**< code unit to convert */
|
|
{
|
|
ecma_stringbuilder_append_raw (builder_p, (lit_utf8_byte_t *) "\\u", 2);
|
|
|
|
for (int8_t i = 3; i >= 0; i--)
|
|
{
|
|
int32_t result_char = (c >> (i * 4)) & 0xF;
|
|
ecma_stringbuilder_append_byte (
|
|
builder_p,
|
|
(lit_utf8_byte_t) (result_char + (result_char <= 9 ? LIT_CHAR_0 : (LIT_CHAR_LOWERCASE_A - 10))));
|
|
}
|
|
} /* lit_char_unicode_escape */
|
|
|
|
/**
|
|
* Convert a HexDigit character to its numeric value, as defined in ECMA-262 v5, 7.8.3
|
|
*
|
|
* @return digit value, corresponding to the hex char
|
|
*/
|
|
uint32_t
|
|
lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to
|
|
* one of HexDigit characters */
|
|
{
|
|
JERRY_ASSERT (lit_char_is_hex_digit (c));
|
|
|
|
if (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
|
|
{
|
|
return (uint32_t) (c - LIT_CHAR_ASCII_DIGITS_BEGIN);
|
|
}
|
|
|
|
const uint32_t hex_offset = 10 - (LIT_CHAR_LOWERCASE_A % 32);
|
|
return (c % 32) + hex_offset;
|
|
} /* lit_char_hex_to_int */
|
|
|
|
/**
|
|
* Converts a character to UTF8 bytes.
|
|
*
|
|
* @return length of the UTF8 representation.
|
|
*/
|
|
size_t
|
|
lit_code_point_to_cesu8_bytes (uint8_t *dst_p, /**< destination buffer */
|
|
lit_code_point_t code_point) /**< code point */
|
|
{
|
|
if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN)
|
|
{
|
|
/* 00000000 0xxxxxxx -> 0xxxxxxx */
|
|
dst_p[0] = (uint8_t) code_point;
|
|
return 1;
|
|
}
|
|
|
|
if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN)
|
|
{
|
|
/* 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx */
|
|
dst_p[0] = (uint8_t) (LIT_UTF8_2_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_5_BITS_MASK));
|
|
dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
|
|
return 2;
|
|
}
|
|
|
|
if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN)
|
|
{
|
|
/* zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx */
|
|
dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | ((code_point >> 12) & LIT_UTF8_LAST_4_BITS_MASK));
|
|
dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 6) & LIT_UTF8_LAST_6_BITS_MASK));
|
|
dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
|
|
return 3;
|
|
}
|
|
|
|
JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);
|
|
|
|
code_point -= LIT_UTF8_4_BYTE_CODE_POINT_MIN;
|
|
|
|
dst_p[0] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd);
|
|
dst_p[1] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x20 | ((code_point >> 16) & LIT_UTF8_LAST_4_BITS_MASK));
|
|
dst_p[2] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((code_point >> 10) & LIT_UTF8_LAST_6_BITS_MASK));
|
|
|
|
dst_p[3] = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | 0xd);
|
|
dst_p[4] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | 0x30 | ((code_point >> 6) & LIT_UTF8_LAST_4_BITS_MASK));
|
|
dst_p[5] = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (code_point & LIT_UTF8_LAST_6_BITS_MASK));
|
|
|
|
return 3 * 2;
|
|
} /* lit_code_point_to_cesu8_bytes */
|
|
|
|
/**
|
|
* Returns the length of the UTF8 representation of a character.
|
|
*
|
|
* @return length of the UTF8 representation.
|
|
*/
|
|
size_t
|
|
lit_code_point_get_cesu8_length (lit_code_point_t code_point) /**< code point */
|
|
{
|
|
if (code_point < LIT_UTF8_2_BYTE_CODE_POINT_MIN)
|
|
{
|
|
/* 00000000 0xxxxxxx */
|
|
return 1;
|
|
}
|
|
|
|
if (code_point < LIT_UTF8_3_BYTE_CODE_POINT_MIN)
|
|
{
|
|
/* 00000yyy yyxxxxxx */
|
|
return 2;
|
|
}
|
|
|
|
if (code_point < LIT_UTF8_4_BYTE_CODE_POINT_MIN)
|
|
{
|
|
/* zzzzyyyy yyxxxxxx */
|
|
return 3;
|
|
}
|
|
|
|
/* high + low surrogate */
|
|
return 2 * 3;
|
|
} /* lit_code_point_get_cesu8_length */
|
|
|
|
/**
|
|
* Convert a four byte long utf8 character to two three byte long cesu8 characters
|
|
*/
|
|
void
|
|
lit_four_byte_utf8_char_to_cesu8 (uint8_t *dst_p, /**< destination buffer */
|
|
const uint8_t *source_p) /**< source buffer */
|
|
{
|
|
lit_code_point_t code_point = ((((uint32_t) source_p[0]) & LIT_UTF8_LAST_3_BITS_MASK) << 18);
|
|
code_point |= ((((uint32_t) source_p[1]) & LIT_UTF8_LAST_6_BITS_MASK) << 12);
|
|
code_point |= ((((uint32_t) source_p[2]) & LIT_UTF8_LAST_6_BITS_MASK) << 6);
|
|
code_point |= (((uint32_t) source_p[3]) & LIT_UTF8_LAST_6_BITS_MASK);
|
|
|
|
lit_code_point_to_cesu8_bytes (dst_p, code_point);
|
|
} /* lit_four_byte_utf8_char_to_cesu8 */
|
|
|
|
/**
|
|
* Lookup hex digits in a buffer
|
|
*
|
|
* @return UINT32_MAX - if next 'lookup' number of characters do not form a valid hex number
|
|
* value of hex number, otherwise
|
|
*/
|
|
uint32_t
|
|
lit_char_hex_lookup (const lit_utf8_byte_t *buf_p, /**< buffer */
|
|
const lit_utf8_byte_t *const buf_end_p, /**< buffer end */
|
|
uint32_t lookup) /**< size of lookup */
|
|
{
|
|
JERRY_ASSERT (lookup <= 4);
|
|
|
|
if (JERRY_UNLIKELY (buf_p + lookup > buf_end_p))
|
|
{
|
|
return UINT32_MAX;
|
|
}
|
|
|
|
uint32_t value = 0;
|
|
|
|
while (lookup--)
|
|
{
|
|
lit_utf8_byte_t ch = *buf_p++;
|
|
if (!lit_char_is_hex_digit (ch))
|
|
{
|
|
return UINT32_MAX;
|
|
}
|
|
|
|
value <<= 4;
|
|
value += lit_char_hex_to_int (ch);
|
|
}
|
|
|
|
JERRY_ASSERT (value <= LIT_UTF16_CODE_UNIT_MAX);
|
|
return value;
|
|
} /* lit_char_hex_lookup */
|
|
|
|
/**
|
|
* Parse a decimal number with the value clamped to UINT32_MAX.
|
|
*
|
|
* @returns uint32_t number
|
|
*/
|
|
uint32_t
|
|
lit_parse_decimal (const lit_utf8_byte_t **buffer_p, /**< [in/out] character buffer */
|
|
const lit_utf8_byte_t *buffer_end_p) /**< buffer end */
|
|
{
|
|
const lit_utf8_byte_t *current_p = *buffer_p;
|
|
JERRY_ASSERT (lit_char_is_decimal_digit (*current_p));
|
|
|
|
uint32_t value = (uint32_t) (*current_p++ - LIT_CHAR_0);
|
|
|
|
while (current_p < buffer_end_p && lit_char_is_decimal_digit (*current_p))
|
|
{
|
|
const uint32_t digit = (uint32_t) (*current_p++ - LIT_CHAR_0);
|
|
uint32_t new_value = value * 10 + digit;
|
|
|
|
if (JERRY_UNLIKELY (value > UINT32_MAX / 10) || JERRY_UNLIKELY (new_value < value))
|
|
{
|
|
value = UINT32_MAX;
|
|
continue;
|
|
}
|
|
|
|
value = new_value;
|
|
}
|
|
|
|
*buffer_p = current_p;
|
|
return value;
|
|
} /* lit_parse_decimal */
|
|
|
|
/**
|
|
* Check if specified character is a word character (part of IsWordChar abstract operation)
|
|
*
|
|
* See also: ECMA-262 v5, 15.10.2.6 (IsWordChar)
|
|
*
|
|
* @return true - if the character is a word character
|
|
* false - otherwise
|
|
*/
|
|
bool
|
|
lit_char_is_word_char (lit_code_point_t c) /**< code point */
|
|
{
|
|
return ((c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
|
|
|| (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
|
|
|| (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END) || c == LIT_CHAR_UNDERSCORE);
|
|
} /* lit_char_is_word_char */
|
|
|
|
#if JERRY_UNICODE_CASE_CONVERSION
|
|
|
|
/**
|
|
* Check if the specified character is in one of those tables which contain bidirectional conversions.
|
|
*
|
|
* @return codepoint of the converted character if it is found the the tables
|
|
* LIT_INVALID_CP - otherwise.
|
|
*/
|
|
static lit_code_point_t
|
|
lit_search_in_bidirectional_conversion_tables (lit_code_point_t cp, /**< code point */
|
|
bool is_lowercase) /**< is lowercase conversion */
|
|
{
|
|
/* 1, Check if the specified character is part of the lit_unicode_character_case_ranges_{sup} table. */
|
|
int number_of_case_ranges;
|
|
bool is_supplementary = cp > LIT_UTF16_CODE_UNIT_MAX;
|
|
|
|
if (is_supplementary)
|
|
{
|
|
number_of_case_ranges = NUM_OF_ELEMENTS (lit_unicode_character_case_ranges_sup);
|
|
}
|
|
else
|
|
{
|
|
number_of_case_ranges = NUM_OF_ELEMENTS (lit_unicode_character_case_ranges);
|
|
}
|
|
|
|
int conv_counter = 0;
|
|
|
|
for (int i = 0; i < number_of_case_ranges; i++)
|
|
{
|
|
if (i % 2 == 0 && i > 0)
|
|
{
|
|
conv_counter++;
|
|
}
|
|
|
|
size_t range_length;
|
|
lit_code_point_t start_point;
|
|
|
|
if (is_supplementary)
|
|
{
|
|
range_length = lit_unicode_character_case_range_lengths_sup[conv_counter];
|
|
start_point = lit_unicode_character_case_ranges_sup[i];
|
|
}
|
|
else
|
|
{
|
|
range_length = lit_unicode_character_case_range_lengths[conv_counter];
|
|
start_point = lit_unicode_character_case_ranges[i];
|
|
}
|
|
|
|
if (start_point > cp || cp >= start_point + range_length)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
uint32_t char_dist = (uint32_t) cp - start_point;
|
|
int offset;
|
|
if (i % 2 == 0)
|
|
{
|
|
if (!is_lowercase)
|
|
{
|
|
return cp;
|
|
}
|
|
|
|
offset = i + 1;
|
|
}
|
|
else
|
|
{
|
|
if (is_lowercase)
|
|
{
|
|
return cp;
|
|
}
|
|
|
|
offset = i - 1;
|
|
}
|
|
|
|
if (is_supplementary)
|
|
{
|
|
start_point = lit_unicode_character_case_ranges_sup[offset];
|
|
}
|
|
else
|
|
{
|
|
start_point = lit_unicode_character_case_ranges[offset];
|
|
}
|
|
|
|
return (lit_code_point_t) (start_point + char_dist);
|
|
}
|
|
|
|
/* Note: After this point based on the latest unicode standard(13.0.0.6) no conversion characters are
|
|
defined for supplementary planes */
|
|
if (is_supplementary)
|
|
{
|
|
return cp;
|
|
}
|
|
|
|
/* 2, Check if the specified character is part of the character_pair_ranges table. */
|
|
int bottom = 0;
|
|
int top = NUM_OF_ELEMENTS (lit_unicode_character_pair_ranges) - 1;
|
|
|
|
while (bottom <= top)
|
|
{
|
|
int middle = (bottom + top) / 2;
|
|
lit_code_point_t current_sp = lit_unicode_character_pair_ranges[middle];
|
|
|
|
if (current_sp <= cp && cp < current_sp + lit_unicode_character_pair_range_lengths[middle])
|
|
{
|
|
uint32_t char_dist = (uint32_t) (cp - current_sp);
|
|
|
|
if ((cp - current_sp) % 2 == 0)
|
|
{
|
|
return is_lowercase ? (lit_code_point_t) (current_sp + char_dist + 1) : cp;
|
|
}
|
|
|
|
return is_lowercase ? cp : (lit_code_point_t) (current_sp + char_dist - 1);
|
|
}
|
|
|
|
if (cp > current_sp)
|
|
{
|
|
bottom = middle + 1;
|
|
}
|
|
else
|
|
{
|
|
top = middle - 1;
|
|
}
|
|
}
|
|
|
|
/* 3, Check if the specified character is part of the character_pairs table. */
|
|
int number_of_character_pairs = NUM_OF_ELEMENTS (lit_unicode_character_pairs);
|
|
|
|
for (int i = 0; i < number_of_character_pairs; i++)
|
|
{
|
|
if (cp != lit_unicode_character_pairs[i])
|
|
{
|
|
continue;
|
|
}
|
|
|
|
if (i % 2 == 0)
|
|
{
|
|
return is_lowercase ? lit_unicode_character_pairs[i + 1] : cp;
|
|
}
|
|
|
|
return is_lowercase ? cp : lit_unicode_character_pairs[i - 1];
|
|
}
|
|
|
|
return LIT_INVALID_CP;
|
|
} /* lit_search_in_bidirectional_conversion_tables */
|
|
|
|
/**
|
|
* Check if the specified character is in the given conversion table.
|
|
*
|
|
* @return LIT_MULTIPLE_CU if the converted character consist more than a single code unit
|
|
* converted code point - otherwise
|
|
*/
|
|
static lit_code_point_t
|
|
lit_search_in_conversion_table (ecma_char_t character, /**< code unit */
|
|
ecma_stringbuilder_t *builder_p, /**< string builder */
|
|
const ecma_char_t *array, /**< array */
|
|
const uint8_t *counters) /**< case_values counter */
|
|
{
|
|
int end_point = 0;
|
|
|
|
for (int i = 0; i < 3; i++)
|
|
{
|
|
int start_point = end_point;
|
|
int size_of_case_value = i + 1;
|
|
end_point += counters[i] * (size_of_case_value + 1);
|
|
|
|
int bottom = start_point;
|
|
int top = end_point - size_of_case_value;
|
|
|
|
while (bottom <= top)
|
|
{
|
|
int middle = (bottom + top) / 2;
|
|
|
|
middle -= ((middle - bottom) % (size_of_case_value + 1));
|
|
|
|
ecma_char_t current = array[middle];
|
|
|
|
if (current == character)
|
|
{
|
|
if (builder_p != NULL)
|
|
{
|
|
ecma_stringbuilder_append_char (builder_p, array[middle + 1]);
|
|
|
|
if (size_of_case_value > 1)
|
|
{
|
|
ecma_stringbuilder_append_char (builder_p, array[middle + 2]);
|
|
}
|
|
if (size_of_case_value > 2)
|
|
{
|
|
ecma_stringbuilder_append_char (builder_p, array[middle + 3]);
|
|
}
|
|
}
|
|
|
|
return size_of_case_value == 1 ? array[middle + 1] : LIT_MULTIPLE_CU;
|
|
}
|
|
|
|
if (character < current)
|
|
{
|
|
top = middle - (size_of_case_value + 1);
|
|
}
|
|
else
|
|
{
|
|
bottom = middle + (size_of_case_value + 1);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (builder_p != NULL)
|
|
{
|
|
ecma_stringbuilder_append_char (builder_p, character);
|
|
}
|
|
|
|
return (lit_code_point_t) character;
|
|
} /* lit_search_in_conversion_table */
|
|
#endif /* JERRY_UNICODE_CASE_CONVERSION */
|
|
|
|
/**
|
|
* Append the converted lowercase codeunit sequence of an a given codepoint into the stringbuilder if it is present.
|
|
*
|
|
* @return LIT_MULTIPLE_CU if the converted codepoint consist more than a single code unit
|
|
* converted code point - otherwise
|
|
*/
|
|
lit_code_point_t
|
|
lit_char_to_lower_case (lit_code_point_t cp, /**< code point */
|
|
ecma_stringbuilder_t *builder_p) /**< string builder */
|
|
{
|
|
if (cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
|
{
|
|
if (cp >= LIT_CHAR_UPPERCASE_A && cp <= LIT_CHAR_UPPERCASE_Z)
|
|
{
|
|
cp = (lit_utf8_byte_t) (cp + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
|
|
}
|
|
|
|
if (builder_p != NULL)
|
|
{
|
|
ecma_stringbuilder_append_byte (builder_p, (lit_utf8_byte_t) cp);
|
|
}
|
|
|
|
return cp;
|
|
}
|
|
|
|
#if JERRY_UNICODE_CASE_CONVERSION
|
|
lit_code_point_t lowercase_cp = lit_search_in_bidirectional_conversion_tables (cp, true);
|
|
|
|
if (lowercase_cp != LIT_INVALID_CP)
|
|
{
|
|
if (builder_p != NULL)
|
|
{
|
|
ecma_stringbuilder_append_codepoint (builder_p, lowercase_cp);
|
|
}
|
|
|
|
return lowercase_cp;
|
|
}
|
|
|
|
JERRY_ASSERT (cp < LIT_UTF8_4_BYTE_CODE_POINT_MIN);
|
|
|
|
int num_of_lowercase_ranges = NUM_OF_ELEMENTS (lit_unicode_lower_case_ranges);
|
|
|
|
for (int i = 0, j = 0; i < num_of_lowercase_ranges; i += 2, j++)
|
|
{
|
|
JERRY_ASSERT (lit_unicode_lower_case_range_lengths[j] > 0);
|
|
uint32_t range_length = (uint32_t) (lit_unicode_lower_case_range_lengths[j] - 1);
|
|
lit_code_point_t start_point = lit_unicode_lower_case_ranges[i];
|
|
|
|
if (start_point <= cp && cp <= start_point + range_length)
|
|
{
|
|
lowercase_cp = lit_unicode_lower_case_ranges[i + 1] + (cp - start_point);
|
|
if (builder_p != NULL)
|
|
{
|
|
ecma_stringbuilder_append_codepoint (builder_p, lowercase_cp);
|
|
}
|
|
|
|
return lowercase_cp;
|
|
}
|
|
}
|
|
|
|
return lit_search_in_conversion_table ((ecma_char_t) cp,
|
|
builder_p,
|
|
lit_unicode_lower_case_conversions,
|
|
lit_unicode_lower_case_conversion_counters);
|
|
#else /* !JERRY_UNICODE_CASE_CONVERSION */
|
|
if (builder_p != NULL)
|
|
{
|
|
ecma_stringbuilder_append_codepoint (builder_p, cp);
|
|
}
|
|
|
|
return cp;
|
|
#endif /* JERRY_UNICODE_CASE_CONVERSION */
|
|
} /* lit_char_to_lower_case */
|
|
|
|
/**
|
|
* Append the converted uppercase codeunit sequence of an a given codepoint into the stringbuilder if it is present.
|
|
*
|
|
* @return LIT_MULTIPLE_CU if the converted codepoint consist more than a single code unit
|
|
* converted code point - otherwise
|
|
*/
|
|
lit_code_point_t
|
|
lit_char_to_upper_case (lit_code_point_t cp, /**< code point */
|
|
ecma_stringbuilder_t *builder_p) /**< string builder */
|
|
{
|
|
if (cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
|
{
|
|
if (cp >= LIT_CHAR_LOWERCASE_A && cp <= LIT_CHAR_LOWERCASE_Z)
|
|
{
|
|
cp = (lit_utf8_byte_t) (cp - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
|
|
}
|
|
|
|
if (builder_p != NULL)
|
|
{
|
|
ecma_stringbuilder_append_byte (builder_p, (lit_utf8_byte_t) cp);
|
|
}
|
|
|
|
return cp;
|
|
}
|
|
|
|
#if JERRY_UNICODE_CASE_CONVERSION
|
|
lit_code_point_t uppercase_cp = lit_search_in_bidirectional_conversion_tables (cp, false);
|
|
|
|
if (uppercase_cp != LIT_INVALID_CP)
|
|
{
|
|
if (builder_p != NULL)
|
|
{
|
|
ecma_stringbuilder_append_codepoint (builder_p, uppercase_cp);
|
|
}
|
|
|
|
return uppercase_cp;
|
|
}
|
|
|
|
int num_of_upper_case_special_ranges = NUM_OF_ELEMENTS (lit_unicode_upper_case_special_ranges);
|
|
|
|
for (int i = 0, j = 0; i < num_of_upper_case_special_ranges; i += 3, j++)
|
|
{
|
|
uint32_t range_length = lit_unicode_upper_case_special_range_lengths[j];
|
|
ecma_char_t start_point = lit_unicode_upper_case_special_ranges[i];
|
|
|
|
if (start_point <= cp && cp <= start_point + range_length)
|
|
{
|
|
if (builder_p != NULL)
|
|
{
|
|
uppercase_cp = lit_unicode_upper_case_special_ranges[i + 1] + (cp - start_point);
|
|
ecma_stringbuilder_append_codepoint (builder_p, uppercase_cp);
|
|
ecma_stringbuilder_append_codepoint (builder_p, lit_unicode_upper_case_special_ranges[i + 2]);
|
|
}
|
|
|
|
return LIT_MULTIPLE_CU;
|
|
}
|
|
}
|
|
|
|
return lit_search_in_conversion_table ((ecma_char_t) cp,
|
|
builder_p,
|
|
lit_unicode_upper_case_conversions,
|
|
lit_unicode_upper_case_conversion_counters);
|
|
#else /* !JERRY_UNICODE_CASE_CONVERSION */
|
|
if (builder_p != NULL)
|
|
{
|
|
ecma_stringbuilder_append_codepoint (builder_p, cp);
|
|
}
|
|
|
|
return cp;
|
|
#endif /* JERRY_UNICODE_CASE_CONVERSION */
|
|
} /* lit_char_to_upper_case */
|
|
|
|
/*
|
|
* Look up whether the character should be folded to the lowercase variant.
|
|
*
|
|
* @return true, if character should be lowercased
|
|
* false, otherwise
|
|
*/
|
|
bool
|
|
lit_char_fold_to_lower (lit_code_point_t cp) /**< code point */
|
|
{
|
|
#if JERRY_UNICODE_CASE_CONVERSION
|
|
return (cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX || cp > LIT_UTF16_CODE_UNIT_MAX
|
|
|| (!lit_search_char_in_interval_array ((ecma_char_t) cp,
|
|
lit_unicode_folding_skip_to_lower_interval_starts,
|
|
lit_unicode_folding_skip_to_lower_interval_lengths,
|
|
NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_interval_starts))
|
|
&& !lit_search_char_in_array ((ecma_char_t) cp,
|
|
lit_unicode_folding_skip_to_lower_chars,
|
|
NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_chars))));
|
|
#else /* !JERRY_UNICODE_CASE_CONVERSION */
|
|
return true;
|
|
#endif /* JERRY_UNICODE_CASE_CONVERSION */
|
|
} /* lit_char_fold_to_lower */
|
|
|
|
/*
|
|
* Look up whether the character should be folded to the uppercase variant.
|
|
*
|
|
* @return true, if character should be uppercased
|
|
* false, otherwise
|
|
*/
|
|
bool
|
|
lit_char_fold_to_upper (lit_code_point_t cp) /**< code point */
|
|
{
|
|
#if JERRY_UNICODE_CASE_CONVERSION
|
|
return (cp > LIT_UTF8_1_BYTE_CODE_POINT_MAX && cp <= LIT_UTF16_CODE_UNIT_MAX
|
|
&& (lit_search_char_in_interval_array ((ecma_char_t) cp,
|
|
lit_unicode_folding_to_upper_interval_starts,
|
|
lit_unicode_folding_to_upper_interval_lengths,
|
|
NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_interval_starts))
|
|
|| lit_search_char_in_array ((ecma_char_t) cp,
|
|
lit_unicode_folding_to_upper_chars,
|
|
NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_chars))));
|
|
#else /* !JERRY_UNICODE_CASE_CONVERSION */
|
|
return false;
|
|
#endif /* JERRY_UNICODE_CASE_CONVERSION */
|
|
} /* lit_char_fold_to_upper */
|
|
|
|
/**
|
|
* Helper method to find a specific character in a string
|
|
*
|
|
* Used by:
|
|
* ecma_builtin_string_prototype_object_replace_helper
|
|
*
|
|
* @return true - if the given character is in the string
|
|
* false - otherwise
|
|
*/
|
|
bool
|
|
lit_find_char_in_string (ecma_string_t *str_p, /**< source string */
|
|
lit_utf8_byte_t c) /**< character to find*/
|
|
{
|
|
ECMA_STRING_TO_UTF8_STRING (str_p, start_p, start_size);
|
|
|
|
const lit_utf8_byte_t *str_curr_p = start_p;
|
|
const lit_utf8_byte_t *str_end_p = start_p + start_size;
|
|
bool have_char = false;
|
|
|
|
while (str_curr_p < str_end_p)
|
|
{
|
|
if (*str_curr_p++ == c)
|
|
{
|
|
have_char = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
ECMA_FINALIZE_UTF8_STRING (start_p, start_size);
|
|
|
|
return have_char;
|
|
} /* lit_find_char_in_string */
|