mirror of
https://github.com/jerryscript-project/jerryscript.git
synced 2025-12-15 16:29:21 +00:00
Language-sensitive mappings are not processed now. Fixes #323 JerryScript-DCO-1.0-Signed-off-by: Robert Sipka rsipka.uszeged@partner.samsung.com
763 lines
23 KiB
C
763 lines
23 KiB
C
/* Copyright JS Foundation and other contributors, http://js.foundation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "lit-char-helpers.h"
|
|
#include "lit/lit-unicode-ranges.inc.h"
|
|
#include "lit-strings.h"
|
|
|
|
#ifndef CONFIG_DISABLE_UNICODE_CASE_CONVERSION
|
|
#include "lit-unicode-conversions.inc.h"
|
|
#endif /* !CONFIG_DISABLE_UNICODE_CASE_CONVERSION */
|
|
|
|
/**
 * Number of elements of an array, computed as the size of the whole array
 * divided by the size of its first element.
 *
 * Note:
 *      The argument must be a real array object; for a pointer the sizeof
 *      based computation does not yield an element count.
 */
#define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof ((array)[0]))
|
|
|
|
/**
|
|
* Binary search algorithm that searches the a
|
|
* character in the given char array.
|
|
*
|
|
* @return true - if the character is in the given array
|
|
* false - otherwise
|
|
*/
|
|
static bool
|
|
search_char_in_char_array (ecma_char_t c, /**< code unit */
|
|
const ecma_char_t *array, /**< array */
|
|
int size_of_array) /**< length of the array */
|
|
{
|
|
int bottom = 0;
|
|
int top = size_of_array - 1;
|
|
|
|
while (bottom <= top)
|
|
{
|
|
int middle = (bottom + top) / 2;
|
|
ecma_char_t current = array[middle];
|
|
|
|
if (current == c)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
if (c < current)
|
|
{
|
|
top = middle - 1;
|
|
}
|
|
else
|
|
{
|
|
bottom = middle + 1;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
} /* search_char_in_char_array */
|
|
|
|
/**
|
|
* Binary search algorithm that searches a character in the given intervals.
|
|
* Intervals specifed by two arrays. The first one contains the starting points
|
|
* of the intervals, the second one contains the length of them.
|
|
*
|
|
* @return true - if the the character is included (inclusively) in one of the intervals in the given array
|
|
* false - otherwise
|
|
*/
|
|
static bool
|
|
search_char_in_interval_array (ecma_char_t c, /**< code unit */
|
|
const ecma_char_t *array_sp, /**< array of interval starting points */
|
|
const uint8_t *lengths, /**< array of interval lengths */
|
|
int size_of_array) /**< length of the array */
|
|
{
|
|
int bottom = 0;
|
|
int top = size_of_array - 1;
|
|
|
|
while (bottom <= top)
|
|
{
|
|
int middle = (bottom + top) / 2;
|
|
ecma_char_t current_sp = array_sp[middle];
|
|
|
|
if (current_sp <= c && c <= current_sp + lengths[middle])
|
|
{
|
|
return true;
|
|
}
|
|
|
|
if (c > current_sp)
|
|
{
|
|
bottom = middle + 1;
|
|
}
|
|
else
|
|
{
|
|
top = middle - 1;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
} /* search_char_in_interval_array */
|
|
|
|
/**
|
|
* Check if specified character is one of the Format-Control characters
|
|
*
|
|
* @return true - if the character is one of characters, listed in ECMA-262 v5, Table 1,
|
|
* false - otherwise.
|
|
*/
|
|
bool
|
|
lit_char_is_format_control (ecma_char_t c) /**< code unit */
|
|
{
|
|
return (c == LIT_CHAR_ZWNJ
|
|
|| c == LIT_CHAR_ZWJ
|
|
|| c == LIT_CHAR_BOM);
|
|
} /* lit_char_is_format_control */
|
|
|
|
/**
|
|
* Check if specified character is one of the Whitespace characters including those
|
|
* that fall into "Space, Separator" ("Zs") Unicode character category.
|
|
*
|
|
* @return true - if the character is one of characters, listed in ECMA-262 v5, Table 2,
|
|
* false - otherwise.
|
|
*/
|
|
bool
|
|
lit_char_is_white_space (ecma_char_t c) /**< code unit */
|
|
{
|
|
if (c <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
|
{
|
|
return (c == LIT_CHAR_TAB
|
|
|| c == LIT_CHAR_VTAB
|
|
|| c == LIT_CHAR_FF
|
|
|| c == LIT_CHAR_SP);
|
|
}
|
|
else
|
|
{
|
|
return (c == LIT_CHAR_NBSP
|
|
|| c == LIT_CHAR_BOM
|
|
|| (c >= unicode_separator_char_interv_sps[0]
|
|
&& c <= unicode_separator_char_interv_sps[0] + unicode_separator_char_interv_lens[0])
|
|
|| search_char_in_char_array (c, unicode_separator_chars, NUM_OF_ELEMENTS (unicode_separator_chars)));
|
|
}
|
|
} /* lit_char_is_white_space */
|
|
|
|
/**
|
|
* Check if specified character is one of LineTerminator characters
|
|
*
|
|
* @return true - if the character is one of characters, listed in ECMA-262 v5, Table 3,
|
|
* false - otherwise.
|
|
*/
|
|
bool
|
|
lit_char_is_line_terminator (ecma_char_t c) /**< code unit */
|
|
{
|
|
return (c == LIT_CHAR_LF
|
|
|| c == LIT_CHAR_CR
|
|
|| c == LIT_CHAR_LS
|
|
|| c == LIT_CHAR_PS);
|
|
} /* lit_char_is_line_terminator */
|
|
|
|
/**
|
|
* Check if specified character is a unicode letter
|
|
*
|
|
* Note:
|
|
* Unicode letter is a character, included into one of the following categories:
|
|
* - Uppercase letter (Lu);
|
|
* - Lowercase letter (Ll);
|
|
* - Titlecase letter (Lt);
|
|
* - Modifier letter (Lm);
|
|
* - Other letter (Lo);
|
|
* - Letter number (Nl).
|
|
*
|
|
* See also:
|
|
* ECMA-262 v5, 7.6
|
|
*
|
|
* @return true - if specified character falls into one of the listed categories,
|
|
* false - otherwise.
|
|
*/
|
|
static bool
|
|
lit_char_is_unicode_letter (ecma_char_t c) /**< code unit */
|
|
{
|
|
return (search_char_in_interval_array (c, unicode_letter_interv_sps, unicode_letter_interv_lens,
|
|
NUM_OF_ELEMENTS (unicode_letter_interv_sps))
|
|
|| search_char_in_char_array (c, unicode_letter_chars, NUM_OF_ELEMENTS (unicode_letter_chars)));
|
|
} /* lit_char_is_unicode_letter */
|
|
|
|
/**
|
|
* Check if specified character is a non-letter character and can be used as a
|
|
* non-first character of an identifier.
|
|
* These characters coverd by the following unicode categories:
|
|
* - digit (Nd)
|
|
* - punctuation mark (Mn, Mc)
|
|
* - connector punctuation (Pc)
|
|
*
|
|
* See also:
|
|
* ECMA-262 v5, 7.6
|
|
*
|
|
* @return true - if specified character falls into one of the listed categories,
|
|
* false - otherwise.
|
|
*/
|
|
static bool
|
|
lit_char_is_unicode_non_letter_ident_part (ecma_char_t c) /**< code unit */
|
|
{
|
|
return (search_char_in_interval_array (c, unicode_non_letter_ident_part_interv_sps,
|
|
unicode_non_letter_ident_part_interv_lens,
|
|
NUM_OF_ELEMENTS (unicode_non_letter_ident_part_interv_sps))
|
|
|| search_char_in_char_array (c, unicode_non_letter_ident_part_chars,
|
|
NUM_OF_ELEMENTS (unicode_non_letter_ident_part_chars)));
|
|
} /* lit_char_is_unicode_non_letter_ident_part */
|
|
|
|
/**
|
|
* Checks whether the next UTF8 character is a valid identifier start.
|
|
*
|
|
* @return true if it is.
|
|
*/
|
|
bool
|
|
lit_char_is_identifier_start (const uint8_t *src_p) /**< pointer to a vaild UTF8 character */
|
|
{
|
|
if (*src_p <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
|
{
|
|
return lit_char_is_identifier_start_character (*src_p);
|
|
}
|
|
|
|
return lit_char_is_identifier_start_character (lit_utf8_peek_next (src_p));
|
|
} /* lit_char_is_identifier_start */
|
|
|
|
/**
|
|
* Checks whether the character is a valid identifier start.
|
|
*
|
|
* @return true if it is.
|
|
*/
|
|
bool
|
|
lit_char_is_identifier_start_character (uint16_t chr) /**< EcmaScript character */
|
|
{
|
|
/* Fast path for ASCII-defined letters. */
|
|
if (chr <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
|
{
|
|
return ((LEXER_TO_ASCII_LOWERCASE (chr) >= LIT_CHAR_LOWERCASE_A
|
|
&& LEXER_TO_ASCII_LOWERCASE (chr) <= LIT_CHAR_LOWERCASE_Z)
|
|
|| chr == LIT_CHAR_DOLLAR_SIGN
|
|
|| chr == LIT_CHAR_UNDERSCORE);
|
|
}
|
|
|
|
return lit_char_is_unicode_letter (chr);
|
|
} /* lit_char_is_identifier_start_character */
|
|
|
|
/**
|
|
* Checks whether the next UTF8 character is a valid identifier part.
|
|
*
|
|
* @return true if it is.
|
|
*/
|
|
bool
|
|
lit_char_is_identifier_part (const uint8_t *src_p) /**< pointer to a vaild UTF8 character */
|
|
{
|
|
if (*src_p <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
|
{
|
|
return lit_char_is_identifier_part_character (*src_p);
|
|
}
|
|
|
|
return lit_char_is_identifier_part_character (lit_utf8_peek_next (src_p));
|
|
} /* lit_char_is_identifier_part */
|
|
|
|
/**
|
|
* Checks whether the character is a valid identifier part.
|
|
*
|
|
* @return true if it is.
|
|
*/
|
|
bool
|
|
lit_char_is_identifier_part_character (uint16_t chr) /**< EcmaScript character */
|
|
{
|
|
/* Fast path for ASCII-defined letters. */
|
|
if (chr <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
|
{
|
|
return ((LEXER_TO_ASCII_LOWERCASE (chr) >= LIT_CHAR_LOWERCASE_A
|
|
&& LEXER_TO_ASCII_LOWERCASE (chr) <= LIT_CHAR_LOWERCASE_Z)
|
|
|| (chr >= LIT_CHAR_0 && chr <= LIT_CHAR_9)
|
|
|| chr == LIT_CHAR_DOLLAR_SIGN
|
|
|| chr == LIT_CHAR_UNDERSCORE);
|
|
}
|
|
|
|
return (lit_char_is_unicode_letter (chr)
|
|
|| lit_char_is_unicode_non_letter_ident_part (chr));
|
|
} /* lit_char_is_identifier_part_character */
|
|
|
|
/**
|
|
* Check if specified character is one of OctalDigit characters (ECMA-262 v5, B.1.2)
|
|
*
|
|
* @return true / false
|
|
*/
|
|
bool
|
|
lit_char_is_octal_digit (ecma_char_t c) /**< code unit */
|
|
{
|
|
return (c >= LIT_CHAR_ASCII_OCTAL_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_OCTAL_DIGITS_END);
|
|
} /* lit_char_is_octal_digit */
|
|
|
|
/**
|
|
* Check if specified character is one of DecimalDigit characters (ECMA-262 v5, 7.8.3)
|
|
*
|
|
* @return true / false
|
|
*/
|
|
bool
|
|
lit_char_is_decimal_digit (ecma_char_t c) /**< code unit */
|
|
{
|
|
return (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END);
|
|
} /* lit_char_is_decimal_digit */
|
|
|
|
/**
|
|
* Check if specified character is one of HexDigit characters (ECMA-262 v5, 7.8.3)
|
|
*
|
|
* @return true / false
|
|
*/
|
|
bool
|
|
lit_char_is_hex_digit (ecma_char_t c) /**< code unit */
|
|
{
|
|
return ((c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
|
|
|| (LEXER_TO_ASCII_LOWERCASE (c) >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
|
|
&& LEXER_TO_ASCII_LOWERCASE (c) <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END));
|
|
} /* lit_char_is_hex_digit */
|
|
|
|
/**
|
|
* Convert a HexDigit character to its numeric value, as defined in ECMA-262 v5, 7.8.3
|
|
*
|
|
* @return digit value, corresponding to the hex char
|
|
*/
|
|
uint32_t
|
|
lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to
|
|
* one of HexDigit characters */
|
|
{
|
|
JERRY_ASSERT (lit_char_is_hex_digit (c));
|
|
|
|
if (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
|
|
{
|
|
return (uint32_t) (c - LIT_CHAR_ASCII_DIGITS_BEGIN);
|
|
}
|
|
else if (c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
|
|
{
|
|
return (uint32_t) (c - LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN + 10);
|
|
}
|
|
else
|
|
{
|
|
return (uint32_t) (c - LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN + 10);
|
|
}
|
|
} /* lit_char_hex_to_int */
|
|
|
|
/**
|
|
* Converts a character to UTF8 bytes.
|
|
*
|
|
* @return length of the UTF8 representation.
|
|
*/
|
|
size_t
|
|
lit_char_to_utf8_bytes (uint8_t *dst_p, /**< destination buffer */
|
|
ecma_char_t chr) /**< EcmaScript character */
|
|
{
|
|
if (!(chr & ~LIT_UTF8_1_BYTE_CODE_POINT_MAX))
|
|
{
|
|
/* 00000000 0xxxxxxx -> 0xxxxxxx */
|
|
*dst_p = (uint8_t) chr;
|
|
return 1;
|
|
}
|
|
|
|
if (!(chr & ~LIT_UTF8_2_BYTE_CODE_POINT_MAX))
|
|
{
|
|
/* 00000yyy yyxxxxxx -> 110yyyyy 10xxxxxx */
|
|
*(dst_p++) = (uint8_t) (LIT_UTF8_2_BYTE_MARKER | ((chr >> 6) & LIT_UTF8_LAST_5_BITS_MASK));
|
|
*dst_p = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (chr & LIT_UTF8_LAST_6_BITS_MASK));
|
|
return 2;
|
|
}
|
|
|
|
JERRY_ASSERT (!(chr & ~LIT_UTF8_3_BYTE_CODE_POINT_MAX));
|
|
/* zzzzyyyy yyxxxxxx -> 1110zzzz 10yyyyyy 10xxxxxx */
|
|
*(dst_p++) = (uint8_t) (LIT_UTF8_3_BYTE_MARKER | ((chr >> 12) & LIT_UTF8_LAST_4_BITS_MASK));
|
|
*(dst_p++) = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | ((chr >> 6) & LIT_UTF8_LAST_6_BITS_MASK));
|
|
*dst_p = (uint8_t) (LIT_UTF8_EXTRA_BYTE_MARKER | (chr & LIT_UTF8_LAST_6_BITS_MASK));
|
|
return 3;
|
|
} /* lit_char_to_utf8_bytes */
|
|
|
|
/**
|
|
* Returns the length of the UTF8 representation of a character.
|
|
*
|
|
* @return length of the UTF8 representation.
|
|
*/
|
|
size_t
|
|
lit_char_get_utf8_length (ecma_char_t chr) /**< EcmaScript character */
|
|
{
|
|
if (!(chr & ~LIT_UTF8_1_BYTE_CODE_POINT_MAX))
|
|
{
|
|
/* 00000000 0xxxxxxx */
|
|
return 1;
|
|
}
|
|
|
|
if (!(chr & ~LIT_UTF8_2_BYTE_CODE_POINT_MAX))
|
|
{
|
|
/* 00000yyy yyxxxxxx */
|
|
return 2;
|
|
}
|
|
|
|
/* zzzzyyyy yyxxxxxx */
|
|
JERRY_ASSERT (!(chr & ~LIT_UTF8_3_BYTE_CODE_POINT_MAX));
|
|
return 3;
|
|
} /* lit_char_get_utf8_length */
|
|
|
|
/**
|
|
* Parse the next number_of_characters hexadecimal character,
|
|
* and construct a code unit from them. The buffer must
|
|
* be zero terminated.
|
|
*
|
|
* @return true if decoding was successful, false otherwise
|
|
*/
|
|
bool
|
|
lit_read_code_unit_from_hex (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
|
|
lit_utf8_size_t number_of_characters, /**< number of characters to be read */
|
|
ecma_char_ptr_t out_code_unit_p) /**< [out] decoded result */
|
|
{
|
|
ecma_char_t code_unit = LIT_CHAR_NULL;
|
|
|
|
JERRY_ASSERT (number_of_characters >= 2 && number_of_characters <= 4);
|
|
|
|
for (lit_utf8_size_t i = 0; i < number_of_characters; i++)
|
|
{
|
|
code_unit = (ecma_char_t) (code_unit << 4u);
|
|
|
|
if (*buf_p >= LIT_CHAR_ASCII_DIGITS_BEGIN
|
|
&& *buf_p <= LIT_CHAR_ASCII_DIGITS_END)
|
|
{
|
|
code_unit |= (ecma_char_t) (*buf_p - LIT_CHAR_ASCII_DIGITS_BEGIN);
|
|
}
|
|
else if (*buf_p >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
|
|
&& *buf_p <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
|
|
{
|
|
code_unit |= (ecma_char_t) (*buf_p - (LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN - 10));
|
|
}
|
|
else if (*buf_p >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN
|
|
&& *buf_p <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_END)
|
|
{
|
|
code_unit |= (ecma_char_t) (*buf_p - (LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN - 10));
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
|
|
buf_p++;
|
|
}
|
|
|
|
*out_code_unit_p = code_unit;
|
|
return true;
|
|
} /* lit_read_code_unit_from_hex */
|
|
|
|
/**
|
|
* Check if specified character is a word character (part of IsWordChar abstract operation)
|
|
*
|
|
* See also: ECMA-262 v5, 15.10.2.6 (IsWordChar)
|
|
*
|
|
* @return true - if the character is a word character
|
|
* false - otherwise.
|
|
*/
|
|
bool
|
|
lit_char_is_word_char (ecma_char_t c) /**< code unit */
|
|
{
|
|
return ((c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END)
|
|
|| (c >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_BEGIN && c <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_END)
|
|
|| (c >= LIT_CHAR_ASCII_DIGITS_BEGIN && c <= LIT_CHAR_ASCII_DIGITS_END)
|
|
|| c == LIT_CHAR_UNDERSCORE);
|
|
} /* lit_char_is_word_char */
|
|
|
|
#ifndef CONFIG_DISABLE_UNICODE_CASE_CONVERSION
|
|
|
|
/**
|
|
* Check if the specified character is in one of those tables which contain bidirectional conversions.
|
|
*
|
|
* @return the mapped character sequence of an ecma character, if it's in the table.
|
|
* 0 - otherwise.
|
|
*/
|
|
static ecma_length_t
|
|
search_in_bidirectional_conversion_tables (ecma_char_t character, /**< code unit */
|
|
ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */
|
|
bool is_lowercase) /**< is lowercase conversion */
|
|
{
|
|
/* 1, Check if the specified character is part of the jerry_character_case_ranges table. */
|
|
int number_of_case_ranges = NUM_OF_ELEMENTS (jerry_character_case_ranges);
|
|
int conv_counter = 0;
|
|
|
|
for (int i = 0; i < number_of_case_ranges; i++)
|
|
{
|
|
if (i % 2 == 0 && i > 0)
|
|
{
|
|
conv_counter++;
|
|
}
|
|
|
|
int range_length = jerry_character_case_range_lengths[conv_counter];
|
|
ecma_char_t start_point = jerry_character_case_ranges[i];
|
|
|
|
if (start_point > character || character >= start_point + range_length)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
int char_dist = character - start_point;
|
|
|
|
if (i % 2 == 0)
|
|
{
|
|
output_buffer_p[0] = is_lowercase ? (ecma_char_t) (jerry_character_case_ranges[i + 1] + char_dist) : character;
|
|
}
|
|
else
|
|
{
|
|
output_buffer_p[0] = is_lowercase ? character : (ecma_char_t) (jerry_character_case_ranges[i - 1] + char_dist);
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
/* 2, Check if the specified character is part of the character_pair_ranges table. */
|
|
int bottom = 0;
|
|
int top = NUM_OF_ELEMENTS (jerry_character_pair_ranges) - 1;
|
|
|
|
while (bottom <= top)
|
|
{
|
|
int middle = (bottom + top) / 2;
|
|
ecma_char_t current_sp = jerry_character_pair_ranges[middle];
|
|
|
|
if (current_sp <= character && character < current_sp + jerry_character_pair_range_lengths[middle])
|
|
{
|
|
int char_dist = character - current_sp;
|
|
|
|
if ((character - current_sp) % 2 == 0)
|
|
{
|
|
output_buffer_p[0] = is_lowercase ? (ecma_char_t) (current_sp + char_dist + 1) : character;
|
|
}
|
|
else
|
|
{
|
|
output_buffer_p[0] = is_lowercase ? character : (ecma_char_t) (current_sp + char_dist - 1);
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
if (character > current_sp)
|
|
{
|
|
bottom = middle + 1;
|
|
}
|
|
else
|
|
{
|
|
top = middle - 1;
|
|
}
|
|
}
|
|
|
|
/* 3, Check if the specified character is part of the character_pairs table. */
|
|
int number_of_character_pairs = NUM_OF_ELEMENTS (jerry_character_pairs);
|
|
|
|
for (int i = 0; i < number_of_character_pairs; i++)
|
|
{
|
|
if (character != jerry_character_pairs[i])
|
|
{
|
|
continue;
|
|
}
|
|
|
|
if (i % 2 == 0)
|
|
{
|
|
output_buffer_p[0] = is_lowercase ? jerry_character_pairs[i + 1] : character;
|
|
}
|
|
else
|
|
{
|
|
output_buffer_p[0] = is_lowercase ? character : jerry_character_pairs[i - 1];
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
} /* search_in_bidirectional_conversion_tables */
|
|
|
|
/**
|
|
* Check if the specified character is in the given conversion table.
|
|
*
|
|
* @return the mapped character sequence of an ecma character, if it's in the table.
|
|
* 0 - otherwise.
|
|
*/
|
|
static ecma_length_t
|
|
search_in_conversion_table (ecma_char_t character, /**< code unit */
|
|
ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */
|
|
const ecma_char_t *array, /**< array */
|
|
const uint8_t *counters) /**< case_values counter */
|
|
{
|
|
int end_point = 0;
|
|
|
|
for (int i = 0; i < 3; i++)
|
|
{
|
|
int start_point = end_point;
|
|
int size_of_case_value = i + 1;
|
|
end_point += counters[i] * (size_of_case_value + 1);
|
|
|
|
int bottom = start_point;
|
|
int top = end_point - size_of_case_value;
|
|
|
|
while (bottom <= top)
|
|
{
|
|
int middle = (bottom + top) / 2;
|
|
|
|
middle -= ((middle - bottom) % (size_of_case_value + 1));
|
|
|
|
ecma_char_t current = array[middle];
|
|
|
|
if (current == character)
|
|
{
|
|
ecma_length_t char_sequence = 1;
|
|
|
|
switch (size_of_case_value)
|
|
{
|
|
case 3:
|
|
{
|
|
output_buffer_p[2] = array[middle + 3];
|
|
char_sequence++;
|
|
/* FALLTHRU */
|
|
}
|
|
case 2:
|
|
{
|
|
output_buffer_p[1] = array[middle + 2];
|
|
char_sequence++;
|
|
/* FALLTHRU */
|
|
}
|
|
default:
|
|
{
|
|
output_buffer_p[0] = array[middle + 1];
|
|
return char_sequence;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (character < current)
|
|
{
|
|
top = middle - (size_of_case_value + 1);
|
|
}
|
|
else
|
|
{
|
|
bottom = middle + (size_of_case_value + 1);
|
|
}
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
} /* search_in_conversion_table */
|
|
#endif /* !CONFIG_DISABLE_UNICODE_CASE_CONVERSION */
|
|
|
|
/**
|
|
* Returns the lowercase character sequence of an ecma character.
|
|
*
|
|
* Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters.
|
|
*
|
|
* @return the length of the lowercase character sequence
|
|
* which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH.
|
|
*/
|
|
ecma_length_t
|
|
lit_char_to_lower_case (ecma_char_t character, /**< input character value */
|
|
ecma_char_t *output_buffer_p, /**< [out] buffer for the result characters */
|
|
ecma_length_t buffer_size) /**< buffer size */
|
|
{
|
|
JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH);
|
|
|
|
if (character >= LIT_CHAR_UPPERCASE_A && character <= LIT_CHAR_UPPERCASE_Z)
|
|
{
|
|
output_buffer_p[0] = (ecma_char_t) (character + (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
|
|
return 1;
|
|
}
|
|
|
|
#ifndef CONFIG_DISABLE_UNICODE_CASE_CONVERSION
|
|
|
|
ecma_length_t lowercase_sequence = search_in_bidirectional_conversion_tables (character, output_buffer_p, true);
|
|
|
|
if (lowercase_sequence != 0)
|
|
{
|
|
return lowercase_sequence;
|
|
}
|
|
|
|
int num_of_lowercase_ranges = NUM_OF_ELEMENTS (jerry_lower_case_ranges);
|
|
|
|
for (int i = 0, j = 0; i < num_of_lowercase_ranges; i += 2, j++)
|
|
{
|
|
int range_length = jerry_lower_case_range_lengths[j] - 1;
|
|
ecma_char_t start_point = jerry_lower_case_ranges[i];
|
|
|
|
if (start_point <= character && character <= start_point + range_length)
|
|
{
|
|
output_buffer_p[0] = (ecma_char_t) (jerry_lower_case_ranges[i + 1] + (character - start_point));
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
lowercase_sequence = search_in_conversion_table (character,
|
|
output_buffer_p,
|
|
jerry_lower_case_conversions,
|
|
jerry_lower_case_conversion_counters);
|
|
|
|
if (lowercase_sequence != 0)
|
|
{
|
|
return lowercase_sequence;
|
|
}
|
|
|
|
#endif /* !CONFIG_DISABLE_UNICODE_CASE_CONVERSION */
|
|
|
|
output_buffer_p[0] = character;
|
|
return 1;
|
|
} /* lit_char_to_lower_case */
|
|
|
|
/**
|
|
* Returns the uppercase character sequence of an ecma character.
|
|
*
|
|
* Note: output_buffer_p must be able to hold at least LIT_MAXIMUM_OTHER_CASE_LENGTH characters.
|
|
*
|
|
* @return the length of the uppercase character sequence
|
|
* which is always between 1 and LIT_MAXIMUM_OTHER_CASE_LENGTH.
|
|
*/
|
|
ecma_length_t
|
|
lit_char_to_upper_case (ecma_char_t character, /**< input character value */
|
|
ecma_char_t *output_buffer_p, /**< buffer for the result characters */
|
|
ecma_length_t buffer_size) /**< buffer size */
|
|
{
|
|
JERRY_ASSERT (buffer_size >= LIT_MAXIMUM_OTHER_CASE_LENGTH);
|
|
|
|
if (character >= LIT_CHAR_LOWERCASE_A && character <= LIT_CHAR_LOWERCASE_Z)
|
|
{
|
|
output_buffer_p[0] = (ecma_char_t) (character - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
|
|
return 1;
|
|
}
|
|
|
|
#ifndef CONFIG_DISABLE_UNICODE_CASE_CONVERSION
|
|
|
|
ecma_length_t uppercase_sequence = search_in_bidirectional_conversion_tables (character, output_buffer_p, false);
|
|
|
|
if (uppercase_sequence != 0)
|
|
{
|
|
return uppercase_sequence;
|
|
}
|
|
|
|
int num_of_upper_case_special_ranges = NUM_OF_ELEMENTS (jerry_upper_case_special_ranges);
|
|
|
|
for (int i = 0, j = 0; i < num_of_upper_case_special_ranges; i += 3, j++)
|
|
{
|
|
int range_length = jerry_upper_case_special_range_lengths[j];
|
|
ecma_char_t start_point = jerry_upper_case_special_ranges[i];
|
|
|
|
if (start_point <= character && character <= start_point + range_length)
|
|
{
|
|
output_buffer_p[0] = (ecma_char_t) (jerry_upper_case_special_ranges[i + 1] + (character - start_point));
|
|
output_buffer_p[1] = (ecma_char_t) (jerry_upper_case_special_ranges[i + 2]);
|
|
return 2;
|
|
}
|
|
}
|
|
|
|
uppercase_sequence = search_in_conversion_table (character,
|
|
output_buffer_p,
|
|
jerry_upper_case_conversions,
|
|
jerry_upper_case_conversion_counters);
|
|
|
|
if (uppercase_sequence != 0)
|
|
{
|
|
return uppercase_sequence;
|
|
}
|
|
|
|
#endif /* !CONFIG_DISABLE_UNICODE_CASE_CONVERSION */
|
|
|
|
output_buffer_p[0] = character;
|
|
return 1;
|
|
} /* lit_char_to_upper_case */
|