mirror of
https://github.com/jerryscript-project/jerryscript.git
synced 2025-12-15 16:29:21 +00:00
Change internal encoding of strings to CESU-8
JerryScript-DCO-1.0-Signed-off-by: Zsolt Borbély <zsborbely.u-szeged@partner.samsung.com>
JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai <dbatyai.u-szeged@partner.samsung.com>
This commit is contained in:
parent
08c618e8c5
commit
dcd610b305
@ -414,7 +414,7 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
|
||||
lit_utf8_size_t string_size) /**< string size */
|
||||
{
|
||||
JERRY_ASSERT (string_p != NULL || string_size == 0);
|
||||
JERRY_ASSERT (lit_is_utf8_string_valid (string_p, string_size));
|
||||
JERRY_ASSERT (lit_is_cesu8_string_valid (string_p, string_size));
|
||||
|
||||
lit_magic_string_id_t magic_string_id;
|
||||
if (lit_is_utf8_string_magic (string_p, string_size, &magic_string_id))
|
||||
@ -444,7 +444,7 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
|
||||
} /* ecma_new_ecma_string_from_utf8 */
|
||||
|
||||
/**
|
||||
* Allocate new ecma-string and fill it with utf-8 character which represents specified code unit
|
||||
* Allocate new ecma-string and fill it with cesu-8 character which represents specified code unit
|
||||
*
|
||||
* @return pointer to ecma-string descriptor
|
||||
*/
|
||||
@ -627,14 +627,7 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */
|
||||
jerry_fatal (ERR_OUT_OF_MEMORY);
|
||||
}
|
||||
|
||||
ecma_char_t str1_last_code_unit = ecma_string_get_char_at_pos (string1_p, ecma_string_get_length (string1_p) - 1);
|
||||
ecma_char_t str2_first_code_unit = ecma_string_get_char_at_pos (string2_p, 0);
|
||||
|
||||
bool is_surrogate_pair_sliced = (lit_is_code_unit_high_surrogate (str1_last_code_unit)
|
||||
&& lit_is_code_unit_low_surrogate (str2_first_code_unit));
|
||||
|
||||
lit_utf8_size_t buffer_size = str1_size + str2_size - (lit_utf8_size_t) (is_surrogate_pair_sliced ?
|
||||
LIT_UTF8_CESU8_SURROGATE_SIZE_DIF : 0);
|
||||
lit_utf8_size_t buffer_size = str1_size + str2_size;
|
||||
|
||||
lit_utf8_byte_t *str_p = (lit_utf8_byte_t *) mem_heap_alloc_block (buffer_size, MEM_HEAP_ALLOC_SHORT_TERM);
|
||||
|
||||
@ -643,23 +636,9 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */
|
||||
bytes_copied1 = ecma_string_to_utf8_string (string1_p, str_p, (ssize_t) str1_size);
|
||||
JERRY_ASSERT (bytes_copied1 > 0);
|
||||
|
||||
if (!is_surrogate_pair_sliced)
|
||||
{
|
||||
bytes_copied2 = ecma_string_to_utf8_string (string2_p, str_p + str1_size, (ssize_t) str2_size);
|
||||
JERRY_ASSERT (bytes_copied2 > 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
bytes_copied2 = ecma_string_to_utf8_string (string2_p,
|
||||
str_p + str1_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT + 1,
|
||||
(ssize_t) buffer_size - bytes_copied1
|
||||
+ LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
|
||||
JERRY_ASSERT (bytes_copied2 > 0);
|
||||
bytes_copied2 = ecma_string_to_utf8_string (string2_p, str_p + str1_size, (ssize_t) str2_size);
|
||||
JERRY_ASSERT (bytes_copied2 > 0);
|
||||
|
||||
lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (str1_last_code_unit,
|
||||
str2_first_code_unit);
|
||||
lit_code_point_to_utf8 (surrogate_code_point, str_p + str1_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
|
||||
}
|
||||
ecma_string_t *str_concat_p = ecma_new_ecma_string_from_utf8 (str_p, buffer_size);
|
||||
|
||||
mem_heap_free_block ((void*) str_p);
|
||||
@ -955,7 +934,7 @@ ecma_string_get_array_index (const ecma_string_t *str_p, /**< ecma-string */
|
||||
} /* ecma_string_is_array_index */
|
||||
|
||||
/**
|
||||
* Convert ecma-string's contents to a utf-8 string and put it to the buffer.
|
||||
* Convert ecma-string's contents to a cesu-8 string and put it to the buffer.
|
||||
*
|
||||
* @return number of bytes, actually copied to the buffer - if string's content was copied successfully;
|
||||
* otherwise (in case size of buffer is insufficient) - negative number, which is calculated
|
||||
@ -1018,7 +997,6 @@ ecma_string_to_utf8_string (const ecma_string_t *string_desc_p, /**< ecma-string
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
case ECMA_STRING_CONTAINER_MAGIC_STRING:
|
||||
{
|
||||
const lit_magic_string_id_t id = string_desc_p->u.magic_string_id;
|
||||
@ -1491,7 +1469,7 @@ ecma_string_get_char_at_pos (const ecma_string_t *string_p, /**< ecma-string */
|
||||
ssize_t sz = ecma_string_to_utf8_string (string_p, utf8_str_p, (ssize_t) buffer_size);
|
||||
JERRY_ASSERT (sz > 0);
|
||||
|
||||
ch = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, index);;
|
||||
ch = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, index);
|
||||
|
||||
MEM_FINALIZE_LOCAL_ARRAY (utf8_str_p);
|
||||
|
||||
@ -1682,10 +1660,7 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
|
||||
JERRY_ASSERT (end_pos <= string_length);
|
||||
#endif
|
||||
|
||||
const ecma_length_t span = (start_pos > end_pos) ? 0 : end_pos - start_pos;
|
||||
const lit_utf8_size_t utf8_str_size = LIT_UTF8_MAX_BYTES_IN_CODE_UNIT * span;
|
||||
|
||||
if (utf8_str_size)
|
||||
if (start_pos < end_pos)
|
||||
{
|
||||
/**
|
||||
* I. Dump original string to plain buffer
|
||||
@ -1701,20 +1676,22 @@ ecma_string_substr (const ecma_string_t *string_p, /**< pointer to an ecma strin
|
||||
/**
|
||||
* II. Extract substring
|
||||
*/
|
||||
MEM_DEFINE_LOCAL_ARRAY (utf8_substr_buffer, utf8_str_size, lit_utf8_byte_t);
|
||||
lit_utf8_byte_t *start_p = utf8_str_p;
|
||||
end_pos -= start_pos;
|
||||
|
||||
lit_utf8_size_t utf8_substr_buffer_offset = 0;
|
||||
for (ecma_length_t idx = 0; idx < span; idx++)
|
||||
while (start_pos--)
|
||||
{
|
||||
ecma_char_t code_unit = lit_utf8_string_code_unit_at (utf8_str_p, buffer_size, start_pos + idx);
|
||||
|
||||
JERRY_ASSERT (utf8_str_size >= utf8_substr_buffer_offset + LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
|
||||
utf8_substr_buffer_offset += lit_code_unit_to_utf8 (code_unit, utf8_substr_buffer + utf8_substr_buffer_offset);
|
||||
start_p += lit_get_unicode_char_size_by_utf8_first_byte (*start_p);
|
||||
}
|
||||
|
||||
ecma_string_p = ecma_new_ecma_string_from_utf8 (utf8_substr_buffer, utf8_substr_buffer_offset);
|
||||
lit_utf8_byte_t *end_p = start_p;
|
||||
while (end_pos--)
|
||||
{
|
||||
end_p += lit_get_unicode_char_size_by_utf8_first_byte (*end_p);
|
||||
}
|
||||
|
||||
ecma_string_p = ecma_new_ecma_string_from_utf8 (start_p, (lit_utf8_size_t) (end_p - start_p));
|
||||
|
||||
MEM_FINALIZE_LOCAL_ARRAY (utf8_substr_buffer);
|
||||
MEM_FINALIZE_LOCAL_ARRAY (utf8_str_p);
|
||||
|
||||
return ecma_string_p;
|
||||
@ -1746,47 +1723,47 @@ ecma_string_trim (const ecma_string_t *string_p) /**< pointer to an ecma string
|
||||
ssize_t sz = ecma_string_to_utf8_string (string_p, utf8_str_p, (ssize_t) buffer_size);
|
||||
JERRY_ASSERT (sz >= 0);
|
||||
|
||||
lit_utf8_iterator_t front = lit_utf8_iterator_create (utf8_str_p, buffer_size);
|
||||
|
||||
lit_utf8_iterator_t back = lit_utf8_iterator_create (utf8_str_p, buffer_size);
|
||||
lit_utf8_iterator_seek_eos (&back);
|
||||
|
||||
lit_utf8_iterator_pos_t start = lit_utf8_iterator_get_pos (&back);
|
||||
lit_utf8_iterator_pos_t end = lit_utf8_iterator_get_pos (&front);
|
||||
|
||||
ecma_char_t current;
|
||||
ecma_char_t ch;
|
||||
lit_utf8_size_t read_size;
|
||||
lit_utf8_byte_t *nonws_start_p = utf8_str_p + buffer_size;
|
||||
lit_utf8_byte_t *current_p = utf8_str_p;
|
||||
|
||||
/* Trim front. */
|
||||
while (!lit_utf8_iterator_is_eos (&front))
|
||||
while (current_p < nonws_start_p)
|
||||
{
|
||||
current = lit_utf8_iterator_read_next (&front);
|
||||
if (!lit_char_is_white_space (current)
|
||||
&& !lit_char_is_line_terminator (current))
|
||||
read_size = lit_read_code_unit_from_utf8 (current_p, &ch);
|
||||
|
||||
if (!lit_char_is_white_space (ch)
|
||||
&& !lit_char_is_line_terminator (ch))
|
||||
{
|
||||
lit_utf8_iterator_decr (&front);
|
||||
start = lit_utf8_iterator_get_pos (&front);
|
||||
nonws_start_p = current_p;
|
||||
break;
|
||||
}
|
||||
|
||||
current_p += read_size;
|
||||
}
|
||||
|
||||
current_p = utf8_str_p + buffer_size;
|
||||
|
||||
/* Trim back. */
|
||||
while (!lit_utf8_iterator_is_bos (&back))
|
||||
while (current_p > utf8_str_p)
|
||||
{
|
||||
current = lit_utf8_iterator_read_prev (&back);
|
||||
if (!lit_char_is_white_space (current)
|
||||
&& !lit_char_is_line_terminator (current))
|
||||
read_size = lit_read_prev_code_unit_from_utf8 (current_p, &ch);
|
||||
|
||||
if (!lit_char_is_white_space (ch)
|
||||
&& !lit_char_is_line_terminator (ch))
|
||||
{
|
||||
lit_utf8_iterator_incr (&back);
|
||||
end = lit_utf8_iterator_get_pos (&back);
|
||||
break;
|
||||
}
|
||||
|
||||
current_p -= read_size;
|
||||
}
|
||||
|
||||
/* Construct new string. */
|
||||
if (end.offset > start.offset)
|
||||
if (current_p > nonws_start_p)
|
||||
{
|
||||
ret_string_p = ecma_new_ecma_string_from_utf8 (utf8_str_p + start.offset,
|
||||
(lit_utf8_size_t) (end.offset - start.offset));
|
||||
ret_string_p = ecma_new_ecma_string_from_utf8 (nonws_start_p,
|
||||
(lit_utf8_size_t) (current_p - nonws_start_p));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@ -97,11 +97,13 @@ ecma_builtin_function_helper_get_arguments (const ecma_value_t *arguments_list_p
|
||||
ssize_t sz = ecma_string_to_utf8_string (str_p, start_p, (ssize_t) str_size);
|
||||
JERRY_ASSERT (sz >= 0);
|
||||
|
||||
lit_utf8_iterator_t iter = lit_utf8_iterator_create (start_p, str_size);
|
||||
lit_utf8_byte_t *current_p = start_p;
|
||||
const lit_utf8_byte_t *string_end_p = start_p + str_size;
|
||||
|
||||
while (!lit_utf8_iterator_is_eos (&iter))
|
||||
while (current_p < string_end_p)
|
||||
{
|
||||
ecma_char_t current_char = lit_utf8_iterator_read_next (&iter);
|
||||
ecma_char_t current_char;
|
||||
current_p += lit_read_code_unit_from_utf8 (current_p, &current_char);
|
||||
|
||||
if (current_char == ',')
|
||||
{
|
||||
@ -197,33 +199,36 @@ ecma_builtin_function_dispatch_construct (const ecma_value_t *arguments_list_p,
|
||||
ssize_t sz = ecma_string_to_utf8_string (arguments_str_p, start_p, (ssize_t) str_size);
|
||||
JERRY_ASSERT (sz >= 0);
|
||||
|
||||
lit_utf8_iterator_t iter = lit_utf8_iterator_create (start_p, str_size);
|
||||
ecma_length_t last_separator = lit_utf8_iterator_get_index (&iter);
|
||||
ecma_length_t end_position;
|
||||
lit_utf8_byte_t *current_p = start_p;
|
||||
lit_utf8_byte_t *last_separator = start_p;
|
||||
lit_utf8_byte_t *end_position;
|
||||
const lit_utf8_byte_t *string_end_p = start_p + str_size;
|
||||
ecma_string_t *param_str_p;
|
||||
|
||||
while (!lit_utf8_iterator_is_eos (&iter))
|
||||
while (current_p < string_end_p)
|
||||
{
|
||||
ecma_char_t current_char = lit_utf8_iterator_read_next (&iter);
|
||||
ecma_char_t current_char;
|
||||
lit_utf8_size_t read_size = lit_read_code_unit_from_utf8 (current_p, &current_char);
|
||||
|
||||
if (current_char == ',')
|
||||
{
|
||||
lit_utf8_iterator_decr (&iter);
|
||||
end_position = lit_utf8_iterator_get_index (&iter);
|
||||
end_position = current_p;
|
||||
|
||||
param_str_p = ecma_string_substr (arguments_str_p, last_separator, end_position);
|
||||
param_str_p = ecma_new_ecma_string_from_utf8 (last_separator,
|
||||
(lit_utf8_size_t) (end_position - last_separator));
|
||||
string_params_p[params_count] = ecma_string_trim (param_str_p);
|
||||
ecma_deref_ecma_string (param_str_p);
|
||||
|
||||
lit_utf8_iterator_incr (&iter);
|
||||
last_separator = lit_utf8_iterator_get_index (&iter);
|
||||
|
||||
last_separator = current_p + read_size;
|
||||
params_count++;
|
||||
}
|
||||
|
||||
current_p += read_size;
|
||||
}
|
||||
|
||||
end_position = lit_utf8_string_length (start_p, str_size);
|
||||
param_str_p = ecma_string_substr (arguments_str_p, last_separator, end_position);
|
||||
end_position = (lit_utf8_byte_t *) string_end_p;
|
||||
param_str_p = ecma_new_ecma_string_from_utf8 (last_separator,
|
||||
(lit_utf8_size_t) (end_position - last_separator));
|
||||
string_params_p[params_count] = ecma_string_trim (param_str_p);
|
||||
ecma_deref_ecma_string (param_str_p);
|
||||
params_count++;
|
||||
|
||||
@ -852,6 +852,10 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
|
||||
output_size++;
|
||||
}
|
||||
}
|
||||
else if ((decoded_byte & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
|
||||
{
|
||||
output_size += 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
output_size++;
|
||||
@ -861,27 +865,23 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
|
||||
if (ecma_is_completion_value_empty (ret_value))
|
||||
{
|
||||
MEM_DEFINE_LOCAL_ARRAY (output_start_p,
|
||||
output_size * 2,
|
||||
output_size,
|
||||
lit_utf8_byte_t);
|
||||
|
||||
input_char_p = input_start_p;
|
||||
lit_utf8_byte_t *output_char_p = output_start_p;
|
||||
lit_utf8_byte_t *output_type_p = output_start_p + output_size;
|
||||
|
||||
while (input_char_p < input_end_p)
|
||||
{
|
||||
/* Input decode. */
|
||||
if (*input_char_p != '%')
|
||||
{
|
||||
*output_type_p++ = URI_DECODE_ORIGINAL_BYTE;
|
||||
*output_char_p = *input_char_p;
|
||||
output_char_p++;
|
||||
input_char_p++;
|
||||
continue;
|
||||
}
|
||||
|
||||
*output_type_p++ = URI_DECODE_DECODED_BYTE;
|
||||
|
||||
lit_code_point_t decoded_byte;
|
||||
|
||||
lit_read_code_point_from_hex (input_char_p + 1, 2, &decoded_byte);
|
||||
@ -898,68 +898,95 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
|
||||
}
|
||||
else
|
||||
{
|
||||
*output_char_p = (lit_utf8_byte_t) decoded_byte;
|
||||
output_char_p++;
|
||||
*output_char_p++ = (lit_utf8_byte_t) decoded_byte;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
*output_char_p = (lit_utf8_byte_t) decoded_byte;
|
||||
output_char_p++;
|
||||
}
|
||||
}
|
||||
uint32_t bytes_count;
|
||||
|
||||
JERRY_ASSERT (output_start_p + output_size == output_char_p);
|
||||
|
||||
bool valid_utf8 = lit_is_utf8_string_valid (output_start_p, output_size);
|
||||
|
||||
if (valid_utf8)
|
||||
{
|
||||
lit_utf8_iterator_t characters = lit_utf8_iterator_create (output_start_p, output_size);
|
||||
output_type_p = output_start_p + output_size;
|
||||
|
||||
while (!lit_utf8_iterator_is_eos (&characters))
|
||||
{
|
||||
bool original_byte = output_type_p[characters.buf_pos.offset] == URI_DECODE_ORIGINAL_BYTE;
|
||||
|
||||
ecma_char_t character = lit_utf8_iterator_read_next (&characters);
|
||||
|
||||
/* Surrogate fragments are allowed in JS, but not accepted by URI decoding. */
|
||||
if (!original_byte)
|
||||
if ((decoded_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
|
||||
{
|
||||
if (lit_is_code_unit_high_surrogate (character))
|
||||
{
|
||||
/* Note: stray high/low surrogate pairs are not allowed in the stream. */
|
||||
if (lit_utf8_iterator_is_eos (&characters))
|
||||
{
|
||||
valid_utf8 = false;
|
||||
break;
|
||||
}
|
||||
bytes_count = 2;
|
||||
}
|
||||
else if ((decoded_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
|
||||
{
|
||||
bytes_count = 3;
|
||||
}
|
||||
else if ((decoded_byte & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
|
||||
{
|
||||
bytes_count = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
|
||||
break;
|
||||
}
|
||||
|
||||
if (output_type_p[characters.buf_pos.offset] == URI_DECODE_ORIGINAL_BYTE
|
||||
|| !lit_is_code_unit_low_surrogate (lit_utf8_iterator_read_next (&characters)))
|
||||
{
|
||||
valid_utf8 = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (lit_is_code_unit_low_surrogate (character))
|
||||
lit_utf8_byte_t octets[LIT_UTF8_MAX_BYTES_IN_CODE_POINT];
|
||||
octets[0] = (lit_utf8_byte_t) decoded_byte;
|
||||
bool is_valid = true;
|
||||
|
||||
for (uint32_t i = 1; i < bytes_count; i++)
|
||||
{
|
||||
if (input_char_p >= input_end_p || *input_char_p != '%')
|
||||
{
|
||||
valid_utf8 = false;
|
||||
is_valid = false;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
lit_code_point_t cp;
|
||||
lit_read_code_point_from_hex (input_char_p + 1, 2, &cp);
|
||||
|
||||
if ((cp & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
|
||||
{
|
||||
is_valid = false;
|
||||
break;
|
||||
}
|
||||
|
||||
octets[i] = (lit_utf8_byte_t) cp;
|
||||
input_char_p += URI_ENCODED_BYTE_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_valid)
|
||||
{
|
||||
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
|
||||
break;
|
||||
}
|
||||
|
||||
lit_code_point_t cp;
|
||||
lit_read_code_point_from_utf8 (octets, bytes_count, &cp);
|
||||
|
||||
if ((bytes_count == 2 && cp <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
||||
|| (bytes_count == 3 && cp <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
|
||||
|| (bytes_count == 4 && cp <= LIT_UTF8_3_BYTE_CODE_POINT_MAX)
|
||||
|| lit_is_code_unit_high_surrogate ((ecma_char_t) cp)
|
||||
|| lit_is_code_unit_low_surrogate ((ecma_char_t) cp)
|
||||
|| cp > LIT_UNICODE_CODE_POINT_MAX)
|
||||
{
|
||||
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
|
||||
break;
|
||||
}
|
||||
|
||||
output_char_p += lit_code_point_to_cesu8 (cp, output_char_p);
|
||||
}
|
||||
}
|
||||
|
||||
if (valid_utf8)
|
||||
if (ecma_is_completion_value_empty (ret_value))
|
||||
{
|
||||
ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_size);
|
||||
ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));
|
||||
}
|
||||
else
|
||||
{
|
||||
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
|
||||
JERRY_ASSERT (output_start_p + output_size == output_char_p);
|
||||
|
||||
if (lit_is_cesu8_string_valid (output_start_p, output_size))
|
||||
{
|
||||
ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (output_start_p, output_size);
|
||||
ret_value = ecma_make_normal_completion_value (ecma_make_string_value (output_string_p));
|
||||
}
|
||||
else
|
||||
{
|
||||
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
|
||||
}
|
||||
}
|
||||
|
||||
MEM_FINALIZE_LOCAL_ARRAY (output_start_p);
|
||||
@ -1056,20 +1083,53 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
|
||||
*/
|
||||
|
||||
lit_utf8_byte_t *input_char_p = input_start_p;
|
||||
lit_utf8_byte_t *input_end_p = input_start_p + input_size;
|
||||
const lit_utf8_byte_t *input_end_p = input_start_p + input_size;
|
||||
lit_utf8_size_t output_length = 0;
|
||||
lit_code_point_t cp;
|
||||
ecma_char_t ch;
|
||||
lit_utf8_byte_t octets[LIT_UTF8_MAX_BYTES_IN_CODE_POINT];
|
||||
|
||||
while (input_char_p < input_end_p)
|
||||
{
|
||||
/*
|
||||
* We expect that the input is a valid UTF-8 sequence,
|
||||
* so we only need to reject stray surrogate pairs.
|
||||
*/
|
||||
/* Input validation, we need to reject stray surrogates. */
|
||||
input_char_p += lit_read_code_unit_from_utf8 (input_char_p, &ch);
|
||||
|
||||
/* Input validation. */
|
||||
if (*input_char_p <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
||||
if (lit_is_code_unit_low_surrogate (ch))
|
||||
{
|
||||
if (ecma_builtin_global_object_character_is_in (*input_char_p, unescaped_uri_bitset_p))
|
||||
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
|
||||
break;
|
||||
}
|
||||
|
||||
cp = ch;
|
||||
|
||||
if (lit_is_code_unit_high_surrogate (ch))
|
||||
{
|
||||
if (input_char_p == input_end_p)
|
||||
{
|
||||
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
|
||||
break;
|
||||
}
|
||||
|
||||
ecma_char_t next_ch;
|
||||
lit_utf8_size_t read_size = lit_read_code_unit_from_utf8 (input_char_p, &next_ch);
|
||||
|
||||
if (lit_is_code_unit_low_surrogate (next_ch))
|
||||
{
|
||||
cp = lit_convert_surrogate_pair_to_code_point (ch, next_ch);
|
||||
input_char_p += read_size;
|
||||
}
|
||||
else
|
||||
{
|
||||
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
lit_utf8_size_t utf_size = lit_code_point_to_utf8 (cp, octets);
|
||||
|
||||
if (utf_size == 1)
|
||||
{
|
||||
if (ecma_builtin_global_object_character_is_in (octets[0], unescaped_uri_bitset_p))
|
||||
{
|
||||
output_length++;
|
||||
}
|
||||
@ -1078,28 +1138,10 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
|
||||
output_length += URI_ENCODED_BYTE_SIZE;
|
||||
}
|
||||
}
|
||||
else if (*input_char_p == (LIT_UTF8_3_BYTE_MARKER + (LIT_UTF16_HIGH_SURROGATE_MARKER >> 12)))
|
||||
{
|
||||
/* The next character is in the [0xd000, 0xdfff] range. */
|
||||
output_length += URI_ENCODED_BYTE_SIZE;
|
||||
input_char_p++;
|
||||
JERRY_ASSERT (input_char_p < input_end_p);
|
||||
JERRY_ASSERT ((*input_char_p & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
|
||||
|
||||
/* If this condition is true, the next character is >= LIT_UTF16_HIGH_SURROGATE_MIN. */
|
||||
if (*input_char_p & 0x20)
|
||||
{
|
||||
ret_value = ecma_make_throw_obj_completion_value (ecma_new_standard_error (ECMA_ERROR_URI));
|
||||
break;
|
||||
}
|
||||
output_length += URI_ENCODED_BYTE_SIZE;
|
||||
}
|
||||
else
|
||||
{
|
||||
output_length += URI_ENCODED_BYTE_SIZE;
|
||||
output_length += utf_size * URI_ENCODED_BYTE_SIZE;
|
||||
}
|
||||
|
||||
input_char_p++;
|
||||
}
|
||||
|
||||
if (ecma_is_completion_value_empty (ret_value))
|
||||
@ -1114,26 +1156,43 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
|
||||
while (input_char_p < input_end_p)
|
||||
{
|
||||
/* Input decode. */
|
||||
input_char_p += lit_read_code_unit_from_utf8 (input_char_p, &ch);
|
||||
cp = ch;
|
||||
|
||||
if (*input_char_p <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
||||
if (lit_is_code_unit_high_surrogate (ch))
|
||||
{
|
||||
if (ecma_builtin_global_object_character_is_in (*input_char_p, unescaped_uri_bitset_p))
|
||||
ecma_char_t next_ch;
|
||||
lit_utf8_size_t read_size = lit_read_code_unit_from_utf8 (input_char_p, &next_ch);
|
||||
|
||||
if (lit_is_code_unit_low_surrogate (next_ch))
|
||||
{
|
||||
*output_char_p++ = *input_char_p;
|
||||
cp = lit_convert_surrogate_pair_to_code_point (ch, next_ch);
|
||||
input_char_p += read_size;
|
||||
}
|
||||
}
|
||||
|
||||
lit_utf8_size_t utf_size = lit_code_point_to_utf8 (cp, octets);
|
||||
|
||||
if (utf_size == 1)
|
||||
{
|
||||
if (ecma_builtin_global_object_character_is_in (octets[0], unescaped_uri_bitset_p))
|
||||
{
|
||||
*output_char_p++ = octets[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
ecma_builtin_global_object_byte_to_hex (output_char_p, *input_char_p);
|
||||
ecma_builtin_global_object_byte_to_hex (output_char_p, octets[0]);
|
||||
output_char_p += URI_ENCODED_BYTE_SIZE;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ecma_builtin_global_object_byte_to_hex (output_char_p, *input_char_p);
|
||||
output_char_p += URI_ENCODED_BYTE_SIZE;
|
||||
for (uint32_t i = 0; i < utf_size; i++)
|
||||
{
|
||||
ecma_builtin_global_object_byte_to_hex (output_char_p, octets[i]);
|
||||
output_char_p += URI_ENCODED_BYTE_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
input_char_p++;
|
||||
}
|
||||
|
||||
JERRY_ASSERT (output_start_p + output_length == output_char_p);
|
||||
|
||||
@ -186,7 +186,7 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
|
||||
}
|
||||
|
||||
current_p += 5;
|
||||
write_p += lit_code_point_to_utf8 (code_point, write_p);
|
||||
write_p += lit_code_point_to_cesu8 (code_point, write_p);
|
||||
continue;
|
||||
/* FALLTHRU */
|
||||
}
|
||||
@ -199,57 +199,6 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
|
||||
*write_p++ = *current_p++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Post processing surrogate pairs.
|
||||
*
|
||||
* The general issue is, that surrogate fragments can come from
|
||||
* the original stream and can be constructed by \u sequences
|
||||
* as well. We need to construct code points from them.
|
||||
*
|
||||
* Example: JSON.parse ('"\\ud801\udc00"') === "\ud801\udc00"
|
||||
* The first \u is parsed by JSON, the second is by the lexer.
|
||||
*
|
||||
* The rewrite happens in-place, since the write pointer is always
|
||||
* precede the read-pointer. We also cannot create an UTF8 iterator,
|
||||
* because the lit_is_utf8_string_valid assertion may fail.
|
||||
*/
|
||||
|
||||
lit_utf8_byte_t *read_p = token_p->u.string.start_p;
|
||||
lit_utf8_byte_t *read_end_p = write_p;
|
||||
write_p = read_p;
|
||||
|
||||
while (read_p < read_end_p)
|
||||
{
|
||||
lit_code_point_t code_point;
|
||||
read_p += lit_read_code_point_from_utf8 (read_p,
|
||||
(lit_utf8_size_t) (read_end_p - read_p),
|
||||
&code_point);
|
||||
|
||||
/* The lit_is_code_unit_high_surrogate expects ecma_char_t argument
|
||||
so code_points above maximum UTF16 code unit must not be tested. */
|
||||
if (read_p < read_end_p
|
||||
&& code_point <= LIT_UTF16_CODE_UNIT_MAX
|
||||
&& lit_is_code_unit_high_surrogate ((ecma_char_t) code_point))
|
||||
{
|
||||
lit_code_point_t next_code_point;
|
||||
lit_utf8_size_t next_code_point_size = lit_read_code_point_from_utf8 (read_p,
|
||||
(lit_utf8_size_t) (read_end_p - read_p),
|
||||
&next_code_point);
|
||||
|
||||
if (next_code_point <= LIT_UTF16_CODE_UNIT_MAX
|
||||
&& lit_is_code_unit_low_surrogate ((ecma_char_t) next_code_point))
|
||||
{
|
||||
code_point = lit_convert_surrogate_pair_to_code_point ((ecma_char_t) code_point,
|
||||
(ecma_char_t) next_code_point);
|
||||
read_p += next_code_point_size;
|
||||
}
|
||||
}
|
||||
write_p += lit_code_point_to_utf8 (code_point, write_p);
|
||||
}
|
||||
|
||||
JERRY_ASSERT (lit_is_utf8_string_valid (token_p->u.string.start_p,
|
||||
(lit_utf8_size_t) (write_p - token_p->u.string.start_p)));
|
||||
|
||||
token_p->u.string.size = (lit_utf8_size_t) (write_p - token_p->u.string.start_p);
|
||||
token_p->current_p = current_p + 1;
|
||||
token_p->type = string_token;
|
||||
|
||||
@ -2306,26 +2306,9 @@ ecma_builtin_string_prototype_object_conversion_helper (ecma_value_t this_arg, /
|
||||
{
|
||||
ecma_char_t character = lit_utf8_iterator_read_next (&input_iterator);
|
||||
ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH];
|
||||
lit_utf8_byte_t utf8_byte_buffer[LIT_UTF8_MAX_BYTES_IN_CODE_POINT];
|
||||
lit_utf8_byte_t utf8_byte_buffer[LIT_CESU8_MAX_BYTES_IN_CODE_POINT];
|
||||
lit_utf8_size_t character_length;
|
||||
|
||||
/*
|
||||
* We need to keep surrogate pairs. Surrogates are never converted,
|
||||
* regardless they form a valid pair or not.
|
||||
*/
|
||||
if (lit_is_code_unit_high_surrogate (character))
|
||||
{
|
||||
ecma_char_t next_character = lit_utf8_iterator_peek_next (&input_iterator);
|
||||
|
||||
if (lit_is_code_unit_low_surrogate (next_character))
|
||||
{
|
||||
lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (character, next_character);
|
||||
output_length += lit_code_point_to_utf8 (surrogate_code_point, utf8_byte_buffer);
|
||||
lit_utf8_iterator_incr (&input_iterator);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (lower_case)
|
||||
{
|
||||
character_length = lit_char_to_lower_case (character,
|
||||
@ -2364,23 +2347,6 @@ ecma_builtin_string_prototype_object_conversion_helper (ecma_value_t this_arg, /
|
||||
ecma_char_t character_buffer[LIT_MAXIMUM_OTHER_CASE_LENGTH];
|
||||
lit_utf8_size_t character_length;
|
||||
|
||||
/*
|
||||
* We need to keep surrogate pairs. Surrogates are never converted,
|
||||
* regardless they form a valid pair or not.
|
||||
*/
|
||||
if (lit_is_code_unit_high_surrogate (character))
|
||||
{
|
||||
ecma_char_t next_character = lit_utf8_iterator_peek_next (&input_iterator);
|
||||
|
||||
if (lit_is_code_unit_low_surrogate (next_character))
|
||||
{
|
||||
lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (character, next_character);
|
||||
output_char_p += lit_code_point_to_utf8 (surrogate_code_point, output_char_p);
|
||||
lit_utf8_iterator_incr (&input_iterator);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (lower_case)
|
||||
{
|
||||
character_length = lit_char_to_lower_case (character,
|
||||
@ -2398,7 +2364,7 @@ ecma_builtin_string_prototype_object_conversion_helper (ecma_value_t this_arg, /
|
||||
|
||||
for (lit_utf8_size_t i = 0; i < character_length; i++)
|
||||
{
|
||||
output_char_p += lit_code_point_to_utf8 (character_buffer[i], output_char_p);
|
||||
output_char_p += lit_code_unit_to_utf8 (character_buffer[i], output_char_p);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2503,60 +2469,8 @@ ecma_builtin_string_prototype_object_trim (ecma_value_t this_arg) /**< this argu
|
||||
|
||||
ecma_string_t *original_string_p = ecma_get_string_from_value (to_string_val);
|
||||
|
||||
/* 3 */
|
||||
const lit_utf8_size_t size = ecma_string_get_size (original_string_p);
|
||||
|
||||
/* Workaround: avoid repeated call of ecma_string_get_char_at_pos() because its overhead */
|
||||
lit_utf8_byte_t *original_utf8_str_p = (lit_utf8_byte_t *) mem_heap_alloc_block (size + 1,
|
||||
MEM_HEAP_ALLOC_SHORT_TERM);
|
||||
ssize_t sz = ecma_string_to_utf8_string (original_string_p, original_utf8_str_p, (ssize_t) size);
|
||||
JERRY_ASSERT (sz >= 0);
|
||||
|
||||
const ecma_length_t length = lit_utf8_string_length (original_utf8_str_p, size);
|
||||
|
||||
lit_utf8_iterator_t iter = lit_utf8_iterator_create (original_utf8_str_p, size);
|
||||
|
||||
uint32_t prefix = 0, postfix = 0;
|
||||
uint32_t new_len = 0;
|
||||
|
||||
while (!lit_utf8_iterator_is_eos (&iter))
|
||||
{
|
||||
ecma_char_t current_char = lit_utf8_iterator_read_next (&iter);
|
||||
|
||||
if (lit_char_is_white_space (current_char)
|
||||
|| lit_char_is_line_terminator (current_char))
|
||||
{
|
||||
prefix++;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
lit_utf8_iterator_seek_eos (&iter);
|
||||
while (!lit_utf8_iterator_is_bos (&iter))
|
||||
{
|
||||
ecma_char_t current_char = lit_utf8_iterator_read_prev (&iter);
|
||||
|
||||
if (lit_char_is_white_space (current_char)
|
||||
|| lit_char_is_line_terminator (current_char))
|
||||
{
|
||||
postfix++;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
new_len = prefix < length ? length - prefix - postfix : 0;
|
||||
|
||||
ecma_string_t *new_str_p = ecma_string_substr (original_string_p, prefix, prefix + new_len);
|
||||
|
||||
/* 4 */
|
||||
ret_value = ecma_make_normal_completion_value (ecma_make_string_value (new_str_p));
|
||||
|
||||
mem_heap_free_block (original_utf8_str_p);
|
||||
ecma_string_t *trimmed_string_p = ecma_string_trim (original_string_p);
|
||||
ret_value = ecma_make_normal_completion_value (ecma_make_string_value (trimmed_string_p));
|
||||
|
||||
ECMA_FINALIZE (to_string_val);
|
||||
ECMA_FINALIZE (check_coercible_val);
|
||||
|
||||
@ -67,15 +67,13 @@ ecma_builtin_string_object_from_char_code (ecma_value_t this_arg __attr_unused__
|
||||
}
|
||||
else
|
||||
{
|
||||
lit_utf8_size_t utf8_buf_size = args_number * LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
|
||||
lit_utf8_size_t utf8_buf_size = args_number * LIT_CESU8_MAX_BYTES_IN_CODE_UNIT;
|
||||
|
||||
MEM_DEFINE_LOCAL_ARRAY (utf8_buf_p,
|
||||
utf8_buf_size,
|
||||
lit_utf8_byte_t);
|
||||
|
||||
lit_utf8_size_t utf8_buf_used = 0;
|
||||
lit_utf8_size_t last_code_unit_size = 0;
|
||||
ecma_char_t high_surrogate = 0;
|
||||
|
||||
for (ecma_length_t arg_index = 0;
|
||||
arg_index < args_number && ecma_is_completion_value_empty (ret_value);
|
||||
@ -86,37 +84,10 @@ ecma_builtin_string_object_from_char_code (ecma_value_t this_arg __attr_unused__
|
||||
uint32_t uint32_char_code = ecma_number_to_uint32 (arg_num);
|
||||
ecma_char_t code_unit = (uint16_t) uint32_char_code;
|
||||
|
||||
if (high_surrogate && lit_is_code_unit_low_surrogate (code_unit))
|
||||
{
|
||||
JERRY_ASSERT (last_code_unit_size > 0);
|
||||
JERRY_ASSERT (utf8_buf_used >= last_code_unit_size);
|
||||
|
||||
utf8_buf_used -= last_code_unit_size;
|
||||
|
||||
JERRY_ASSERT (utf8_buf_used <= utf8_buf_size - LIT_UTF8_MAX_BYTES_IN_CODE_POINT);
|
||||
|
||||
lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (high_surrogate, code_unit);
|
||||
|
||||
last_code_unit_size = lit_code_point_to_utf8 (code_point, utf8_buf_p + utf8_buf_used);
|
||||
}
|
||||
else
|
||||
{
|
||||
JERRY_ASSERT (utf8_buf_used <= utf8_buf_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
|
||||
last_code_unit_size = lit_code_unit_to_utf8 (code_unit, utf8_buf_p + utf8_buf_used);
|
||||
}
|
||||
|
||||
utf8_buf_used += last_code_unit_size;
|
||||
JERRY_ASSERT (utf8_buf_used <= utf8_buf_size - LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
|
||||
utf8_buf_used += lit_code_unit_to_utf8 (code_unit, utf8_buf_p + utf8_buf_used);
|
||||
JERRY_ASSERT (utf8_buf_used <= utf8_buf_size);
|
||||
|
||||
if (lit_is_code_unit_high_surrogate (code_unit))
|
||||
{
|
||||
high_surrogate = code_unit;
|
||||
}
|
||||
else
|
||||
{
|
||||
high_surrogate = 0;
|
||||
}
|
||||
|
||||
ECMA_OP_TO_NUMBER_FINALIZE (arg_num);
|
||||
}
|
||||
|
||||
|
||||
@ -360,6 +360,7 @@ lit_read_code_point_from_hex (const lit_utf8_byte_t *buf_p, /**< buffer with cha
|
||||
|
||||
buf_p++;
|
||||
}
|
||||
|
||||
*out_code_point_p = code_point;
|
||||
return true;
|
||||
} /* lit_read_code_point_from_hex */
|
||||
|
||||
@ -93,6 +93,16 @@ typedef ecma_char_t *ecma_char_ptr_t;
|
||||
*/
|
||||
#define LIT_UTF8_MAX_BYTES_IN_CODE_POINT (4)
|
||||
|
||||
/**
|
||||
* Max bytes needed to represent a code unit (utf-16 char) via cesu-8 encoding
|
||||
*/
|
||||
#define LIT_CESU8_MAX_BYTES_IN_CODE_UNIT (3)
|
||||
|
||||
/**
|
||||
* Max bytes needed to represent a code point (Unicode character) via cesu-8 encoding
|
||||
*/
|
||||
#define LIT_CESU8_MAX_BYTES_IN_CODE_POINT (6)
|
||||
|
||||
/**
|
||||
* A byte of utf-8 string
|
||||
*/
|
||||
|
||||
@ -57,7 +57,7 @@ lit_dump_literals ()
|
||||
*/
|
||||
literal_t
|
||||
lit_create_literal_from_utf8_string (const lit_utf8_byte_t *str_p, /**< string to initialize the record,
|
||||
* could be non-zero-terminated */
|
||||
* could be non-zero-terminated */
|
||||
lit_utf8_size_t str_size) /**< length of the string */
|
||||
{
|
||||
JERRY_ASSERT (str_p || !str_size);
|
||||
|
||||
@ -182,7 +182,7 @@ lit_magic_strings_ex_set (const lit_utf8_byte_t **ex_str_items, /**< character a
|
||||
|
||||
|
||||
/**
|
||||
* Check if passed utf-8 string equals to one of magic strings
|
||||
* Check if passed cesu-8 string equals to one of magic strings
|
||||
 * and if an equal magic string was found, return its id in 'out_id_p' argument.
|
||||
*
|
||||
* @return true - if magic string equal to passed string was found,
|
||||
|
||||
@ -152,6 +152,76 @@ lit_is_utf8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
|
||||
return true;
|
||||
} /* lit_is_utf8_string_valid */
|
||||
|
||||
/**
|
||||
* Validate cesu-8 string
|
||||
*
|
||||
* @return true if cesu-8 string is well-formed
|
||||
* false otherwise
|
||||
*/
|
||||
bool
|
||||
lit_is_cesu8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
|
||||
lit_utf8_size_t buf_size) /**< string size */
|
||||
{
|
||||
lit_utf8_size_t idx = 0;
|
||||
|
||||
while (idx < buf_size)
|
||||
{
|
||||
lit_utf8_byte_t c = utf8_buf_p[idx++];
|
||||
if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
lit_code_point_t code_point = 0;
|
||||
lit_code_point_t min_code_point = 0;
|
||||
lit_utf8_size_t extra_bytes_count;
|
||||
if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
|
||||
{
|
||||
extra_bytes_count = 1;
|
||||
min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
|
||||
code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
|
||||
}
|
||||
else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
|
||||
{
|
||||
extra_bytes_count = 2;
|
||||
min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
|
||||
code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (idx + extra_bytes_count > buf_size)
|
||||
{
|
||||
/* utf-8 string breaks in the middle */
|
||||
return false;
|
||||
}
|
||||
|
||||
for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset)
|
||||
{
|
||||
c = utf8_buf_p[idx + offset];
|
||||
if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
|
||||
{
|
||||
/* invalid continuation byte */
|
||||
return false;
|
||||
}
|
||||
code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
||||
code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK);
|
||||
}
|
||||
|
||||
if (code_point < min_code_point)
|
||||
{
|
||||
/* utf-8 string doesn't encode valid unicode code point */
|
||||
return false;
|
||||
}
|
||||
|
||||
idx += extra_bytes_count;
|
||||
}
|
||||
|
||||
return true;
|
||||
} /* lit_is_cesu8_string_valid */
|
||||
|
||||
/**
|
||||
* Check if the code unit type is low surrogate
|
||||
*
|
||||
@ -184,7 +254,8 @@ lit_utf8_iterator_create (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
|
||||
lit_utf8_size_t buf_size) /**< string size */
|
||||
{
|
||||
JERRY_ASSERT (utf8_buf_p || !buf_size);
|
||||
JERRY_ASSERT (lit_is_utf8_string_valid (utf8_buf_p, buf_size));
|
||||
/* TODO: Add back when builtins no longer use iterators */
|
||||
/* JERRY_ASSERT (lit_is_utf8_string_valid (utf8_buf_p, buf_size)); */
|
||||
|
||||
lit_utf8_iterator_t buf_iter =
|
||||
{
|
||||
@ -524,7 +595,7 @@ lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p) /**< zero-terminated
|
||||
} /* lit_zt_utf8_string_size */
|
||||
|
||||
/**
|
||||
* Calculate length of a utf-8 string
|
||||
* Calculate length of a cesu-8 string
|
||||
*
|
||||
* @return UTF-16 code units count
|
||||
*/
|
||||
@ -533,13 +604,15 @@ lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
|
||||
lit_utf8_size_t utf8_buf_size) /**< string size */
|
||||
{
|
||||
ecma_length_t length = 0;
|
||||
lit_utf8_iterator_t buf_iter = lit_utf8_iterator_create (utf8_buf_p, utf8_buf_size);
|
||||
while (!lit_utf8_iterator_is_eos (&buf_iter))
|
||||
lit_utf8_size_t size = 0;
|
||||
|
||||
while (size < utf8_buf_size)
|
||||
{
|
||||
lit_utf8_iterator_read_next (&buf_iter);
|
||||
size += lit_get_unicode_char_size_by_utf8_first_byte (*(utf8_buf_p + size));
|
||||
length++;
|
||||
}
|
||||
JERRY_ASSERT (lit_utf8_iterator_is_eos (&buf_iter));
|
||||
|
||||
JERRY_ASSERT (size == utf8_buf_size);
|
||||
|
||||
return length;
|
||||
} /* lit_utf8_string_length */
|
||||
@ -597,6 +670,158 @@ lit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with ch
|
||||
return bytes_count;
|
||||
} /* lit_read_code_point_from_utf8 */
|
||||
|
||||
/**
|
||||
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
|
||||
*
|
||||
* @return number of bytes occupied by code point in the string
|
||||
*/
|
||||
lit_utf8_size_t
|
||||
lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
|
||||
ecma_char_t *code_point) /**< @out: code point */
|
||||
{
|
||||
JERRY_ASSERT (buf_p);
|
||||
|
||||
lit_utf8_byte_t c = buf_p[0];
|
||||
if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
|
||||
{
|
||||
*code_point = (lit_code_point_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
|
||||
return 1;
|
||||
}
|
||||
|
||||
lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
|
||||
ecma_length_t bytes_count;
|
||||
if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
|
||||
{
|
||||
bytes_count = 2;
|
||||
ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
|
||||
}
|
||||
else
|
||||
{
|
||||
JERRY_ASSERT ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
|
||||
bytes_count = 3;
|
||||
ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
|
||||
}
|
||||
|
||||
for (uint32_t i = 1; i < bytes_count; ++i)
|
||||
{
|
||||
ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
||||
ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
|
||||
}
|
||||
|
||||
JERRY_ASSERT (ret <= LIT_UTF16_CODE_UNIT_MAX);
|
||||
*code_point = (ecma_char_t) ret;
|
||||
return bytes_count;
|
||||
} /* lit_read_code_unit_from_utf8 */
|
||||
|
||||
/**
|
||||
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
|
||||
*
|
||||
* @return number of bytes occupied by code point in the string
|
||||
*/
|
||||
lit_utf8_size_t
|
||||
lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
|
||||
ecma_char_t *code_point) /**< @out: code point */
|
||||
{
|
||||
JERRY_ASSERT (buf_p);
|
||||
|
||||
lit_utf8_byte_t *current_p = (lit_utf8_byte_t *) buf_p;
|
||||
|
||||
lit_utf8_decr (¤t_p);
|
||||
return lit_read_code_unit_from_utf8 (current_p, code_point);
|
||||
} /* lit_read_prev_code_unit_from_utf8 */
|
||||
|
||||
/**
|
||||
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
|
||||
*
|
||||
* @return read character
|
||||
*/
|
||||
ecma_char_t
|
||||
lit_utf8_read_next (lit_utf8_byte_t **buf_p) /**< in-out:buffer with characters */
|
||||
{
|
||||
JERRY_ASSERT (*buf_p);
|
||||
ecma_char_t ch;
|
||||
|
||||
*buf_p += lit_read_code_unit_from_utf8 (*buf_p, &ch);
|
||||
|
||||
return ch;
|
||||
} /* lit_utf8_read_next */
|
||||
|
||||
/**
|
||||
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
|
||||
*
|
||||
* @return read character
|
||||
*/
|
||||
ecma_char_t
|
||||
lit_utf8_read_prev (lit_utf8_byte_t **buf_p) /**< in-out:buffer with characters */
|
||||
{
|
||||
JERRY_ASSERT (*buf_p);
|
||||
ecma_char_t ch;
|
||||
|
||||
lit_utf8_decr (buf_p);
|
||||
lit_read_code_unit_from_utf8 (*buf_p, &ch);
|
||||
|
||||
return ch;
|
||||
} /* lit_utf8_read_prev */
|
||||
|
||||
/**
|
||||
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
|
||||
*
|
||||
* @return read character
|
||||
*/
|
||||
ecma_char_t
|
||||
lit_utf8_peek_next (lit_utf8_byte_t *buf_p) /**< in-out:buffer with characters */
|
||||
{
|
||||
JERRY_ASSERT (buf_p);
|
||||
ecma_char_t ch;
|
||||
|
||||
lit_read_code_unit_from_utf8 (buf_p, &ch);
|
||||
|
||||
return ch;
|
||||
} /* lit_utf8_peek_next */
|
||||
|
||||
/**
|
||||
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
|
||||
*
|
||||
* @return read character
|
||||
*/
|
||||
ecma_char_t
|
||||
lit_utf8_peek_prev (lit_utf8_byte_t *buf_p) /**< in-out:buffer with characters */
|
||||
{
|
||||
JERRY_ASSERT (buf_p);
|
||||
ecma_char_t ch;
|
||||
|
||||
lit_read_prev_code_unit_from_utf8 (buf_p, &ch);
|
||||
|
||||
return ch;
|
||||
} /* lit_utf8_peek_prev */
|
||||
|
||||
/**
|
||||
* Increase character pointer by one code unit.
|
||||
*/
|
||||
void
|
||||
lit_utf8_incr (lit_utf8_byte_t **buf_p) /**< in-out:buffer with characters */
|
||||
{
|
||||
JERRY_ASSERT (*buf_p);
|
||||
|
||||
*buf_p += lit_get_unicode_char_size_by_utf8_first_byte (**buf_p);
|
||||
} /* lit_utf8_incr */
|
||||
|
||||
/**
|
||||
* Decrease character pointer by one code unit.
|
||||
*/
|
||||
void
|
||||
lit_utf8_decr (lit_utf8_byte_t **buf_p) /**< in-out:buffer with characters */
|
||||
{
|
||||
JERRY_ASSERT (*buf_p);
|
||||
lit_utf8_byte_t *current_p = *buf_p;
|
||||
do
|
||||
{
|
||||
current_p--;
|
||||
}
|
||||
while ((*(current_p) & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
|
||||
*buf_p = current_p;
|
||||
} /* lit_utf8_decr */
|
||||
|
||||
/**
|
||||
* Calc hash using the specified hash_basis.
|
||||
*
|
||||
@ -653,13 +878,13 @@ lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 stri
|
||||
lit_utf8_size_t utf8_buf_size, /**< string size in bytes */
|
||||
ecma_length_t code_unit_offset) /**< offset of a code unit */
|
||||
{
|
||||
lit_utf8_iterator_t iter = lit_utf8_iterator_create (utf8_buf_p, utf8_buf_size);
|
||||
lit_utf8_byte_t *current_p = (lit_utf8_byte_t *) utf8_buf_p;
|
||||
ecma_char_t code_unit;
|
||||
|
||||
do
|
||||
{
|
||||
JERRY_ASSERT (!lit_utf8_iterator_is_eos (&iter));
|
||||
code_unit = lit_utf8_iterator_read_next (&iter);
|
||||
JERRY_ASSERT (current_p < utf8_buf_p + utf8_buf_size);
|
||||
current_p += lit_read_code_unit_from_utf8 (current_p, &code_unit);
|
||||
}
|
||||
while (code_unit_offset--);
|
||||
|
||||
@ -667,12 +892,12 @@ lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 stri
|
||||
} /* lit_utf8_string_code_unit_at */
|
||||
|
||||
/**
|
||||
* Return number of bytes occupied by a unicode character in utf-8 representation
|
||||
* Get CESU-8 encoded size of character
|
||||
*
|
||||
* @return size of a unicode character in utf-8 format
|
||||
* @return number of bytes occupied in CESU-8
|
||||
*/
|
||||
lit_utf8_size_t
|
||||
lit_get_unicode_char_size_by_utf8_first_byte (lit_utf8_byte_t first_byte) /**< first byte of a utf-8 byte sequence */
|
||||
lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte) /**< buffer with characters */
|
||||
{
|
||||
if ((first_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
|
||||
{
|
||||
@ -682,19 +907,15 @@ lit_get_unicode_char_size_by_utf8_first_byte (lit_utf8_byte_t first_byte) /**< f
|
||||
{
|
||||
return 2;
|
||||
}
|
||||
else if ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
|
||||
{
|
||||
return 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
JERRY_ASSERT ((first_byte & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER);
|
||||
return 4;
|
||||
JERRY_ASSERT ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
|
||||
return 3;
|
||||
}
|
||||
} /* lit_get_unicode_char_size_by_utf8_first_byte */
|
||||
|
||||
/**
|
||||
* Convert code_unit to utf-8 representation
|
||||
* Convert code_unit to cesu-8 representation
|
||||
*
|
||||
 * @return number of bytes required to represent the specified code unit
|
||||
*/
|
||||
@ -703,9 +924,65 @@ lit_code_unit_to_utf8 (ecma_char_t code_unit, /**< code unit */
|
||||
lit_utf8_byte_t *buf_p) /**< buffer where to store the result,
|
||||
* its size should be at least MAX_BYTES_IN_CODE_UNIT */
|
||||
{
|
||||
return lit_code_point_to_utf8 (code_unit, buf_p);
|
||||
if (code_unit <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
||||
{
|
||||
buf_p[0] = (lit_utf8_byte_t) code_unit;
|
||||
return 1;
|
||||
}
|
||||
else if (code_unit <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
|
||||
{
|
||||
uint32_t code_unit_bits = code_unit;
|
||||
lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
|
||||
code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
||||
|
||||
lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_5_BITS_MASK);
|
||||
JERRY_ASSERT (first_byte_bits == code_unit_bits);
|
||||
|
||||
buf_p[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits;
|
||||
buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
|
||||
return 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32_t code_unit_bits = code_unit;
|
||||
lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
|
||||
code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
||||
|
||||
lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
|
||||
code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
||||
|
||||
lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_4_BITS_MASK);
|
||||
JERRY_ASSERT (first_byte_bits == code_unit_bits);
|
||||
|
||||
buf_p[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits;
|
||||
buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
|
||||
buf_p[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
|
||||
return 3;
|
||||
}
|
||||
} /* lit_code_unit_to_utf8 */
|
||||
|
||||
/**
|
||||
* Convert code point to cesu-8 representation
|
||||
*
|
||||
* @return bytes count, stored required to represent specified code unit
|
||||
*/
|
||||
lit_utf8_size_t
|
||||
lit_code_point_to_cesu8 (lit_code_point_t code_point, /**< code point */
|
||||
lit_utf8_byte_t *buf) /**< buffer where to store the result,
|
||||
* its size should be at least 6 bytes */
|
||||
{
|
||||
if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
|
||||
{
|
||||
return lit_code_unit_to_utf8 ((ecma_char_t) code_point, buf);
|
||||
}
|
||||
else
|
||||
{
|
||||
lit_utf8_size_t offset = lit_code_unit_to_utf8 (convert_code_point_to_high_surrogate (code_point), buf);
|
||||
offset += lit_code_unit_to_utf8 (convert_code_point_to_low_surrogate (code_point), buf + offset);
|
||||
return offset;
|
||||
}
|
||||
} /* lit_code_point_to_utf8 */
|
||||
|
||||
/**
|
||||
* Convert code point to utf-8 representation
|
||||
*
|
||||
@ -799,7 +1076,7 @@ lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high
|
||||
} /* lit_surrogate_pair_to_code_point */
|
||||
|
||||
/**
|
||||
* Compare utf-8 string to utf-8 string
|
||||
* Compare cesu-8 string to cesu-8 string
|
||||
*
|
||||
* @return true - if strings are equal;
|
||||
* false - otherwise.
|
||||
@ -819,7 +1096,7 @@ lit_compare_utf8_strings (const lit_utf8_byte_t *string1_p, /**< utf-8 string */
|
||||
} /* lit_compare_utf8_strings */
|
||||
|
||||
/**
|
||||
* Relational compare of utf-8 strings
|
||||
* Relational compare of cesu-8 strings
|
||||
*
|
||||
* First string is less than second string if:
|
||||
* - strings are not equal;
|
||||
@ -833,25 +1110,28 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**<
|
||||
const lit_utf8_byte_t *string2_p, /**< utf-8 string */
|
||||
lit_utf8_size_t string2_size) /**< string size */
|
||||
{
|
||||
lit_utf8_iterator_t iter1 = lit_utf8_iterator_create (string1_p, string1_size);
|
||||
lit_utf8_iterator_t iter2 = lit_utf8_iterator_create (string2_p, string2_size);
|
||||
lit_utf8_byte_t *string1_pos = (lit_utf8_byte_t *) string1_p;
|
||||
lit_utf8_byte_t *string2_pos = (lit_utf8_byte_t *) string2_p;
|
||||
const lit_utf8_byte_t *string1_end_p = string1_p + string1_size;
|
||||
const lit_utf8_byte_t *string2_end_p = string2_p + string2_size;
|
||||
|
||||
while (!lit_utf8_iterator_is_eos (&iter1)
|
||||
&& !lit_utf8_iterator_is_eos (&iter2))
|
||||
while (string1_pos < string1_end_p && string2_pos < string2_end_p)
|
||||
{
|
||||
ecma_char_t code_point1 = lit_utf8_iterator_read_next (&iter1);
|
||||
ecma_char_t code_point2 = lit_utf8_iterator_read_next (&iter2);
|
||||
if (code_point1 < code_point2)
|
||||
ecma_char_t ch1, ch2;
|
||||
string1_pos += lit_read_code_unit_from_utf8 (string1_pos, &ch1);
|
||||
string2_pos += lit_read_code_unit_from_utf8 (string2_pos, &ch2);
|
||||
|
||||
if (ch1 < ch2)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
else if (code_point1 > code_point2)
|
||||
else if (ch1 > ch2)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return (lit_utf8_iterator_is_eos (&iter1) && !lit_utf8_iterator_is_eos (&iter2));
|
||||
return (string1_pos >= string1_end_p && string2_pos < string2_end_p);
|
||||
} /* lit_compare_utf8_strings_relational */
|
||||
|
||||
/**
|
||||
|
||||
@ -123,6 +123,7 @@ int32_t lit_utf8_iterator_pos_cmp (lit_utf8_iterator_pos_t, lit_utf8_iterator_po
|
||||
|
||||
/* validation */
|
||||
bool lit_is_utf8_string_valid (const lit_utf8_byte_t *, lit_utf8_size_t);
|
||||
bool lit_is_cesu8_string_valid (const lit_utf8_byte_t *, lit_utf8_size_t);
|
||||
|
||||
/* checks */
|
||||
bool lit_is_code_unit_low_surrogate (ecma_char_t);
|
||||
@ -169,17 +170,31 @@ lit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (lit_utf8_byte_t);
|
||||
/* conversion */
|
||||
lit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t, lit_utf8_byte_t *);
|
||||
lit_utf8_size_t lit_code_point_to_utf8 (lit_code_point_t, lit_utf8_byte_t *);
|
||||
lit_utf8_size_t lit_code_point_to_cesu8 (lit_code_point_t, lit_utf8_byte_t *);
|
||||
lit_code_point_t lit_convert_surrogate_pair_to_code_point (ecma_char_t, ecma_char_t);
|
||||
|
||||
/* comparison */
|
||||
bool lit_compare_utf8_strings (const lit_utf8_byte_t *, lit_utf8_size_t, const lit_utf8_byte_t *, lit_utf8_size_t);
|
||||
bool lit_compare_utf8_strings (const lit_utf8_byte_t *, lit_utf8_size_t,
|
||||
const lit_utf8_byte_t *, lit_utf8_size_t);
|
||||
|
||||
bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *, lit_utf8_size_t,
|
||||
const lit_utf8_byte_t *, lit_utf8_size_t);
|
||||
bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, lit_utf8_size_t,
|
||||
const lit_utf8_byte_t *string2_p, lit_utf8_size_t);
|
||||
|
||||
/* read code point from buffer */
|
||||
lit_utf8_size_t lit_read_code_point_from_utf8 (const lit_utf8_byte_t *, lit_utf8_size_t, lit_code_point_t *);
|
||||
|
||||
lit_utf8_size_t lit_read_code_unit_from_utf8 (const lit_utf8_byte_t *,
|
||||
ecma_char_t *);
|
||||
|
||||
lit_utf8_size_t lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *,
|
||||
ecma_char_t *);
|
||||
|
||||
ecma_char_t lit_utf8_read_next (lit_utf8_byte_t **);
|
||||
ecma_char_t lit_utf8_read_prev (lit_utf8_byte_t **);
|
||||
ecma_char_t lit_utf8_peek_next (lit_utf8_byte_t *);
|
||||
ecma_char_t lit_utf8_peek_prev (lit_utf8_byte_t *);
|
||||
void lit_utf8_incr (lit_utf8_byte_t **);
|
||||
void lit_utf8_decr (lit_utf8_byte_t **);
|
||||
|
||||
/* print */
|
||||
void lit_put_ecma_char (ecma_char_t);
|
||||
|
||||
|
||||
@ -160,17 +160,61 @@ lexer_create_token_for_charset (token_type tt, /**< token type */
|
||||
{
|
||||
JERRY_ASSERT (charset_p != NULL);
|
||||
|
||||
literal_t lit = lit_find_literal_by_utf8_string (charset_p, size);
|
||||
if (lit != NULL)
|
||||
lit_utf8_iterator_t iter = lit_utf8_iterator_create (charset_p, (lit_utf8_size_t) size);
|
||||
lit_utf8_size_t new_size = 0;
|
||||
lit_utf8_size_t new_length = 0;
|
||||
bool should_convert = false;
|
||||
|
||||
while (!lit_utf8_iterator_is_eos (&iter))
|
||||
{
|
||||
return create_token_from_lit (tt, lit);
|
||||
if (iter.buf_pos.is_non_bmp_middle)
|
||||
{
|
||||
should_convert = true;
|
||||
}
|
||||
lit_utf8_iterator_incr (&iter);
|
||||
new_size += LIT_CESU8_MAX_BYTES_IN_CODE_UNIT;
|
||||
}
|
||||
|
||||
lit = lit_create_literal_from_utf8_string (charset_p, size);
|
||||
lit_utf8_byte_t *converted_str_p;
|
||||
|
||||
if (should_convert)
|
||||
{
|
||||
lit_utf8_iterator_seek_bos (&iter);
|
||||
converted_str_p = (lit_utf8_byte_t *) jsp_mm_alloc (new_size);
|
||||
|
||||
while (!lit_utf8_iterator_is_eos (&iter))
|
||||
{
|
||||
ecma_char_t ch = lit_utf8_iterator_read_next (&iter);
|
||||
new_length += lit_code_unit_to_utf8 (ch, converted_str_p + new_length);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
converted_str_p = (lit_utf8_byte_t *) charset_p;
|
||||
new_length = size;
|
||||
JERRY_ASSERT (lit_is_cesu8_string_valid (converted_str_p, new_length));
|
||||
}
|
||||
|
||||
literal_t lit = lit_find_literal_by_utf8_string (converted_str_p, new_length);
|
||||
if (lit != NULL)
|
||||
{
|
||||
if (should_convert)
|
||||
{
|
||||
jsp_mm_free (converted_str_p);
|
||||
}
|
||||
|
||||
return create_token_from_lit (tt, lit);
|
||||
}
|
||||
lit = lit_create_literal_from_utf8_string (converted_str_p, new_length);
|
||||
JERRY_ASSERT (lit->get_type () == LIT_STR_T
|
||||
|| lit->get_type () == LIT_MAGIC_STR_T
|
||||
|| lit->get_type () == LIT_MAGIC_STR_EX_T);
|
||||
|
||||
if (should_convert)
|
||||
{
|
||||
jsp_mm_free (converted_str_p);
|
||||
}
|
||||
|
||||
return create_token_from_lit (tt, lit);
|
||||
} /* lexer_create_token_for_charset */
|
||||
|
||||
@ -1550,6 +1594,7 @@ lexer_locus_to_line_and_column (lit_utf8_iterator_pos_t locus, /**< iterator pos
|
||||
size_t *column) /**< @out: column number */
|
||||
{
|
||||
JERRY_ASSERT ((lit_utf8_size_t) (locus.offset + locus.is_non_bmp_middle) <= buffer_size);
|
||||
|
||||
lit_utf8_iterator_t iter = lit_utf8_iterator_create (buffer_start, (lit_utf8_size_t) buffer_size);
|
||||
lit_utf8_iterator_pos_t iter_pos = lit_utf8_iterator_get_pos (&iter);
|
||||
|
||||
|
||||
@ -30,18 +30,17 @@
|
||||
|
||||
typedef enum
|
||||
{
|
||||
UTF8_ANY_SIZE,
|
||||
UTF8_ONE_BYTE,
|
||||
UTF8_TWO_BYTES,
|
||||
UTF8_THREE_BYTES,
|
||||
UTF8_FOUR_BYTES
|
||||
CESU8_ANY_SIZE,
|
||||
CESU8_ONE_BYTE,
|
||||
CESU8_TWO_BYTES,
|
||||
CESU8_THREE_BYTES,
|
||||
} utf8_char_size;
|
||||
|
||||
static lit_utf8_size_t
|
||||
generate_utf8_char (utf8_char_size char_size,
|
||||
lit_utf8_byte_t *buf)
|
||||
generate_cesu8_char (utf8_char_size char_size,
|
||||
lit_utf8_byte_t *buf)
|
||||
{
|
||||
JERRY_ASSERT (char_size >= 0 && char_size <= LIT_UTF8_MAX_BYTES_IN_CODE_POINT);
|
||||
JERRY_ASSERT (char_size >= 0 && char_size <= LIT_CESU8_MAX_BYTES_IN_CODE_UNIT);
|
||||
lit_code_point_t code_point = (lit_code_point_t) rand ();
|
||||
|
||||
if (char_size == 1)
|
||||
@ -58,14 +57,9 @@ generate_utf8_char (utf8_char_size char_size,
|
||||
code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN + code_point % (LIT_UTF8_3_BYTE_CODE_POINT_MAX -
|
||||
LIT_UTF8_3_BYTE_CODE_POINT_MIN);
|
||||
}
|
||||
else if (char_size == 4)
|
||||
{
|
||||
code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN + code_point % (LIT_UTF8_4_BYTE_CODE_POINT_MAX -
|
||||
LIT_UTF8_4_BYTE_CODE_POINT_MIN);
|
||||
}
|
||||
else
|
||||
{
|
||||
code_point %= LIT_UTF8_4_BYTE_CODE_POINT_MAX;
|
||||
code_point %= LIT_UTF8_3_BYTE_CODE_POINT_MAX;
|
||||
}
|
||||
|
||||
if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN
|
||||
@ -74,29 +68,29 @@ generate_utf8_char (utf8_char_size char_size,
|
||||
code_point = LIT_UTF16_HIGH_SURROGATE_MIN - 1;
|
||||
}
|
||||
|
||||
return lit_code_point_to_utf8 (code_point, buf);
|
||||
return lit_code_unit_to_utf8 ((ecma_char_t) code_point, buf);
|
||||
}
|
||||
|
||||
static ecma_length_t
|
||||
generate_utf8_string (lit_utf8_byte_t *buf_p,
|
||||
lit_utf8_size_t buf_size)
|
||||
generate_cesu8_string (lit_utf8_byte_t *buf_p,
|
||||
lit_utf8_size_t buf_size)
|
||||
{
|
||||
ecma_length_t length = 0;
|
||||
|
||||
lit_utf8_size_t size = 0;
|
||||
while (size < buf_size)
|
||||
{
|
||||
const utf8_char_size char_size = (((buf_size - size) > LIT_UTF8_MAX_BYTES_IN_CODE_POINT)
|
||||
? UTF8_ANY_SIZE
|
||||
const utf8_char_size char_size = (((buf_size - size) > LIT_CESU8_MAX_BYTES_IN_CODE_UNIT)
|
||||
? CESU8_ANY_SIZE
|
||||
: (utf8_char_size) (buf_size - size));
|
||||
|
||||
lit_utf8_size_t bytes_generated = generate_utf8_char (char_size, buf_p);
|
||||
lit_utf8_size_t bytes_generated = generate_cesu8_char (char_size, buf_p);
|
||||
|
||||
JERRY_ASSERT (lit_is_utf8_string_valid (buf_p, bytes_generated));
|
||||
JERRY_ASSERT (lit_is_cesu8_string_valid (buf_p, bytes_generated));
|
||||
|
||||
size += bytes_generated;
|
||||
buf_p += bytes_generated;
|
||||
length += (bytes_generated == LIT_UTF8_MAX_BYTES_IN_CODE_POINT) ? 2 : 1;
|
||||
length++;
|
||||
}
|
||||
|
||||
JERRY_ASSERT (size == buf_size);
|
||||
@ -120,7 +114,7 @@ main (int __attr_unused___ argc,
|
||||
for (int i = 0; i < test_iters; i++)
|
||||
{
|
||||
lit_utf8_size_t utf8_string_size = (i == 0) ? 0 : (lit_utf8_size_t) (rand () % max_bytes_in_string);
|
||||
ecma_length_t length = generate_utf8_string (utf8_string, utf8_string_size);
|
||||
ecma_length_t length = generate_cesu8_string (utf8_string, utf8_string_size);
|
||||
|
||||
JERRY_ASSERT (lit_utf8_string_length (utf8_string, utf8_string_size) == length);
|
||||
|
||||
@ -183,29 +177,19 @@ main (int __attr_unused___ argc,
|
||||
|
||||
/* Overlong-encoded code point */
|
||||
lit_utf8_byte_t invalid_utf8_string_1[] = {0xC0, 0x82};
|
||||
JERRY_ASSERT (!lit_is_utf8_string_valid (invalid_utf8_string_1, sizeof (invalid_utf8_string_1)));
|
||||
JERRY_ASSERT (!lit_is_cesu8_string_valid (invalid_utf8_string_1, sizeof (invalid_utf8_string_1)));
|
||||
|
||||
/* Overlong-encoded code point */
|
||||
lit_utf8_byte_t invalid_utf8_string_2[] = {0xE0, 0x80, 0x81};
|
||||
JERRY_ASSERT (!lit_is_utf8_string_valid (invalid_utf8_string_2, sizeof (invalid_utf8_string_2)));
|
||||
JERRY_ASSERT (!lit_is_cesu8_string_valid (invalid_utf8_string_2, sizeof (invalid_utf8_string_2)));
|
||||
|
||||
/* Pair of surrogates: 0xD901 0xDFF0 which encode Unicode character 0x507F0 */
|
||||
lit_utf8_byte_t invalid_utf8_string_3[] = {0xED, 0xA4, 0x81, 0xED, 0xBF, 0xB0};
|
||||
JERRY_ASSERT (!lit_is_utf8_string_valid (invalid_utf8_string_3, sizeof (invalid_utf8_string_3)));
|
||||
JERRY_ASSERT (lit_is_cesu8_string_valid (invalid_utf8_string_3, sizeof (invalid_utf8_string_3)));
|
||||
|
||||
/* Isolated high surrogate 0xD901 */
|
||||
lit_utf8_byte_t valid_utf8_string_1[] = {0xED, 0xA4, 0x81};
|
||||
JERRY_ASSERT (lit_is_utf8_string_valid (valid_utf8_string_1, sizeof (valid_utf8_string_1)));
|
||||
|
||||
/* 4-byte long utf-8 character - Unicode character 0x507F0 */
|
||||
lit_utf8_byte_t valid_utf8_string_2[] = {0xF1, 0x90, 0x9F, 0xB0};
|
||||
JERRY_ASSERT (lit_is_utf8_string_valid (valid_utf8_string_2, sizeof (valid_utf8_string_2)));
|
||||
|
||||
lit_utf8_byte_t buf[] = {0xF0, 0x90, 0x8D, 0x88};
|
||||
lit_code_point_t code_point;
|
||||
lit_utf8_size_t bytes_count = lit_read_code_point_from_utf8 (buf, sizeof (buf), &code_point);
|
||||
JERRY_ASSERT (bytes_count == 4);
|
||||
JERRY_ASSERT (code_point == 0x10348);
|
||||
JERRY_ASSERT (lit_is_cesu8_string_valid (valid_utf8_string_1, sizeof (valid_utf8_string_1)));
|
||||
|
||||
lit_utf8_byte_t res_buf[3];
|
||||
lit_utf8_size_t res_size;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user