mirror of
https://github.com/jerryscript-project/jerryscript.git
synced 2025-12-15 16:29:21 +00:00
Add functions for iterating utf-8 strings.
JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
This commit is contained in:
parent
0787d76b62
commit
ae3eea8ae8
@ -880,10 +880,10 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
|
|||||||
|
|
||||||
lit_utf8_iterator_t iter = lit_utf8_iterator_create (input_start_p, input_size);
|
lit_utf8_iterator_t iter = lit_utf8_iterator_create (input_start_p, input_size);
|
||||||
lit_utf8_size_t output_length = 1;
|
lit_utf8_size_t output_length = 1;
|
||||||
while (!lit_utf8_iterator_reached_buffer_end (&iter))
|
while (!lit_utf8_iterator_is_eos (&iter))
|
||||||
{
|
{
|
||||||
/* Input validation. */
|
/* Input validation. */
|
||||||
lit_code_point_t character = lit_utf8_iterator_read_code_unit_and_increment (&iter);
|
lit_code_point_t character = lit_utf8_iterator_read_next (&iter);
|
||||||
|
|
||||||
if (character <= 0x7f)
|
if (character <= 0x7f)
|
||||||
{
|
{
|
||||||
@ -931,10 +931,10 @@ ecma_builtin_global_object_encode_uri_helper (ecma_value_t uri, /**< uri argumen
|
|||||||
|
|
||||||
lit_utf8_iterator_t iter = lit_utf8_iterator_create (input_start_p, input_size);
|
lit_utf8_iterator_t iter = lit_utf8_iterator_create (input_start_p, input_size);
|
||||||
lit_utf8_byte_t *output_char_p = output_start_p;
|
lit_utf8_byte_t *output_char_p = output_start_p;
|
||||||
while (!lit_utf8_iterator_reached_buffer_end (&iter))
|
while (!lit_utf8_iterator_is_eos (&iter))
|
||||||
{
|
{
|
||||||
/* Input decode. */
|
/* Input decode. */
|
||||||
lit_code_point_t character = lit_utf8_iterator_read_code_unit_and_increment (&iter);
|
lit_code_point_t character = lit_utf8_iterator_read_next (&iter);
|
||||||
|
|
||||||
if (character <= 0x7f)
|
if (character <= 0x7f)
|
||||||
{
|
{
|
||||||
|
|||||||
@ -88,6 +88,11 @@ typedef ecma_char_t *ecma_char_ptr_t;
|
|||||||
*/
|
*/
|
||||||
#define LIT_UTF8_MAX_BYTES_IN_CODE_UNIT (3)
|
#define LIT_UTF8_MAX_BYTES_IN_CODE_UNIT (3)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Max bytes needed to represent a code point (Unicode character) via utf-8 encoding
|
||||||
|
*/
|
||||||
|
#define LIT_UTF8_MAX_BYTES_IN_CODE_POINT (4)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A byte of utf-8 string
|
* A byte of utf-8 string
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -17,54 +17,6 @@
|
|||||||
|
|
||||||
#include "jrt-libc-includes.h"
|
#include "jrt-libc-includes.h"
|
||||||
|
|
||||||
/**
|
|
||||||
* For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The
|
|
||||||
* Unicode Standard (http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404, tables 3-6, 3-7).
|
|
||||||
*/
|
|
||||||
#define LIT_UNICODE_CODE_POINT_NULL (0x0)
|
|
||||||
#define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF)
|
|
||||||
|
|
||||||
#define LIT_UTF16_CODE_UNIT_MAX (0xFFFF)
|
|
||||||
#define LIT_UTF16_FIRST_SURROGATE_CODE_POINT (0x10000)
|
|
||||||
#define LIT_UTF16_LOW_SURROGATE_MARKER (0xDC00)
|
|
||||||
#define LIT_UTF16_HIGH_SURROGATE_MARKER (0xD800)
|
|
||||||
#define LIT_UTF16_HIGH_SURROGATE_MIN (0xD800)
|
|
||||||
#define LIT_UTF16_HIGH_SURROGATE_MAX (0xDBFF)
|
|
||||||
#define LIT_UTF16_LOW_SURROGATE_MIN (0xDC00)
|
|
||||||
#define LIT_UTF16_LOW_SURROGATE_MAX (0xDFFF)
|
|
||||||
#define LIT_UTF16_BITS_IN_SURROGATE (10)
|
|
||||||
#define LIT_UTF16_LAST_10_BITS_MASK (0x3FF)
|
|
||||||
|
|
||||||
#define LIT_UTF8_1_BYTE_MARKER (0x00)
|
|
||||||
#define LIT_UTF8_2_BYTE_MARKER (0xC0)
|
|
||||||
#define LIT_UTF8_3_BYTE_MARKER (0xE0)
|
|
||||||
#define LIT_UTF8_4_BYTE_MARKER (0xF0)
|
|
||||||
#define LIT_UTF8_EXTRA_BYTE_MARKER (0x80)
|
|
||||||
|
|
||||||
#define LIT_UTF8_1_BYTE_MASK (0x80)
|
|
||||||
#define LIT_UTF8_2_BYTE_MASK (0xE0)
|
|
||||||
#define LIT_UTF8_3_BYTE_MASK (0xF0)
|
|
||||||
#define LIT_UTF8_4_BYTE_MASK (0xF8)
|
|
||||||
#define LIT_UTF8_EXTRA_BYTE_MASK (0xC0)
|
|
||||||
|
|
||||||
#define LIT_UTF8_LAST_7_BITS_MASK (0x7F)
|
|
||||||
#define LIT_UTF8_LAST_6_BITS_MASK (0x3F)
|
|
||||||
#define LIT_UTF8_LAST_5_BITS_MASK (0x1F)
|
|
||||||
#define LIT_UTF8_LAST_4_BITS_MASK (0x0F)
|
|
||||||
#define LIT_UTF8_LAST_3_BITS_MASK (0x07)
|
|
||||||
#define LIT_UTF8_LAST_2_BITS_MASK (0x03)
|
|
||||||
#define LIT_UTF8_LAST_1_BIT_MASK (0x01)
|
|
||||||
|
|
||||||
#define LIT_UTF8_BITS_IN_EXTRA_BYTES (6)
|
|
||||||
|
|
||||||
#define LIT_UTF8_1_BYTE_CODE_POINT_MAX (0x7F)
|
|
||||||
#define LIT_UTF8_2_BYTE_CODE_POINT_MIN (0x80)
|
|
||||||
#define LIT_UTF8_2_BYTE_CODE_POINT_MAX (0x7FF)
|
|
||||||
#define LIT_UTF8_3_BYTE_CODE_POINT_MIN (0x800)
|
|
||||||
#define LIT_UTF8_3_BYTE_CODE_POINT_MAX (LIT_UTF16_CODE_UNIT_MAX)
|
|
||||||
#define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x1000)
|
|
||||||
#define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX)
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validate utf-8 string
|
* Validate utf-8 string
|
||||||
*
|
*
|
||||||
@ -175,18 +127,80 @@ lit_utf8_iterator_create (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
|
|||||||
lit_utf8_size_t buf_size) /**< string size */
|
lit_utf8_size_t buf_size) /**< string size */
|
||||||
{
|
{
|
||||||
JERRY_ASSERT (utf8_buf_p || !buf_size);
|
JERRY_ASSERT (utf8_buf_p || !buf_size);
|
||||||
|
JERRY_ASSERT (lit_is_utf8_string_valid (utf8_buf_p, buf_size));
|
||||||
|
|
||||||
lit_utf8_iterator_t buf_iter =
|
lit_utf8_iterator_t buf_iter =
|
||||||
{
|
{
|
||||||
0,
|
|
||||||
buf_size,
|
|
||||||
utf8_buf_p,
|
utf8_buf_p,
|
||||||
0,
|
buf_size,
|
||||||
|
{
|
||||||
|
0,
|
||||||
|
false
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
return buf_iter;
|
return buf_iter;
|
||||||
} /* lit_utf8_iterator_create */
|
} /* lit_utf8_iterator_create */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reset iterator to point to the beginning of a string
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
lit_utf8_iterator_seek_bos (lit_utf8_iterator_t *iter_p) /**< iterator to reset */
|
||||||
|
{
|
||||||
|
iter_p->buf_pos.offset = 0;
|
||||||
|
iter_p->buf_pos.is_non_bmp_middle = false;
|
||||||
|
} /* lit_utf8_iterator_seek_bos */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reset iterator to point to the end of a string
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
lit_utf8_iterator_seek_eos (lit_utf8_iterator_t *iter_p) /**< iterator to reset */
|
||||||
|
{
|
||||||
|
iter_p->buf_pos.offset = iter_p->buf_size & LIT_ITERATOR_OFFSET_MASK;
|
||||||
|
iter_p->buf_pos.is_non_bmp_middle = false;
|
||||||
|
} /* lit_utf8_iterator_seek_eos */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Save iterator's position to restore it later
|
||||||
|
*
|
||||||
|
* @return current position of the iterator
|
||||||
|
*/
|
||||||
|
lit_utf8_iterator_pos_t
|
||||||
|
lit_utf8_iterator_get_pos (const lit_utf8_iterator_t *iter_p)
|
||||||
|
{
|
||||||
|
return iter_p->buf_pos;
|
||||||
|
} /* lit_utf8_iterator_get_pos */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Restore previously saved position of the iterator
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
lit_utf8_iterator_seek (lit_utf8_iterator_t *iter_p, /**< utf-8 string iterator */
|
||||||
|
lit_utf8_iterator_pos_t iter_pos) /**< position to restore */
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (iter_pos.offset <= iter_p->buf_size);
|
||||||
|
#ifndef JERRY_NDEBUG
|
||||||
|
lit_utf8_byte_t byte = *(iter_p->buf_p + iter_pos.offset);
|
||||||
|
JERRY_ASSERT ((byte & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER);
|
||||||
|
JERRY_ASSERT (!iter_pos.is_non_bmp_middle || ((byte & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
iter_p->buf_pos = iter_pos;
|
||||||
|
} /* lit_utf8_iterator_seek */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get offset (in code units) of the iterator
|
||||||
|
*
|
||||||
|
* @return current offset of the iterator in code units
|
||||||
|
*/
|
||||||
|
ecma_length_t
|
||||||
|
lit_utf8_iterator_get_index (const lit_utf8_iterator_t *iter_p)
|
||||||
|
{
|
||||||
|
return lit_utf8_string_length (iter_p->buf_p, iter_p->buf_pos.offset) + iter_p->buf_pos.is_non_bmp_middle;
|
||||||
|
} /* lit_utf8_iterator_get_index */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Represents code point (>0xFFFF) as surrogate pair and returns its lower part
|
* Represents code point (>0xFFFF) as surrogate pair and returns its lower part
|
||||||
*
|
*
|
||||||
@ -221,26 +235,71 @@ convert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code poi
|
|||||||
} /* convert_code_point_to_low_surrogate */
|
} /* convert_code_point_to_low_surrogate */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get next code unit form the iterated string and increment iterator to point to next code unit
|
* Get next code unit from the iterated string
|
||||||
*
|
*
|
||||||
* @return next code unit
|
* @return next code unit
|
||||||
*/
|
*/
|
||||||
ecma_char_t
|
ecma_char_t
|
||||||
lit_utf8_iterator_read_code_unit_and_increment (lit_utf8_iterator_t *buf_iter_p) /**< @in-out: utf-8 string iterator */
|
lit_utf8_iterator_peek_next (const lit_utf8_iterator_t *iter_p) /**< @in: utf-8 string iterator */
|
||||||
{
|
{
|
||||||
JERRY_ASSERT (!lit_utf8_iterator_reached_buffer_end (buf_iter_p));
|
JERRY_ASSERT (!lit_utf8_iterator_is_eos (iter_p));
|
||||||
|
|
||||||
if (buf_iter_p->code_point)
|
|
||||||
{
|
|
||||||
ecma_char_t code_unit = convert_code_point_to_low_surrogate (buf_iter_p->code_point);
|
|
||||||
buf_iter_p->code_point = 0;
|
|
||||||
return code_unit;
|
|
||||||
}
|
|
||||||
|
|
||||||
lit_code_point_t code_point;
|
lit_code_point_t code_point;
|
||||||
buf_iter_p->buf_offset += lit_read_code_point_from_utf8 (buf_iter_p->buf_p + buf_iter_p->buf_offset,
|
lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
|
||||||
buf_iter_p->buf_size - buf_iter_p->buf_offset,
|
iter_p->buf_size - iter_p->buf_pos.offset,
|
||||||
&code_point);
|
&code_point);
|
||||||
|
|
||||||
|
if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (!iter_p->buf_pos.is_non_bmp_middle);
|
||||||
|
return (ecma_char_t) code_point;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (iter_p->buf_pos.is_non_bmp_middle)
|
||||||
|
{
|
||||||
|
return convert_code_point_to_low_surrogate (code_point);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return convert_code_point_to_high_surrogate (code_point);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} /* lit_utf8_iterator_peek_next */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get previous code unit from the iterated string
|
||||||
|
*
|
||||||
|
* @return previous code unit
|
||||||
|
*/
|
||||||
|
ecma_char_t
|
||||||
|
lit_utf8_iterator_peek_prev (const lit_utf8_iterator_t *iter_p) /**< @in: utf-8 string iterator */
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (!lit_utf8_iterator_is_bos (iter_p));
|
||||||
|
|
||||||
|
lit_code_point_t code_point;
|
||||||
|
lit_utf8_size_t offset = iter_p->buf_pos.offset;
|
||||||
|
|
||||||
|
if (iter_p->buf_pos.is_non_bmp_middle)
|
||||||
|
{
|
||||||
|
lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
|
||||||
|
iter_p->buf_size - iter_p->buf_pos.offset,
|
||||||
|
&code_point);
|
||||||
|
return convert_code_point_to_high_surrogate (code_point);
|
||||||
|
}
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (offset != 0);
|
||||||
|
offset--;
|
||||||
|
}
|
||||||
|
while ((iter_p->buf_p[offset] & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
|
||||||
|
|
||||||
|
JERRY_ASSERT (iter_p->buf_pos.offset - offset <= LIT_UTF8_MAX_BYTES_IN_CODE_POINT);
|
||||||
|
|
||||||
|
lit_read_code_point_from_utf8 (iter_p->buf_p + offset,
|
||||||
|
iter_p->buf_size - offset,
|
||||||
|
&code_point);
|
||||||
|
|
||||||
if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
|
if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
|
||||||
{
|
{
|
||||||
@ -248,32 +307,153 @@ lit_utf8_iterator_read_code_unit_and_increment (lit_utf8_iterator_t *buf_iter_p)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
buf_iter_p->code_point = code_point;
|
return convert_code_point_to_low_surrogate (code_point);
|
||||||
|
}
|
||||||
|
} /* lit_utf8_iterator_peek_prev */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Increment iterator to point to next code unit
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
lit_utf8_iterator_incr (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */
|
||||||
|
{
|
||||||
|
lit_utf8_iterator_read_next (iter_p);
|
||||||
|
} /* lit_utf8_iterator_incr */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Decrement iterator to point to previous code unit
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
lit_utf8_iterator_decr (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */
|
||||||
|
{
|
||||||
|
lit_utf8_iterator_read_prev (iter_p);
|
||||||
|
} /* lit_utf8_iterator_decr */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Skip specified number of code units
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
lit_utf8_iterator_advance (lit_utf8_iterator_t *iter_p, /**< in-out: iterator */
|
||||||
|
ecma_length_t chars_count) /**< number of code units to skip */
|
||||||
|
{
|
||||||
|
while (chars_count--)
|
||||||
|
{
|
||||||
|
lit_utf8_iterator_incr (iter_p);
|
||||||
|
}
|
||||||
|
} /* lit_utf8_iterator_advance */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get next code unit from the iterated string and increment iterator to point to next code unit
|
||||||
|
*
|
||||||
|
* @return next code unit
|
||||||
|
*/
|
||||||
|
ecma_char_t
|
||||||
|
lit_utf8_iterator_read_next (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (!lit_utf8_iterator_is_eos (iter_p));
|
||||||
|
|
||||||
|
lit_code_point_t code_point;
|
||||||
|
lit_utf8_size_t utf8_char_size = lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
|
||||||
|
iter_p->buf_size - iter_p->buf_pos.offset,
|
||||||
|
&code_point);
|
||||||
|
|
||||||
|
if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (!iter_p->buf_pos.is_non_bmp_middle);
|
||||||
|
iter_p->buf_pos.offset = (iter_p->buf_pos.offset + utf8_char_size) & LIT_ITERATOR_OFFSET_MASK;
|
||||||
|
return (ecma_char_t) code_point;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (iter_p->buf_pos.is_non_bmp_middle)
|
||||||
|
{
|
||||||
|
iter_p->buf_pos.offset = (iter_p->buf_pos.offset + utf8_char_size) & LIT_ITERATOR_OFFSET_MASK;
|
||||||
|
iter_p->buf_pos.is_non_bmp_middle = false;
|
||||||
|
return convert_code_point_to_low_surrogate (code_point);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
iter_p->buf_pos.is_non_bmp_middle = true;
|
||||||
|
return convert_code_point_to_high_surrogate (code_point);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} /* lit_utf8_iterator_read_next */
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get previous code unit from the iterated string and decrement iterator to point to previous code unit
|
||||||
|
*
|
||||||
|
* @return previous code unit
|
||||||
|
*/
|
||||||
|
ecma_char_t
|
||||||
|
lit_utf8_iterator_read_prev (lit_utf8_iterator_t *iter_p) /**< @in-out: utf-8 string iterator */
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (!lit_utf8_iterator_is_bos (iter_p));
|
||||||
|
|
||||||
|
lit_code_point_t code_point;
|
||||||
|
lit_utf8_size_t offset = iter_p->buf_pos.offset;
|
||||||
|
|
||||||
|
if (iter_p->buf_pos.is_non_bmp_middle)
|
||||||
|
{
|
||||||
|
lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
|
||||||
|
iter_p->buf_size - iter_p->buf_pos.offset,
|
||||||
|
&code_point);
|
||||||
|
|
||||||
|
iter_p->buf_pos.is_non_bmp_middle = false;
|
||||||
|
|
||||||
return convert_code_point_to_high_surrogate (code_point);
|
return convert_code_point_to_high_surrogate (code_point);
|
||||||
}
|
}
|
||||||
|
|
||||||
JERRY_ASSERT (false);
|
do
|
||||||
return LIT_CHAR_NULL;
|
{
|
||||||
} /* lit_utf8_iterator_read_code_unit_and_increment */
|
JERRY_ASSERT (offset != 0);
|
||||||
|
offset--;
|
||||||
|
}
|
||||||
|
while ((iter_p->buf_p[offset] & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
|
||||||
|
|
||||||
|
JERRY_ASSERT (iter_p->buf_pos.offset - offset <= LIT_UTF8_MAX_BYTES_IN_CODE_POINT);
|
||||||
|
|
||||||
|
iter_p->buf_pos.offset = (offset) & LIT_ITERATOR_OFFSET_MASK;
|
||||||
|
lit_read_code_point_from_utf8 (iter_p->buf_p + iter_p->buf_pos.offset,
|
||||||
|
iter_p->buf_size - iter_p->buf_pos.offset,
|
||||||
|
&code_point);
|
||||||
|
|
||||||
|
if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
|
||||||
|
{
|
||||||
|
return (ecma_char_t) code_point;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
iter_p->buf_pos.is_non_bmp_middle = true;
|
||||||
|
|
||||||
|
return convert_code_point_to_low_surrogate (code_point);
|
||||||
|
}
|
||||||
|
} /* lit_utf8_iterator_read_prev */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks iterator reached end of the string
|
* Checks iterator reached end of the string
|
||||||
*
|
*
|
||||||
* @return true - the whole string was iterated
|
* @return true - iterator is at the end of string
|
||||||
* false - otherwise
|
* false - otherwise
|
||||||
*/
|
*/
|
||||||
bool
|
bool
|
||||||
lit_utf8_iterator_reached_buffer_end (const lit_utf8_iterator_t *buf_iter_p) /**< utf-8 string iterator */
|
lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *iter_p) /**< utf-8 string iterator */
|
||||||
{
|
{
|
||||||
JERRY_ASSERT (buf_iter_p->buf_offset <= buf_iter_p->buf_size);
|
JERRY_ASSERT (iter_p->buf_pos.offset <= iter_p->buf_size);
|
||||||
|
|
||||||
if (buf_iter_p->code_point == LIT_UNICODE_CODE_POINT_NULL && buf_iter_p->buf_offset == buf_iter_p->buf_size)
|
return (iter_p->buf_pos.offset == iter_p->buf_size);
|
||||||
{
|
} /* lit_utf8_iterator_is_eos */
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
/**
|
||||||
} /* lit_utf8_iterator_reached_buffer_end */
|
* Checks iterator reached beginning of the string
|
||||||
|
*
|
||||||
|
* @return true - iterator is at the beginning of a string
|
||||||
|
* false - otherwise
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *iter_p)
|
||||||
|
{
|
||||||
|
return (iter_p->buf_pos.offset == 0 && iter_p->buf_pos.is_non_bmp_middle == false);
|
||||||
|
} /* lit_utf8_iterator_is_bos */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calculate size of a zero-terminated utf-8 string
|
* Calculate size of a zero-terminated utf-8 string
|
||||||
@ -300,12 +480,12 @@ lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
|
|||||||
{
|
{
|
||||||
ecma_length_t length = 0;
|
ecma_length_t length = 0;
|
||||||
lit_utf8_iterator_t buf_iter = lit_utf8_iterator_create (utf8_buf_p, utf8_buf_size);
|
lit_utf8_iterator_t buf_iter = lit_utf8_iterator_create (utf8_buf_p, utf8_buf_size);
|
||||||
while (!lit_utf8_iterator_reached_buffer_end (&buf_iter))
|
while (!lit_utf8_iterator_is_eos (&buf_iter))
|
||||||
{
|
{
|
||||||
lit_utf8_iterator_read_code_unit_and_increment (&buf_iter);
|
lit_utf8_iterator_read_next (&buf_iter);
|
||||||
length++;
|
length++;
|
||||||
}
|
}
|
||||||
JERRY_ASSERT (lit_utf8_iterator_reached_buffer_end (&buf_iter));
|
JERRY_ASSERT (lit_utf8_iterator_is_eos (&buf_iter));
|
||||||
|
|
||||||
return length;
|
return length;
|
||||||
} /* lit_utf8_string_length */
|
} /* lit_utf8_string_length */
|
||||||
@ -375,13 +555,13 @@ lit_utf8_string_calc_hash_last_bytes (const lit_utf8_byte_t *utf8_buf_p, /**< ch
|
|||||||
{
|
{
|
||||||
JERRY_ASSERT (utf8_buf_p != NULL);
|
JERRY_ASSERT (utf8_buf_p != NULL);
|
||||||
|
|
||||||
lit_utf8_size_t byte1 = utf8_buf_size > 0 ? utf8_buf_p[utf8_buf_size - 1] : (lit_utf8_size_t) 0;
|
lit_utf8_byte_t byte1 = (utf8_buf_size > 0) ? utf8_buf_p[utf8_buf_size - 1] : 0;
|
||||||
lit_utf8_size_t byte2 = utf8_buf_size > 1 ? utf8_buf_p[utf8_buf_size - 2] : (lit_utf8_size_t) 0;
|
lit_utf8_byte_t byte2 = (utf8_buf_size > 1) ? utf8_buf_p[utf8_buf_size - 2] : 0;
|
||||||
|
|
||||||
lit_utf8_size_t t1 = byte1 + byte2;
|
uint32_t t1 = (uint32_t) byte1 + (uint32_t) byte2;
|
||||||
lit_utf8_size_t t2 = t1 * 0x24418b66;
|
uint32_t t2 = t1 * 0x24418b66;
|
||||||
lit_utf8_size_t t3 = (t2 >> 16) ^ (t2 & 0xffffu);
|
uint32_t t3 = (t2 >> 16) ^ (t2 & 0xffffu);
|
||||||
lit_utf8_size_t t4 = (t3 >> 8) ^ (t3 & 0xffu);
|
uint32_t t4 = (t3 >> 8) ^ (t3 & 0xffu);
|
||||||
|
|
||||||
return (lit_string_hash_t) t4;
|
return (lit_string_hash_t) t4;
|
||||||
} /* lit_utf8_string_calc_hash_last_bytes */
|
} /* lit_utf8_string_calc_hash_last_bytes */
|
||||||
@ -404,8 +584,8 @@ lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 stri
|
|||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
JERRY_ASSERT (!lit_utf8_iterator_reached_buffer_end (&iter));
|
JERRY_ASSERT (!lit_utf8_iterator_is_eos (&iter));
|
||||||
code_unit = lit_utf8_iterator_read_code_unit_and_increment (&iter);
|
code_unit = lit_utf8_iterator_read_next (&iter);
|
||||||
}
|
}
|
||||||
while (code_unit_offset--);
|
while (code_unit_offset--);
|
||||||
|
|
||||||
@ -560,11 +740,11 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**<
|
|||||||
lit_utf8_iterator_t iter1 = lit_utf8_iterator_create (string1_p, string1_size);
|
lit_utf8_iterator_t iter1 = lit_utf8_iterator_create (string1_p, string1_size);
|
||||||
lit_utf8_iterator_t iter2 = lit_utf8_iterator_create (string2_p, string2_size);
|
lit_utf8_iterator_t iter2 = lit_utf8_iterator_create (string2_p, string2_size);
|
||||||
|
|
||||||
while (!lit_utf8_iterator_reached_buffer_end (&iter1)
|
while (!lit_utf8_iterator_is_eos (&iter1)
|
||||||
&& !lit_utf8_iterator_reached_buffer_end (&iter2))
|
&& !lit_utf8_iterator_is_eos (&iter2))
|
||||||
{
|
{
|
||||||
ecma_char_t code_point1 = lit_utf8_iterator_read_code_unit_and_increment (&iter1);
|
ecma_char_t code_point1 = lit_utf8_iterator_read_next (&iter1);
|
||||||
ecma_char_t code_point2 = lit_utf8_iterator_read_code_unit_and_increment (&iter2);
|
ecma_char_t code_point2 = lit_utf8_iterator_read_next (&iter2);
|
||||||
if (code_point1 < code_point2)
|
if (code_point1 < code_point2)
|
||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
@ -575,5 +755,5 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**<
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return (lit_utf8_iterator_reached_buffer_end (&iter1) && !lit_utf8_iterator_reached_buffer_end (&iter2));
|
return (lit_utf8_iterator_is_eos (&iter1) && !lit_utf8_iterator_is_eos (&iter2));
|
||||||
} /* lit_compare_utf8_strings_relational */
|
} /* lit_compare_utf8_strings_relational */
|
||||||
|
|||||||
@ -25,16 +25,82 @@
|
|||||||
*/
|
*/
|
||||||
#define LIT_BYTE_NULL (0)
|
#define LIT_BYTE_NULL (0)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The
|
||||||
|
* Unicode Standard (http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404, tables 3-6, 3-7).
|
||||||
|
*/
|
||||||
|
#define LIT_UNICODE_CODE_POINT_NULL (0x0)
|
||||||
|
#define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF)
|
||||||
|
|
||||||
|
#define LIT_UTF16_CODE_UNIT_MAX (0xFFFF)
|
||||||
|
#define LIT_UTF16_FIRST_SURROGATE_CODE_POINT (0x10000)
|
||||||
|
#define LIT_UTF16_LOW_SURROGATE_MARKER (0xDC00)
|
||||||
|
#define LIT_UTF16_HIGH_SURROGATE_MARKER (0xD800)
|
||||||
|
#define LIT_UTF16_HIGH_SURROGATE_MIN (0xD800)
|
||||||
|
#define LIT_UTF16_HIGH_SURROGATE_MAX (0xDBFF)
|
||||||
|
#define LIT_UTF16_LOW_SURROGATE_MIN (0xDC00)
|
||||||
|
#define LIT_UTF16_LOW_SURROGATE_MAX (0xDFFF)
|
||||||
|
#define LIT_UTF16_BITS_IN_SURROGATE (10)
|
||||||
|
#define LIT_UTF16_LAST_10_BITS_MASK (0x3FF)
|
||||||
|
|
||||||
|
#define LIT_UTF8_1_BYTE_MARKER (0x00)
|
||||||
|
#define LIT_UTF8_2_BYTE_MARKER (0xC0)
|
||||||
|
#define LIT_UTF8_3_BYTE_MARKER (0xE0)
|
||||||
|
#define LIT_UTF8_4_BYTE_MARKER (0xF0)
|
||||||
|
#define LIT_UTF8_EXTRA_BYTE_MARKER (0x80)
|
||||||
|
|
||||||
|
#define LIT_UTF8_1_BYTE_MASK (0x80)
|
||||||
|
#define LIT_UTF8_2_BYTE_MASK (0xE0)
|
||||||
|
#define LIT_UTF8_3_BYTE_MASK (0xF0)
|
||||||
|
#define LIT_UTF8_4_BYTE_MASK (0xF8)
|
||||||
|
#define LIT_UTF8_EXTRA_BYTE_MASK (0xC0)
|
||||||
|
|
||||||
|
#define LIT_UTF8_LAST_7_BITS_MASK (0x7F)
|
||||||
|
#define LIT_UTF8_LAST_6_BITS_MASK (0x3F)
|
||||||
|
#define LIT_UTF8_LAST_5_BITS_MASK (0x1F)
|
||||||
|
#define LIT_UTF8_LAST_4_BITS_MASK (0x0F)
|
||||||
|
#define LIT_UTF8_LAST_3_BITS_MASK (0x07)
|
||||||
|
#define LIT_UTF8_LAST_2_BITS_MASK (0x03)
|
||||||
|
#define LIT_UTF8_LAST_1_BIT_MASK (0x01)
|
||||||
|
|
||||||
|
#define LIT_UTF8_BITS_IN_EXTRA_BYTES (6)
|
||||||
|
|
||||||
|
#define LIT_UTF8_1_BYTE_CODE_POINT_MAX (0x7F)
|
||||||
|
#define LIT_UTF8_2_BYTE_CODE_POINT_MIN (0x80)
|
||||||
|
#define LIT_UTF8_2_BYTE_CODE_POINT_MAX (0x7FF)
|
||||||
|
#define LIT_UTF8_3_BYTE_CODE_POINT_MIN (0x800)
|
||||||
|
#define LIT_UTF8_3_BYTE_CODE_POINT_MAX (LIT_UTF16_CODE_UNIT_MAX)
|
||||||
|
#define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x10000)
|
||||||
|
#define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Width of the offset field in lit_utf8_iterator_pos_t structure
|
||||||
|
*/
|
||||||
|
#define LIT_ITERATOR_OFFSET_WIDTH (31)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Iterator's offset field mask
|
||||||
|
*/
|
||||||
|
#define LIT_ITERATOR_OFFSET_MASK ((1ull << LIT_ITERATOR_OFFSET_WIDTH) - 1)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Represents position of the iterator
|
||||||
|
*/
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
lit_utf8_size_t offset : LIT_ITERATOR_OFFSET_WIDTH; /** offset to utf-8 char */
|
||||||
|
bool is_non_bmp_middle: 1; /** flag indicating that current position of the iterator is the middle of
|
||||||
|
* 4-byte char */
|
||||||
|
} lit_utf8_iterator_pos_t;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Represents an iterator over utf-8 buffer
|
* Represents an iterator over utf-8 buffer
|
||||||
*/
|
*/
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
lit_utf8_size_t buf_offset; /* current offset in the buffer */
|
|
||||||
lit_utf8_size_t buf_size; /* buffer length */
|
|
||||||
const lit_utf8_byte_t *buf_p; /* buffer */
|
const lit_utf8_byte_t *buf_p; /* buffer */
|
||||||
lit_code_point_t code_point; /* code point is saved here when processed Unicode character is higher than
|
lit_utf8_size_t buf_size; /* buffer length */
|
||||||
* 0xFFFF */
|
lit_utf8_iterator_pos_t buf_pos; /* position in the buffer */
|
||||||
} lit_utf8_iterator_t;
|
} lit_utf8_iterator_t;
|
||||||
|
|
||||||
/* validation */
|
/* validation */
|
||||||
@ -42,8 +108,27 @@ bool lit_is_utf8_string_valid (const lit_utf8_byte_t *, lit_utf8_size_t);
|
|||||||
|
|
||||||
/* iteration */
|
/* iteration */
|
||||||
lit_utf8_iterator_t lit_utf8_iterator_create (const lit_utf8_byte_t *, lit_utf8_size_t);
|
lit_utf8_iterator_t lit_utf8_iterator_create (const lit_utf8_byte_t *, lit_utf8_size_t);
|
||||||
ecma_char_t lit_utf8_iterator_read_code_unit_and_increment (lit_utf8_iterator_t *);
|
|
||||||
bool lit_utf8_iterator_reached_buffer_end (const lit_utf8_iterator_t *);
|
void lit_utf8_iterator_seek_bos (lit_utf8_iterator_t *);
|
||||||
|
void lit_utf8_iterator_seek_eos (lit_utf8_iterator_t *);
|
||||||
|
|
||||||
|
lit_utf8_iterator_pos_t lit_utf8_iterator_get_pos (const lit_utf8_iterator_t *);
|
||||||
|
void lit_utf8_iterator_seek (lit_utf8_iterator_t *, lit_utf8_iterator_pos_t);
|
||||||
|
|
||||||
|
ecma_length_t lit_utf8_iterator_get_index (const lit_utf8_iterator_t *);
|
||||||
|
|
||||||
|
ecma_char_t lit_utf8_iterator_peek_next (const lit_utf8_iterator_t *);
|
||||||
|
ecma_char_t lit_utf8_iterator_peek_prev (const lit_utf8_iterator_t *);
|
||||||
|
|
||||||
|
void lit_utf8_iterator_incr (lit_utf8_iterator_t *);
|
||||||
|
void lit_utf8_iterator_decr (lit_utf8_iterator_t *);
|
||||||
|
void lit_utf8_iterator_advance (lit_utf8_iterator_t *, ecma_length_t);
|
||||||
|
|
||||||
|
ecma_char_t lit_utf8_iterator_read_next (lit_utf8_iterator_t *);
|
||||||
|
ecma_char_t lit_utf8_iterator_read_prev (lit_utf8_iterator_t *);
|
||||||
|
|
||||||
|
bool lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *);
|
||||||
|
bool lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *);
|
||||||
|
|
||||||
/* size */
|
/* size */
|
||||||
lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *);
|
lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *);
|
||||||
|
|||||||
@ -19,10 +19,91 @@
|
|||||||
#include "test-common.h"
|
#include "test-common.h"
|
||||||
|
|
||||||
// Iterations count
|
// Iterations count
|
||||||
#define test_iters 64
|
#define test_iters (1024)
|
||||||
|
|
||||||
|
// Sub iterations count
|
||||||
|
#define test_subiters (128)
|
||||||
|
|
||||||
|
// Max bytes in string
|
||||||
|
#define max_bytes_in_string (16 * 1024)
|
||||||
|
#define max_code_units_in_string (max_bytes_in_string)
|
||||||
|
|
||||||
|
typedef enum
|
||||||
|
{
|
||||||
|
UTF8_ANY_SIZE,
|
||||||
|
UTF8_ONE_BYTE,
|
||||||
|
UTF8_TWO_BYTES,
|
||||||
|
UTF8_THREE_BYTES,
|
||||||
|
UTF8_FOUR_BYTES
|
||||||
|
} utf8_char_size;
|
||||||
|
|
||||||
|
static lit_utf8_size_t
|
||||||
|
generate_utf8_char (utf8_char_size char_size,
|
||||||
|
lit_utf8_byte_t *buf)
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (char_size >= 0 && char_size <= LIT_UTF8_MAX_BYTES_IN_CODE_POINT);
|
||||||
|
lit_code_point_t code_point = (lit_code_point_t) rand ();
|
||||||
|
|
||||||
|
if (char_size == 1)
|
||||||
|
{
|
||||||
|
code_point %= LIT_UTF8_1_BYTE_CODE_POINT_MAX;
|
||||||
|
}
|
||||||
|
else if (char_size == 2)
|
||||||
|
{
|
||||||
|
code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN + code_point % (LIT_UTF8_2_BYTE_CODE_POINT_MAX -
|
||||||
|
LIT_UTF8_2_BYTE_CODE_POINT_MIN);
|
||||||
|
}
|
||||||
|
else if (char_size == 3)
|
||||||
|
{
|
||||||
|
code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN + code_point % (LIT_UTF8_3_BYTE_CODE_POINT_MAX -
|
||||||
|
LIT_UTF8_3_BYTE_CODE_POINT_MIN);
|
||||||
|
}
|
||||||
|
else if (char_size == 4)
|
||||||
|
{
|
||||||
|
code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN + code_point % (LIT_UTF8_4_BYTE_CODE_POINT_MAX -
|
||||||
|
LIT_UTF8_4_BYTE_CODE_POINT_MIN);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
code_point %= LIT_UTF8_4_BYTE_CODE_POINT_MAX;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN
|
||||||
|
&& code_point <= LIT_UTF16_LOW_SURROGATE_MAX)
|
||||||
|
{
|
||||||
|
code_point = LIT_UTF16_HIGH_SURROGATE_MIN - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return lit_code_point_to_utf8 (code_point, buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ecma_length_t
|
||||||
|
generate_utf8_string (lit_utf8_byte_t *buf_p,
|
||||||
|
lit_utf8_size_t buf_size)
|
||||||
|
{
|
||||||
|
ecma_length_t length = 0;
|
||||||
|
|
||||||
|
lit_utf8_size_t size = 0;
|
||||||
|
while (size < buf_size)
|
||||||
|
{
|
||||||
|
const utf8_char_size char_size = (((buf_size - size) > LIT_UTF8_MAX_BYTES_IN_CODE_POINT)
|
||||||
|
? UTF8_ANY_SIZE
|
||||||
|
: (utf8_char_size) (buf_size - size));
|
||||||
|
|
||||||
|
lit_utf8_size_t bytes_generated = generate_utf8_char (char_size, buf_p);
|
||||||
|
|
||||||
|
JERRY_ASSERT (lit_is_utf8_string_valid (buf_p, bytes_generated));
|
||||||
|
|
||||||
|
size += bytes_generated;
|
||||||
|
buf_p += bytes_generated;
|
||||||
|
length += (bytes_generated == LIT_UTF8_MAX_BYTES_IN_CODE_POINT) ? 2 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
JERRY_ASSERT (size == buf_size);
|
||||||
|
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
// Subiterations count
|
|
||||||
#define test_sub_iters 64
|
|
||||||
|
|
||||||
int
|
int
|
||||||
main (int __attr_unused___ argc,
|
main (int __attr_unused___ argc,
|
||||||
@ -32,7 +113,74 @@ main (int __attr_unused___ argc,
|
|||||||
|
|
||||||
mem_init ();
|
mem_init ();
|
||||||
|
|
||||||
/* test lit_is_utf8_string_valid */
|
lit_utf8_byte_t utf8_string[max_bytes_in_string];
|
||||||
|
ecma_char_t code_units[max_code_units_in_string];
|
||||||
|
lit_utf8_iterator_pos_t saved_positions[max_code_units_in_string];
|
||||||
|
|
||||||
|
for (int i = 0; i < test_iters; i++)
|
||||||
|
{
|
||||||
|
lit_utf8_size_t utf8_string_size = (i == 0) ? 0 : (lit_utf8_size_t) (rand () % max_bytes_in_string);
|
||||||
|
ecma_length_t length = generate_utf8_string (utf8_string, utf8_string_size);
|
||||||
|
|
||||||
|
JERRY_ASSERT (lit_utf8_string_length (utf8_string, utf8_string_size) == length);
|
||||||
|
|
||||||
|
lit_utf8_iterator_t iter = lit_utf8_iterator_create (utf8_string, utf8_string_size);
|
||||||
|
ecma_length_t calculated_length = 0;
|
||||||
|
|
||||||
|
ecma_length_t code_units_count = 0;
|
||||||
|
while (!lit_utf8_iterator_is_eos (&iter))
|
||||||
|
{
|
||||||
|
code_units[code_units_count] = lit_utf8_iterator_peek_next (&iter);
|
||||||
|
saved_positions[code_units_count] = lit_utf8_iterator_get_pos (&iter);
|
||||||
|
code_units_count++;
|
||||||
|
calculated_length++;
|
||||||
|
|
||||||
|
lit_utf8_iterator_incr (&iter);
|
||||||
|
}
|
||||||
|
|
||||||
|
JERRY_ASSERT (length == calculated_length);
|
||||||
|
|
||||||
|
if (code_units_count > 0)
|
||||||
|
{
|
||||||
|
for (int j = 0; j < test_subiters; j++)
|
||||||
|
{
|
||||||
|
ecma_length_t index = (ecma_length_t) rand () % code_units_count;
|
||||||
|
lit_utf8_iterator_seek (&iter, saved_positions[index]);
|
||||||
|
JERRY_ASSERT (lit_utf8_iterator_peek_next (&iter) == code_units[index]);
|
||||||
|
JERRY_ASSERT (lit_utf8_iterator_get_index (&iter) == index);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lit_utf8_iterator_seek_eos (&iter);
|
||||||
|
while (!lit_utf8_iterator_is_bos (&iter))
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (code_units_count > 0);
|
||||||
|
calculated_length--;
|
||||||
|
JERRY_ASSERT (code_units[calculated_length] == lit_utf8_iterator_peek_prev (&iter));
|
||||||
|
lit_utf8_iterator_decr (&iter);
|
||||||
|
}
|
||||||
|
|
||||||
|
JERRY_ASSERT (calculated_length == 0);
|
||||||
|
|
||||||
|
while (!lit_utf8_iterator_is_eos (&iter))
|
||||||
|
{
|
||||||
|
ecma_char_t code_unit = lit_utf8_iterator_read_next (&iter);
|
||||||
|
JERRY_ASSERT (code_unit == code_units[calculated_length]);
|
||||||
|
calculated_length++;
|
||||||
|
}
|
||||||
|
|
||||||
|
JERRY_ASSERT (length == calculated_length);
|
||||||
|
|
||||||
|
while (!lit_utf8_iterator_is_bos (&iter))
|
||||||
|
{
|
||||||
|
JERRY_ASSERT (code_units_count > 0);
|
||||||
|
calculated_length--;
|
||||||
|
JERRY_ASSERT (code_units[calculated_length] == lit_utf8_iterator_read_prev (&iter));
|
||||||
|
}
|
||||||
|
|
||||||
|
JERRY_ASSERT (calculated_length == 0);
|
||||||
|
}
|
||||||
|
|
||||||
/* Overlong-encoded code point */
|
/* Overlong-encoded code point */
|
||||||
lit_utf8_byte_t invalid_utf8_string_1[] = {0xC0, 0x82};
|
lit_utf8_byte_t invalid_utf8_string_1[] = {0xC0, 0x82};
|
||||||
JERRY_ASSERT (!lit_is_utf8_string_valid (invalid_utf8_string_1, sizeof (invalid_utf8_string_1)));
|
JERRY_ASSERT (!lit_is_utf8_string_valid (invalid_utf8_string_1, sizeof (invalid_utf8_string_1)));
|
||||||
@ -53,14 +201,12 @@ main (int __attr_unused___ argc,
|
|||||||
lit_utf8_byte_t valid_utf8_string_2[] = {0xF1, 0x90, 0x9F, 0xB0};
|
lit_utf8_byte_t valid_utf8_string_2[] = {0xF1, 0x90, 0x9F, 0xB0};
|
||||||
JERRY_ASSERT (lit_is_utf8_string_valid (valid_utf8_string_2, sizeof (valid_utf8_string_2)));
|
JERRY_ASSERT (lit_is_utf8_string_valid (valid_utf8_string_2, sizeof (valid_utf8_string_2)));
|
||||||
|
|
||||||
/* test lit_read_code_point_from_utf8 */
|
|
||||||
lit_utf8_byte_t buf[] = {0xF0, 0x90, 0x8D, 0x88};
|
lit_utf8_byte_t buf[] = {0xF0, 0x90, 0x8D, 0x88};
|
||||||
lit_code_point_t code_point;
|
lit_code_point_t code_point;
|
||||||
lit_utf8_size_t bytes_count = lit_read_code_point_from_utf8 (buf, sizeof (buf), &code_point);
|
lit_utf8_size_t bytes_count = lit_read_code_point_from_utf8 (buf, sizeof (buf), &code_point);
|
||||||
JERRY_ASSERT (bytes_count == 4);
|
JERRY_ASSERT (bytes_count == 4);
|
||||||
JERRY_ASSERT (code_point == 0x10348);
|
JERRY_ASSERT (code_point == 0x10348);
|
||||||
|
|
||||||
/* test lit_code_unit_to_utf8 */
|
|
||||||
lit_utf8_byte_t res_buf[3];
|
lit_utf8_byte_t res_buf[3];
|
||||||
lit_utf8_size_t res_size;
|
lit_utf8_size_t res_size;
|
||||||
|
|
||||||
@ -79,14 +225,13 @@ main (int __attr_unused___ argc,
|
|||||||
JERRY_ASSERT (res_buf[1] == 0x9F);
|
JERRY_ASSERT (res_buf[1] == 0x9F);
|
||||||
JERRY_ASSERT (res_buf[2] == 0xBF);
|
JERRY_ASSERT (res_buf[2] == 0xBF);
|
||||||
|
|
||||||
/* test lit_utf8_iterator */
|
|
||||||
lit_utf8_byte_t bytes[] = {0xF0, 0x90, 0x8D, 0x88};
|
lit_utf8_byte_t bytes[] = {0xF0, 0x90, 0x8D, 0x88};
|
||||||
lit_utf8_iterator_t iter = lit_utf8_iterator_create (bytes, sizeof (bytes));
|
lit_utf8_iterator_t iter = lit_utf8_iterator_create (bytes, sizeof (bytes));
|
||||||
ecma_char_t code_unit = lit_utf8_iterator_read_code_unit_and_increment (&iter);
|
ecma_char_t code_unit = lit_utf8_iterator_read_next (&iter);
|
||||||
JERRY_ASSERT (!lit_utf8_iterator_reached_buffer_end (&iter));
|
JERRY_ASSERT (!lit_utf8_iterator_is_eos (&iter));
|
||||||
JERRY_ASSERT (code_unit == 0xD800);
|
JERRY_ASSERT (code_unit == 0xD800);
|
||||||
code_unit = lit_utf8_iterator_read_code_unit_and_increment (&iter);
|
code_unit = lit_utf8_iterator_read_next (&iter);
|
||||||
JERRY_ASSERT (lit_utf8_iterator_reached_buffer_end (&iter));
|
JERRY_ASSERT (lit_utf8_iterator_is_eos (&iter));
|
||||||
JERRY_ASSERT (code_unit == 0xDF48);
|
JERRY_ASSERT (code_unit == 0xDF48);
|
||||||
|
|
||||||
mem_finalize (true);
|
mem_finalize (true);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user