mirror of
https://github.com/jerryscript-project/jerryscript.git
synced 2025-12-15 16:29:21 +00:00
The UTF8 buffer size can be smaller then the CESU8 string's size so the UTF8 output is may truncated. Therefore we cannot ensure that the CESU8 buffer is read until the end. This patch fixes #4920. JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik robert.fancsik@h-lab.eu
935 lines
29 KiB
C
935 lines
29 KiB
C
/* Copyright JS Foundation and other contributors, http://js.foundation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "lit-strings.h"
|
|
|
|
#include "jrt-libc-includes.h"
|
|
|
|
#define LIT_UTF8_SURROGATE_MARKER 0xed /**< utf8 surrogate marker */
|
|
#define LIT_UTF8_HIGH_SURROGATE_MIN 0xa0 /**< utf8 high surrogate minimum */
|
|
#define LIT_UTF8_HIGH_SURROGATE_MAX 0xaf /**< utf8 high surrogate maximum */
|
|
#define LIT_UTF8_LOW_SURROGATE_MIN 0xb0 /**< utf8 low surrogate minimum */
|
|
#define LIT_UTF8_LOW_SURROGATE_MAX 0xbf /**< utf8 low surrogate maximum */
|
|
#define LIT_UTF8_1_BYTE_MAX 0xf4 /**< utf8 one byte max */
|
|
#define LIT_UTF8_2_BYTE_MAX 0x8f /**< utf8 two byte max */
|
|
#define LIT_UTF8_VALID_TWO_BYTE_START 0xc2 /**< utf8 two byte start */
|
|
|
|
/**
|
|
* Validate utf-8 string
|
|
*
|
|
* NOTE:
|
|
* Isolated surrogates are allowed.
|
|
*
|
|
* @return true if utf-8 string is well-formed
|
|
* false otherwise
|
|
*/
|
|
bool
|
|
lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
|
|
lit_utf8_size_t buf_size, /**< string size */
|
|
bool is_strict) /**< true if surrogate pairs are not allowed */
|
|
{
|
|
const unsigned char *end = buf_size + utf8_buf_p;
|
|
|
|
const unsigned char *idx = (const unsigned char *) utf8_buf_p;
|
|
|
|
while (idx < end)
|
|
{
|
|
const uint8_t first_byte = *idx++;
|
|
|
|
if (first_byte < LIT_UTF8_EXTRA_BYTE_MARKER)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
if (first_byte < LIT_UTF8_VALID_TWO_BYTE_START || idx >= end)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
const uint8_t second_byte = *idx++;
|
|
|
|
if ((second_byte & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (first_byte < LIT_UTF8_3_BYTE_MARKER)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
if (idx >= end || (*idx++ & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (first_byte < LIT_UTF8_4_BYTE_MARKER)
|
|
{
|
|
if (first_byte == LIT_UTF8_3_BYTE_MARKER && (second_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (is_strict && first_byte == LIT_UTF8_SURROGATE_MARKER && second_byte >= LIT_UTF8_HIGH_SURROGATE_MIN
|
|
&& second_byte <= LIT_UTF8_HIGH_SURROGATE_MAX && idx + 3 <= end && idx[0] == LIT_UTF8_SURROGATE_MARKER
|
|
&& idx[1] >= LIT_UTF8_LOW_SURROGATE_MIN && idx[1] <= LIT_UTF8_LOW_SURROGATE_MAX)
|
|
{
|
|
return false;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
if (idx >= end || first_byte > LIT_UTF8_1_BYTE_MAX
|
|
|| (first_byte == LIT_UTF8_4_BYTE_MARKER && second_byte <= LIT_UTF8_EXTRA_BYTE_MARKER)
|
|
|| (first_byte == LIT_UTF8_1_BYTE_MAX && second_byte > LIT_UTF8_2_BYTE_MAX)
|
|
|| (*idx++ & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
} /* lit_is_valid_utf8_string */
|
|
|
|
/**
|
|
* Validate cesu-8 string
|
|
*
|
|
* @return true if cesu-8 string is well-formed
|
|
* false otherwise
|
|
*/
|
|
bool
|
|
lit_is_valid_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
|
|
lit_utf8_size_t buf_size) /**< string size */
|
|
{
|
|
lit_utf8_size_t idx = 0;
|
|
|
|
while (idx < buf_size)
|
|
{
|
|
lit_utf8_byte_t c = cesu8_buf_p[idx++];
|
|
if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
lit_code_point_t code_point = 0;
|
|
lit_code_point_t min_code_point = 0;
|
|
lit_utf8_size_t extra_bytes_count;
|
|
if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
|
|
{
|
|
extra_bytes_count = 1;
|
|
min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
|
|
code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
|
|
}
|
|
else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
|
|
{
|
|
extra_bytes_count = 2;
|
|
min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
|
|
code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
|
|
if (idx + extra_bytes_count > buf_size)
|
|
{
|
|
/* cesu-8 string breaks in the middle */
|
|
return false;
|
|
}
|
|
|
|
for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset)
|
|
{
|
|
c = cesu8_buf_p[idx + offset];
|
|
if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
|
|
{
|
|
/* invalid continuation byte */
|
|
return false;
|
|
}
|
|
code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
|
code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK);
|
|
}
|
|
|
|
if (code_point < min_code_point)
|
|
{
|
|
/* cesu-8 string doesn't encode valid unicode code point */
|
|
return false;
|
|
}
|
|
|
|
idx += extra_bytes_count;
|
|
}
|
|
|
|
return true;
|
|
} /* lit_is_valid_cesu8_string */
|
|
|
|
/**
|
|
* Check if the code point is UTF-16 low surrogate
|
|
*
|
|
* @return true / false
|
|
*/
|
|
bool
|
|
lit_is_code_point_utf16_low_surrogate (lit_code_point_t code_point) /**< code point */
|
|
{
|
|
return LIT_UTF16_LOW_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_LOW_SURROGATE_MAX;
|
|
} /* lit_is_code_point_utf16_low_surrogate */
|
|
|
|
/**
|
|
* Check if the code point is UTF-16 high surrogate
|
|
*
|
|
* @return true / false
|
|
*/
|
|
bool
|
|
lit_is_code_point_utf16_high_surrogate (lit_code_point_t code_point) /**< code point */
|
|
{
|
|
return LIT_UTF16_HIGH_SURROGATE_MIN <= code_point && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX;
|
|
} /* lit_is_code_point_utf16_high_surrogate */
|
|
|
|
/**
|
|
* Represents code point (>0xFFFF) as surrogate pair and returns its lower part
|
|
*
|
|
* @return lower code_unit of the surrogate pair
|
|
*/
|
|
static ecma_char_t
|
|
convert_code_point_to_low_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */
|
|
{
|
|
JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
|
|
|
|
ecma_char_t code_unit_bits;
|
|
code_unit_bits = (ecma_char_t) (code_point & LIT_UTF16_LAST_10_BITS_MASK);
|
|
|
|
return (ecma_char_t) (LIT_UTF16_LOW_SURROGATE_MARKER | code_unit_bits);
|
|
} /* convert_code_point_to_low_surrogate */
|
|
|
|
/**
|
|
* Represents code point (>0xFFFF) as surrogate pair and returns its higher part
|
|
*
|
|
* @return higher code_unit of the surrogate pair
|
|
*/
|
|
static ecma_char_t
|
|
convert_code_point_to_high_surrogate (lit_code_point_t code_point) /**< code point, should be > 0xFFFF */
|
|
{
|
|
JERRY_ASSERT (code_point > LIT_UTF16_CODE_UNIT_MAX);
|
|
JERRY_ASSERT (code_point <= LIT_UNICODE_CODE_POINT_MAX);
|
|
|
|
ecma_char_t code_unit_bits;
|
|
code_unit_bits = (ecma_char_t) ((code_point - LIT_UTF16_FIRST_SURROGATE_CODE_POINT) >> LIT_UTF16_BITS_IN_SURROGATE);
|
|
|
|
return (LIT_UTF16_HIGH_SURROGATE_MARKER | code_unit_bits);
|
|
} /* convert_code_point_to_high_surrogate */
|
|
|
|
/**
|
|
* UTF16 Encoding method for a code point
|
|
*
|
|
* See also:
|
|
* ECMA-262 v6, 10.1.1
|
|
*
|
|
* @return uint8_t, the number of returning code points
|
|
*/
|
|
uint8_t
|
|
lit_utf16_encode_code_point (lit_code_point_t cp, /**< the code point we encode */
|
|
ecma_char_t *cu_p) /**< result of the encoding */
|
|
{
|
|
if (cp <= LIT_UTF16_CODE_UNIT_MAX)
|
|
{
|
|
cu_p[0] = (ecma_char_t) cp;
|
|
return 1;
|
|
}
|
|
|
|
cu_p[0] = convert_code_point_to_high_surrogate (cp);
|
|
cu_p[1] = convert_code_point_to_low_surrogate (cp);
|
|
return 2;
|
|
} /* lit_utf16_encode_code_point */
|
|
|
|
/**
|
|
* Calculate size of a zero-terminated utf-8 string
|
|
*
|
|
* NOTE:
|
|
* - string cannot be NULL
|
|
* - string should not contain zero characters in the middle
|
|
*
|
|
* @return size of a string
|
|
*/
|
|
lit_utf8_size_t
|
|
lit_zt_utf8_string_size (const lit_utf8_byte_t *utf8_str_p) /**< zero-terminated utf-8 string */
|
|
{
|
|
JERRY_ASSERT (utf8_str_p != NULL);
|
|
return (lit_utf8_size_t) strlen ((const char *) utf8_str_p);
|
|
} /* lit_zt_utf8_string_size */
|
|
|
|
/**
|
|
* Calculate length of a cesu-8 encoded string
|
|
*
|
|
* @return UTF-16 code units count
|
|
*/
|
|
lit_utf8_size_t
|
|
lit_utf8_string_length (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
|
|
lit_utf8_size_t utf8_buf_size) /**< string size */
|
|
{
|
|
lit_utf8_size_t length = 0;
|
|
lit_utf8_size_t size = 0;
|
|
|
|
while (size < utf8_buf_size)
|
|
{
|
|
size += lit_get_unicode_char_size_by_utf8_first_byte (*(utf8_buf_p + size));
|
|
length++;
|
|
}
|
|
|
|
JERRY_ASSERT (size == utf8_buf_size);
|
|
|
|
return length;
|
|
} /* lit_utf8_string_length */
|
|
|
|
/**
|
|
* Calculate the required size of an utf-8 encoded string from cesu-8 encoded string
|
|
*
|
|
* @return size of an utf-8 encoded string
|
|
*/
|
|
lit_utf8_size_t
|
|
lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
|
|
lit_utf8_size_t cesu8_buf_size) /**< string size */
|
|
{
|
|
lit_utf8_size_t offset = 0;
|
|
lit_utf8_size_t utf8_buf_size = cesu8_buf_size;
|
|
ecma_char_t prev_ch = 0;
|
|
|
|
while (offset < cesu8_buf_size)
|
|
{
|
|
ecma_char_t ch;
|
|
offset += lit_read_code_unit_from_cesu8 (cesu8_buf_p + offset, &ch);
|
|
|
|
if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
|
|
{
|
|
utf8_buf_size -= 2;
|
|
}
|
|
|
|
prev_ch = ch;
|
|
}
|
|
|
|
JERRY_ASSERT (offset == cesu8_buf_size);
|
|
|
|
return utf8_buf_size;
|
|
} /* lit_get_utf8_size_of_cesu8_string */
|
|
|
|
/**
|
|
* Calculate length of an utf-8 encoded string from cesu-8 encoded string
|
|
*
|
|
* @return length of an utf-8 encoded string
|
|
*/
|
|
lit_utf8_size_t
|
|
lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
|
|
lit_utf8_size_t cesu8_buf_size) /**< string size */
|
|
{
|
|
lit_utf8_size_t offset = 0;
|
|
lit_utf8_size_t utf8_length = 0;
|
|
ecma_char_t prev_ch = 0;
|
|
|
|
while (offset < cesu8_buf_size)
|
|
{
|
|
ecma_char_t ch;
|
|
offset += lit_read_code_unit_from_cesu8 (cesu8_buf_p + offset, &ch);
|
|
|
|
if (!lit_is_code_point_utf16_low_surrogate (ch) || !lit_is_code_point_utf16_high_surrogate (prev_ch))
|
|
{
|
|
utf8_length++;
|
|
}
|
|
|
|
prev_ch = ch;
|
|
}
|
|
|
|
JERRY_ASSERT (offset == cesu8_buf_size);
|
|
|
|
return utf8_length;
|
|
} /* lit_get_utf8_length_of_cesu8_string */
|
|
|
|
/**
|
|
* Decodes a unicode code point from non-empty utf-8-encoded buffer
|
|
*
|
|
* @return number of bytes occupied by code point in the string
|
|
*/
|
|
lit_utf8_size_t
|
|
lit_read_code_point_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
|
|
lit_utf8_size_t buf_size, /**< size of the buffer in bytes */
|
|
lit_code_point_t *code_point) /**< [out] code point */
|
|
{
|
|
JERRY_ASSERT (buf_p && buf_size);
|
|
|
|
lit_utf8_byte_t c = buf_p[0];
|
|
if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
|
|
{
|
|
*code_point = (lit_code_point_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
|
|
return 1;
|
|
}
|
|
|
|
lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
|
|
lit_utf8_size_t bytes_count = 0;
|
|
if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
|
|
{
|
|
bytes_count = 2;
|
|
ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
|
|
}
|
|
else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
|
|
{
|
|
bytes_count = 3;
|
|
ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
|
|
}
|
|
else
|
|
{
|
|
JERRY_ASSERT ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER);
|
|
bytes_count = 4;
|
|
ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
|
|
}
|
|
|
|
JERRY_ASSERT (buf_size >= bytes_count);
|
|
|
|
for (uint32_t i = 1; i < bytes_count; ++i)
|
|
{
|
|
ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
|
ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
|
|
}
|
|
|
|
*code_point = ret;
|
|
return bytes_count;
|
|
} /* lit_read_code_point_from_utf8 */
|
|
|
|
/**
|
|
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
|
|
*
|
|
* @return number of bytes occupied by code point in the string
|
|
*/
|
|
lit_utf8_size_t
|
|
lit_read_code_unit_from_cesu8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
|
|
ecma_char_t *code_unit) /**< [out] code unit */
|
|
{
|
|
JERRY_ASSERT (buf_p);
|
|
|
|
lit_utf8_byte_t c = buf_p[0];
|
|
if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
|
|
{
|
|
*code_unit = (ecma_char_t) (c & LIT_UTF8_LAST_7_BITS_MASK);
|
|
return 1;
|
|
}
|
|
|
|
lit_code_point_t ret = LIT_UNICODE_CODE_POINT_NULL;
|
|
lit_utf8_size_t bytes_count;
|
|
|
|
if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
|
|
{
|
|
bytes_count = 2;
|
|
ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
|
|
}
|
|
else
|
|
{
|
|
JERRY_ASSERT ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
|
|
bytes_count = 3;
|
|
ret = ((lit_code_point_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
|
|
}
|
|
|
|
for (uint32_t i = 1; i < bytes_count; ++i)
|
|
{
|
|
ret <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
|
ret |= (buf_p[i] & LIT_UTF8_LAST_6_BITS_MASK);
|
|
}
|
|
|
|
JERRY_ASSERT (ret <= LIT_UTF16_CODE_UNIT_MAX);
|
|
*code_unit = (ecma_char_t) ret;
|
|
return bytes_count;
|
|
} /* lit_read_code_unit_from_cesu8 */
|
|
|
|
/**
|
|
* Decodes a unicode code point from non-empty cesu-8-encoded buffer
|
|
*
|
|
* @return number of bytes occupied by code point in the string
|
|
*/
|
|
lit_utf8_size_t
|
|
lit_read_code_point_from_cesu8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
|
|
const lit_utf8_byte_t *buf_end_p, /**< buffer end */
|
|
lit_code_point_t *code_point) /**< [out] code point */
|
|
{
|
|
ecma_char_t code_unit;
|
|
lit_utf8_size_t size = lit_read_code_unit_from_cesu8 (buf_p, &code_unit);
|
|
|
|
JERRY_ASSERT (buf_p + size <= buf_end_p);
|
|
|
|
if (lit_is_code_point_utf16_high_surrogate (code_unit))
|
|
{
|
|
buf_p += size;
|
|
|
|
if (buf_p < buf_end_p)
|
|
{
|
|
ecma_char_t next_code_unit;
|
|
lit_utf8_size_t next_size = lit_read_code_unit_from_cesu8 (buf_p, &next_code_unit);
|
|
|
|
if (lit_is_code_point_utf16_low_surrogate (next_code_unit))
|
|
{
|
|
JERRY_ASSERT (buf_p + next_size <= buf_end_p);
|
|
|
|
*code_point = lit_convert_surrogate_pair_to_code_point (code_unit, next_code_unit);
|
|
return size + next_size;
|
|
}
|
|
}
|
|
}
|
|
|
|
*code_point = code_unit;
|
|
return size;
|
|
} /* lit_read_code_point_from_cesu8 */
|
|
|
|
/**
|
|
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
|
|
*
|
|
* @return number of bytes occupied by code point in the string
|
|
*/
|
|
lit_utf8_size_t
|
|
lit_read_prev_code_unit_from_utf8 (const lit_utf8_byte_t *buf_p, /**< buffer with characters */
|
|
ecma_char_t *code_point) /**< [out] code point */
|
|
{
|
|
JERRY_ASSERT (buf_p);
|
|
|
|
lit_utf8_decr (&buf_p);
|
|
return lit_read_code_unit_from_cesu8 (buf_p, code_point);
|
|
} /* lit_read_prev_code_unit_from_utf8 */
|
|
|
|
/**
|
|
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
|
|
*
|
|
* @return next code unit
|
|
*/
|
|
ecma_char_t
|
|
lit_cesu8_read_next (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
|
|
{
|
|
JERRY_ASSERT (*buf_p);
|
|
ecma_char_t ch;
|
|
|
|
*buf_p += lit_read_code_unit_from_cesu8 (*buf_p, &ch);
|
|
|
|
return ch;
|
|
} /* lit_cesu8_read_next */
|
|
|
|
/**
|
|
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
|
|
*
|
|
* @return previous code unit
|
|
*/
|
|
ecma_char_t
|
|
lit_cesu8_read_prev (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
|
|
{
|
|
JERRY_ASSERT (*buf_p);
|
|
ecma_char_t ch;
|
|
|
|
lit_utf8_decr (buf_p);
|
|
lit_read_code_unit_from_cesu8 (*buf_p, &ch);
|
|
|
|
return ch;
|
|
} /* lit_cesu8_read_prev */
|
|
|
|
/**
|
|
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
|
|
*
|
|
* @return next code unit
|
|
*/
|
|
ecma_char_t JERRY_ATTR_NOINLINE
|
|
lit_cesu8_peek_next (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
|
|
{
|
|
JERRY_ASSERT (buf_p != NULL);
|
|
ecma_char_t ch;
|
|
|
|
lit_read_code_unit_from_cesu8 (buf_p, &ch);
|
|
|
|
return ch;
|
|
} /* lit_cesu8_peek_next */
|
|
|
|
/**
|
|
* Decodes a unicode code unit from non-empty cesu-8-encoded buffer
|
|
*
|
|
* @return previous code unit
|
|
*/
|
|
ecma_char_t JERRY_ATTR_NOINLINE
|
|
lit_cesu8_peek_prev (const lit_utf8_byte_t *buf_p) /**< [in,out] buffer with characters */
|
|
{
|
|
JERRY_ASSERT (buf_p != NULL);
|
|
ecma_char_t ch;
|
|
|
|
lit_read_prev_code_unit_from_utf8 (buf_p, &ch);
|
|
|
|
return ch;
|
|
} /* lit_cesu8_peek_prev */
|
|
|
|
/**
|
|
* Increase cesu-8 encoded string pointer by one code unit.
|
|
*/
|
|
extern inline void JERRY_ATTR_ALWAYS_INLINE
|
|
lit_utf8_incr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
|
|
{
|
|
JERRY_ASSERT (*buf_p);
|
|
|
|
*buf_p += lit_get_unicode_char_size_by_utf8_first_byte (**buf_p);
|
|
} /* lit_utf8_incr */
|
|
|
|
/**
|
|
* Decrease cesu-8 encoded string pointer by one code unit.
|
|
*/
|
|
void
|
|
lit_utf8_decr (const lit_utf8_byte_t **buf_p) /**< [in,out] buffer with characters */
|
|
{
|
|
JERRY_ASSERT (*buf_p);
|
|
const lit_utf8_byte_t *current_p = *buf_p;
|
|
|
|
do
|
|
{
|
|
current_p--;
|
|
} while ((*current_p & LIT_UTF8_EXTRA_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER);
|
|
|
|
*buf_p = current_p;
|
|
} /* lit_utf8_decr */
|
|
|
|
/**
|
|
* Calc hash using the specified hash_basis.
|
|
*
|
|
* NOTE:
|
|
* This is implementation of FNV-1a hash function, which is released into public domain.
|
|
* Constants used, are carefully picked primes by the authors.
|
|
* More info: http://www.isthe.com/chongo/tech/comp/fnv/
|
|
*
|
|
* @return ecma-string's hash
|
|
*/
|
|
extern inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE
|
|
lit_utf8_string_hash_combine (lit_string_hash_t hash_basis, /**< hash to be combined with */
|
|
const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */
|
|
lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */
|
|
{
|
|
JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0);
|
|
|
|
uint32_t hash = hash_basis;
|
|
|
|
for (uint32_t i = 0; i < utf8_buf_size; i++)
|
|
{
|
|
/* 16777619 is 32 bit FNV_prime = 2^24 + 2^8 + 0x93 = 16777619 */
|
|
hash = (hash ^ utf8_buf_p[i]) * 16777619;
|
|
}
|
|
|
|
return (lit_string_hash_t) hash;
|
|
} /* lit_utf8_string_hash_combine */
|
|
|
|
/**
|
|
* Calculate hash from the buffer.
|
|
*
|
|
* @return ecma-string's hash
|
|
*/
|
|
extern inline lit_string_hash_t JERRY_ATTR_ALWAYS_INLINE
|
|
lit_utf8_string_calc_hash (const lit_utf8_byte_t *utf8_buf_p, /**< characters buffer */
|
|
lit_utf8_size_t utf8_buf_size) /**< number of characters in the buffer */
|
|
{
|
|
JERRY_ASSERT (utf8_buf_p != NULL || utf8_buf_size == 0);
|
|
|
|
/* 32 bit offset_basis for FNV = 2166136261 */
|
|
return lit_utf8_string_hash_combine ((lit_string_hash_t) 2166136261, utf8_buf_p, utf8_buf_size);
|
|
} /* lit_utf8_string_calc_hash */
|
|
|
|
/**
|
|
* Return code unit at the specified position in string
|
|
*
|
|
* NOTE:
|
|
* code_unit_offset should be less then string's length
|
|
*
|
|
* @return code unit value
|
|
*/
|
|
ecma_char_t
|
|
lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string */
|
|
lit_utf8_size_t utf8_buf_size, /**< string size in bytes */
|
|
lit_utf8_size_t code_unit_offset) /**< ofset of a code_unit */
|
|
{
|
|
lit_utf8_byte_t *current_p = (lit_utf8_byte_t *) utf8_buf_p;
|
|
ecma_char_t code_unit;
|
|
|
|
do
|
|
{
|
|
JERRY_ASSERT (current_p < utf8_buf_p + utf8_buf_size);
|
|
current_p += lit_read_code_unit_from_cesu8 (current_p, &code_unit);
|
|
} while (code_unit_offset--);
|
|
|
|
return code_unit;
|
|
} /* lit_utf8_string_code_unit_at */
|
|
|
|
/**
|
|
* Get CESU-8 encoded size of character
|
|
*
|
|
* @return number of bytes occupied in CESU-8
|
|
*/
|
|
extern inline lit_utf8_size_t JERRY_ATTR_ALWAYS_INLINE
|
|
lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte) /**< buffer with characters */
|
|
{
|
|
if ((first_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
|
|
{
|
|
return 1;
|
|
}
|
|
else if ((first_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
|
|
{
|
|
return 2;
|
|
}
|
|
else
|
|
{
|
|
JERRY_ASSERT ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
|
|
return 3;
|
|
}
|
|
} /* lit_get_unicode_char_size_by_utf8_first_byte */
|
|
|
|
/**
|
|
* Convert code unit to cesu-8 representation
|
|
*
|
|
* @return byte count required to represent the code unit
|
|
*/
|
|
lit_utf8_size_t
|
|
lit_code_unit_to_utf8 (ecma_char_t code_unit, /**< code unit */
|
|
lit_utf8_byte_t *buf_p) /**< buffer where to store the result and its size
|
|
* should be at least LIT_UTF8_MAX_BYTES_IN_CODE_UNIT */
|
|
{
|
|
if (code_unit <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
|
{
|
|
buf_p[0] = (lit_utf8_byte_t) code_unit;
|
|
return 1;
|
|
}
|
|
else if (code_unit <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
|
|
{
|
|
uint32_t code_unit_bits = code_unit;
|
|
lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
|
|
code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
|
|
|
lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_5_BITS_MASK);
|
|
JERRY_ASSERT (first_byte_bits == code_unit_bits);
|
|
|
|
buf_p[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits;
|
|
buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
|
|
return 2;
|
|
}
|
|
else
|
|
{
|
|
uint32_t code_unit_bits = code_unit;
|
|
lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
|
|
code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
|
|
|
lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_6_BITS_MASK);
|
|
code_unit_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
|
|
|
lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_unit_bits & LIT_UTF8_LAST_4_BITS_MASK);
|
|
JERRY_ASSERT (first_byte_bits == code_unit_bits);
|
|
|
|
buf_p[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits;
|
|
buf_p[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
|
|
buf_p[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
|
|
return 3;
|
|
}
|
|
} /* lit_code_unit_to_utf8 */
|
|
|
|
/**
|
|
* Convert code point to cesu-8 representation
|
|
*
|
|
* @return byte count required to represent the code point
|
|
*/
|
|
lit_utf8_size_t
|
|
lit_code_point_to_cesu8 (lit_code_point_t code_point, /**< code point */
|
|
lit_utf8_byte_t *buf) /**< buffer where to store the result,
|
|
* its size should be at least 6 bytes */
|
|
{
|
|
if (code_point <= LIT_UTF16_CODE_UNIT_MAX)
|
|
{
|
|
return lit_code_unit_to_utf8 ((ecma_char_t) code_point, buf);
|
|
}
|
|
else
|
|
{
|
|
lit_utf8_size_t offset = lit_code_unit_to_utf8 (convert_code_point_to_high_surrogate (code_point), buf);
|
|
offset += lit_code_unit_to_utf8 (convert_code_point_to_low_surrogate (code_point), buf + offset);
|
|
return offset;
|
|
}
|
|
} /* lit_code_point_to_cesu8 */
|
|
|
|
/**
|
|
* Convert code point to utf-8 representation
|
|
*
|
|
* @return byte count required to represent the code point
|
|
*/
|
|
lit_utf8_size_t
|
|
lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
|
|
lit_utf8_byte_t *buf) /**< buffer where to store the result,
|
|
* its size should be at least 4 bytes */
|
|
{
|
|
if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
|
{
|
|
buf[0] = (lit_utf8_byte_t) code_point;
|
|
return 1;
|
|
}
|
|
else if (code_point <= LIT_UTF8_2_BYTE_CODE_POINT_MAX)
|
|
{
|
|
uint32_t code_point_bits = code_point;
|
|
lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
|
|
code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
|
|
|
lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_5_BITS_MASK);
|
|
JERRY_ASSERT (first_byte_bits == code_point_bits);
|
|
|
|
buf[0] = LIT_UTF8_2_BYTE_MARKER | first_byte_bits;
|
|
buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
|
|
return 2;
|
|
}
|
|
else if (code_point <= LIT_UTF8_3_BYTE_CODE_POINT_MAX)
|
|
{
|
|
uint32_t code_point_bits = code_point;
|
|
lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
|
|
code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
|
|
|
lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
|
|
code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
|
|
|
lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_4_BITS_MASK);
|
|
JERRY_ASSERT (first_byte_bits == code_point_bits);
|
|
|
|
buf[0] = LIT_UTF8_3_BYTE_MARKER | first_byte_bits;
|
|
buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
|
|
buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
|
|
return 3;
|
|
}
|
|
else
|
|
{
|
|
JERRY_ASSERT (code_point <= LIT_UTF8_4_BYTE_CODE_POINT_MAX);
|
|
|
|
uint32_t code_point_bits = code_point;
|
|
lit_utf8_byte_t fourth_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
|
|
code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
|
|
|
lit_utf8_byte_t third_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
|
|
code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
|
|
|
lit_utf8_byte_t second_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_6_BITS_MASK);
|
|
code_point_bits >>= LIT_UTF8_BITS_IN_EXTRA_BYTES;
|
|
|
|
lit_utf8_byte_t first_byte_bits = (lit_utf8_byte_t) (code_point_bits & LIT_UTF8_LAST_3_BITS_MASK);
|
|
JERRY_ASSERT (first_byte_bits == code_point_bits);
|
|
|
|
buf[0] = LIT_UTF8_4_BYTE_MARKER | first_byte_bits;
|
|
buf[1] = LIT_UTF8_EXTRA_BYTE_MARKER | second_byte_bits;
|
|
buf[2] = LIT_UTF8_EXTRA_BYTE_MARKER | third_byte_bits;
|
|
buf[3] = LIT_UTF8_EXTRA_BYTE_MARKER | fourth_byte_bits;
|
|
return 4;
|
|
}
|
|
} /* lit_code_point_to_utf8 */
|
|
|
|
/**
|
|
* Convert cesu-8 string to an utf-8 string and put it into the buffer.
|
|
* String will be truncated to fit the buffer.
|
|
*
|
|
* @return number of bytes copied to the buffer.
|
|
*/
|
|
lit_utf8_size_t
|
|
lit_convert_cesu8_string_to_utf8_string (const lit_utf8_byte_t *cesu8_string_p, /**< cesu-8 string */
|
|
lit_utf8_size_t cesu8_size, /**< size of cesu-8 string */
|
|
lit_utf8_byte_t *utf8_string_p, /**< destination utf-8 buffer pointer
|
|
* (can be NULL if buffer_size == 0) */
|
|
lit_utf8_size_t utf8_size) /**< size of utf-8 buffer */
|
|
{
|
|
const lit_utf8_byte_t *cesu8_cursor_p = cesu8_string_p;
|
|
const lit_utf8_byte_t *cesu8_end_p = cesu8_string_p + cesu8_size;
|
|
|
|
lit_utf8_byte_t *utf8_cursor_p = utf8_string_p;
|
|
lit_utf8_byte_t *utf8_end_p = utf8_string_p + utf8_size;
|
|
|
|
while (cesu8_cursor_p < cesu8_end_p)
|
|
{
|
|
lit_code_point_t cp;
|
|
lit_utf8_size_t read_size = lit_read_code_point_from_cesu8 (cesu8_cursor_p, cesu8_end_p, &cp);
|
|
lit_utf8_size_t encoded_size = (cp >= LIT_UTF16_FIRST_SURROGATE_CODE_POINT) ? 4 : read_size;
|
|
|
|
if (utf8_cursor_p + encoded_size > utf8_end_p)
|
|
{
|
|
break;
|
|
}
|
|
|
|
if (cp >= LIT_UTF16_FIRST_SURROGATE_CODE_POINT)
|
|
{
|
|
lit_code_point_to_utf8 (cp, utf8_cursor_p);
|
|
}
|
|
else
|
|
{
|
|
memcpy (utf8_cursor_p, cesu8_cursor_p, encoded_size);
|
|
}
|
|
|
|
utf8_cursor_p += encoded_size;
|
|
cesu8_cursor_p += read_size;
|
|
}
|
|
|
|
JERRY_ASSERT (cesu8_cursor_p <= cesu8_end_p);
|
|
JERRY_ASSERT (utf8_cursor_p <= utf8_end_p);
|
|
|
|
return (lit_utf8_byte_t) (utf8_cursor_p - utf8_string_p);
|
|
} /* lit_convert_cesu8_string_to_utf8_string */
|
|
|
|
/**
|
|
* Convert surrogate pair to code point
|
|
*
|
|
* @return code point
|
|
*/
|
|
lit_code_point_t
|
|
lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code point */
|
|
ecma_char_t low_surrogate) /**< low surrogate code point */
|
|
{
|
|
JERRY_ASSERT (lit_is_code_point_utf16_high_surrogate (high_surrogate));
|
|
JERRY_ASSERT (lit_is_code_point_utf16_low_surrogate (low_surrogate));
|
|
|
|
lit_code_point_t code_point;
|
|
code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN);
|
|
code_point <<= LIT_UTF16_BITS_IN_SURROGATE;
|
|
|
|
code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT;
|
|
|
|
code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN);
|
|
return code_point;
|
|
} /* lit_convert_surrogate_pair_to_code_point */
|
|
|
|
/**
|
|
* Relational compare of cesu-8 strings
|
|
*
|
|
* First string is less than second string if:
|
|
* - strings are not equal;
|
|
* - first string is prefix of second or is lexicographically less than second.
|
|
*
|
|
* @return true - if first string is less than second string,
|
|
* false - otherwise
|
|
*/
|
|
bool
|
|
lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**< utf-8 string */
|
|
lit_utf8_size_t string1_size, /**< string size */
|
|
const lit_utf8_byte_t *string2_p, /**< utf-8 string */
|
|
lit_utf8_size_t string2_size) /**< string size */
|
|
{
|
|
lit_utf8_byte_t *string1_pos = (lit_utf8_byte_t *) string1_p;
|
|
lit_utf8_byte_t *string2_pos = (lit_utf8_byte_t *) string2_p;
|
|
const lit_utf8_byte_t *string1_end_p = string1_p + string1_size;
|
|
const lit_utf8_byte_t *string2_end_p = string2_p + string2_size;
|
|
|
|
while (string1_pos < string1_end_p && string2_pos < string2_end_p)
|
|
{
|
|
ecma_char_t ch1, ch2;
|
|
string1_pos += lit_read_code_unit_from_cesu8 (string1_pos, &ch1);
|
|
string2_pos += lit_read_code_unit_from_cesu8 (string2_pos, &ch2);
|
|
|
|
if (ch1 < ch2)
|
|
{
|
|
return true;
|
|
}
|
|
else if (ch1 > ch2)
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return (string1_pos >= string1_end_p && string2_pos < string2_end_p);
|
|
} /* lit_compare_utf8_strings_relational */
|