mirror of
https://github.com/jerryscript-project/jerryscript.git
synced 2025-12-15 16:29:21 +00:00
Add helper functions for implementing unicode support in lexer.
JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
This commit is contained in:
parent
c21399cd58
commit
d248d0944c
@ -24,6 +24,7 @@
|
||||
#include "ecma-alloc.h"
|
||||
#include "ecma-helpers.h"
|
||||
#include "ecma-builtin-helpers.h"
|
||||
#include "lit-char-helpers.h"
|
||||
|
||||
#define LIST_BLOCK_SIZE 256UL
|
||||
|
||||
|
||||
@ -28,6 +28,7 @@
|
||||
#include "ecma-try-catch-macro.h"
|
||||
#include "jrt.h"
|
||||
#include "jrt-libc-includes.h"
|
||||
#include "lit-char-helpers.h"
|
||||
|
||||
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_JSON_BUILTIN
|
||||
|
||||
|
||||
@ -594,7 +594,7 @@ lit_charset_record_get_length (literal_t lit) /**< literal */
|
||||
lit_iter.skip (bytes_to_skip);
|
||||
i += bytes_to_skip;
|
||||
|
||||
length++;
|
||||
length += (bytes_to_skip > LIT_UTF8_MAX_BYTES_IN_CODE_UNIT) ? 2 : 1;
|
||||
}
|
||||
|
||||
#ifndef JERRY_NDEBUG
|
||||
|
||||
@ -17,6 +17,8 @@
|
||||
|
||||
#include "jrt-libc-includes.h"
|
||||
|
||||
JERRY_STATIC_ASSERT (sizeof (lit_utf8_iterator_pos_t) == sizeof (lit_utf8_size_t));
|
||||
|
||||
/**
|
||||
* Validate utf-8 string
|
||||
*
|
||||
@ -117,6 +119,28 @@ lit_is_utf8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
|
||||
return true;
|
||||
} /* lit_is_utf8_string_valid */
|
||||
|
||||
/**
|
||||
* Check if the code unit type is low surrogate
|
||||
*
|
||||
* @return true / false
|
||||
*/
|
||||
bool
|
||||
lit_is_code_unit_low_surrogate (ecma_char_t code_unit) /**< code unit */
|
||||
{
|
||||
return LIT_UTF16_LOW_SURROGATE_MIN <= code_unit && code_unit <= LIT_UTF16_LOW_SURROGATE_MAX;
|
||||
} /* lit_is_code_unit_low_surrogate */
|
||||
|
||||
/**
|
||||
* Check if the code unit type is high surrogate
|
||||
*
|
||||
* @return true / false
|
||||
*/
|
||||
bool
|
||||
lit_is_code_unit_high_surrogate (ecma_char_t code_unit) /**< code unit */
|
||||
{
|
||||
return LIT_UTF16_HIGH_SURROGATE_MIN <= code_unit && code_unit <= LIT_UTF16_HIGH_SURROGATE_MAX;
|
||||
} /* lit_is_code_unit_high_surrogate */
|
||||
|
||||
/**
|
||||
* Initialize iterator for traversing utf-8 string as a string of code units
|
||||
*
|
||||
@ -455,6 +479,48 @@ lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *iter_p)
|
||||
return (iter_p->buf_pos.offset == 0 && iter_p->buf_pos.is_non_bmp_middle == false);
|
||||
} /* lit_utf8_iterator_is_bos */
|
||||
|
||||
/**
|
||||
* Get offset of the iterator
|
||||
*
|
||||
* @return: current offset in bytes of the iterator from the beginning of buffer
|
||||
*/
|
||||
lit_utf8_size_t
|
||||
lit_utf8_iterator_get_offset (const lit_utf8_iterator_t *iter_p) /**< iterator */
|
||||
{
|
||||
return iter_p->buf_pos.offset;
|
||||
} /* lit_utf8_iterator_get_offset */
|
||||
|
||||
/**
|
||||
* Set iterator to point to specified offset
|
||||
*/
|
||||
void
|
||||
lit_utf8_iterator_set_offset (lit_utf8_iterator_t *iter_p, /**< pointer to iterator */
|
||||
lit_utf8_size_t offset) /**< offset from the begging of the iterated buffer */
|
||||
{
|
||||
JERRY_ASSERT (offset <= iter_p->buf_size);
|
||||
|
||||
#ifndef JERRY_NDEBUG
|
||||
if (offset < iter_p->buf_size)
|
||||
{
|
||||
JERRY_ASSERT (((*(iter_p->buf_p + offset)) & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER);
|
||||
}
|
||||
#endif
|
||||
|
||||
iter_p->buf_pos.offset = (offset) & LIT_ITERATOR_OFFSET_MASK;
|
||||
iter_p->buf_pos.is_non_bmp_middle = false;
|
||||
} /* lit_utf8_iterator_set_offset */
|
||||
|
||||
/**
|
||||
* Get pointer to the current utf-8 char which iterator points to
|
||||
*
|
||||
* @return: pointer to utf-8 char
|
||||
*/
|
||||
lit_utf8_byte_t *
|
||||
lit_utf8_iterator_get_ptr (const lit_utf8_iterator_t *iter_p) /**< iterator */
|
||||
{
|
||||
return (lit_utf8_byte_t *) iter_p->buf_p + iter_p->buf_pos.offset;
|
||||
} /* lit_utf8_iterator_get_ptr */
|
||||
|
||||
/**
|
||||
* Calculate size of a zero-terminated utf-8 string
|
||||
*
|
||||
@ -702,6 +768,28 @@ lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
|
||||
}
|
||||
} /* lit_code_unit_to_utf8 */
|
||||
|
||||
/**
|
||||
* Convert surrogate pair to code point
|
||||
*
|
||||
* @return code point
|
||||
*/
|
||||
lit_code_point_t
|
||||
lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code point */
|
||||
ecma_char_t low_surrogate) /**< low surrogate code point */
|
||||
{
|
||||
JERRY_ASSERT (lit_is_code_unit_high_surrogate (high_surrogate));
|
||||
JERRY_ASSERT (lit_is_code_unit_low_surrogate (low_surrogate));
|
||||
|
||||
lit_code_point_t code_point;
|
||||
code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN);
|
||||
code_point <<= LIT_UTF16_BITS_IN_SURROGATE;
|
||||
|
||||
code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT;
|
||||
|
||||
code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN);
|
||||
return code_point;
|
||||
} /* lit_surrogate_pair_to_code_point */
|
||||
|
||||
/**
|
||||
* Compare utf-8 string to utf-8 string
|
||||
*
|
||||
@ -757,3 +845,20 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**<
|
||||
|
||||
return (lit_utf8_iterator_is_eos (&iter1) && !lit_utf8_iterator_is_eos (&iter2));
|
||||
} /* lit_compare_utf8_strings_relational */
|
||||
|
||||
/**
|
||||
* Print code unit to standard output
|
||||
*/
|
||||
void
|
||||
lit_put_ecma_char (ecma_char_t ecma_char) /**< code unit */
|
||||
{
|
||||
if (ecma_char <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
|
||||
{
|
||||
putchar (ecma_char);
|
||||
}
|
||||
else
|
||||
{
|
||||
FIXME ("Support unicode characters printing.");
|
||||
putchar ('_');
|
||||
}
|
||||
} /* lit_put_ecma_char */
|
||||
|
||||
@ -17,7 +17,6 @@
|
||||
#define LIT_UNICODE_HELPERS_H
|
||||
|
||||
#include "jrt.h"
|
||||
#include "lit-char-helpers.h"
|
||||
#include "lit-globals.h"
|
||||
|
||||
/**
|
||||
@ -27,7 +26,7 @@
|
||||
|
||||
/**
|
||||
* For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The
|
||||
* Unicode Standard (http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404, tables 3-6, 3-7).
|
||||
* Unicode Standard (http://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#G7404).
|
||||
*/
|
||||
#define LIT_UNICODE_CODE_POINT_NULL (0x0)
|
||||
#define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF)
|
||||
@ -112,6 +111,10 @@ typedef struct
|
||||
/* validation */
|
||||
bool lit_is_utf8_string_valid (const lit_utf8_byte_t *, lit_utf8_size_t);
|
||||
|
||||
/* checks */
|
||||
bool lit_is_code_unit_low_surrogate (ecma_char_t);
|
||||
bool lit_is_code_unit_high_surrogate (ecma_char_t);
|
||||
|
||||
/* iteration */
|
||||
lit_utf8_iterator_t lit_utf8_iterator_create (const lit_utf8_byte_t *, lit_utf8_size_t);
|
||||
|
||||
@ -136,6 +139,11 @@ ecma_char_t lit_utf8_iterator_read_prev (lit_utf8_iterator_t *);
|
||||
bool lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *);
|
||||
bool lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *);
|
||||
|
||||
lit_utf8_size_t lit_utf8_iterator_get_offset (const lit_utf8_iterator_t *);
|
||||
void lit_utf8_iterator_set_offset (lit_utf8_iterator_t *, lit_utf8_size_t);
|
||||
|
||||
lit_utf8_byte_t *lit_utf8_iterator_get_ptr (const lit_utf8_iterator_t *);
|
||||
|
||||
/* size */
|
||||
lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *);
|
||||
|
||||
@ -152,6 +160,7 @@ lit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (lit_utf8_byte_t);
|
||||
/* conversion */
|
||||
lit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t, lit_utf8_byte_t *);
|
||||
lit_utf8_size_t lit_code_point_to_utf8 (lit_code_point_t, lit_utf8_byte_t *);
|
||||
lit_code_point_t lit_convert_surrogate_pair_to_code_point (ecma_char_t, ecma_char_t);
|
||||
|
||||
/* comparison */
|
||||
bool lit_compare_utf8_strings (const lit_utf8_byte_t *,
|
||||
@ -169,4 +178,7 @@ lit_utf8_size_t lit_read_code_point_from_utf8 (const lit_utf8_byte_t *,
|
||||
lit_utf8_size_t,
|
||||
lit_code_point_t *);
|
||||
|
||||
/* print */
|
||||
void lit_put_ecma_char (ecma_char_t);
|
||||
|
||||
#endif /* LIT_UNICODE_HELPERS_H */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user