Add helper functions for implementing unicode support in lexer.

JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
This commit is contained in:
Andrey Shitov 2015-07-03 21:41:03 +03:00 committed by Ruben Ayrapetyan
parent c21399cd58
commit d248d0944c
5 changed files with 122 additions and 3 deletions

View File

@ -24,6 +24,7 @@
#include "ecma-alloc.h"
#include "ecma-helpers.h"
#include "ecma-builtin-helpers.h"
#include "lit-char-helpers.h"
#define LIST_BLOCK_SIZE 256UL

View File

@ -28,6 +28,7 @@
#include "ecma-try-catch-macro.h"
#include "jrt.h"
#include "jrt-libc-includes.h"
#include "lit-char-helpers.h"
#ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_JSON_BUILTIN

View File

@ -594,7 +594,7 @@ lit_charset_record_get_length (literal_t lit) /**< literal */
lit_iter.skip (bytes_to_skip);
i += bytes_to_skip;
length++;
length += (bytes_to_skip > LIT_UTF8_MAX_BYTES_IN_CODE_UNIT) ? 2 : 1;
}
#ifndef JERRY_NDEBUG

View File

@ -17,6 +17,8 @@
#include "jrt-libc-includes.h"
/*
 * Compile-time check: the iterator position type must occupy exactly as many bytes as lit_utf8_size_t.
 * NOTE(review): presumably this guards the packing of the byte offset and the non-BMP-middle flag
 * into one size-typed word (see LIT_ITERATOR_OFFSET_MASK) - confirm against the type's declaration.
 */
JERRY_STATIC_ASSERT (sizeof (lit_utf8_iterator_pos_t) == sizeof (lit_utf8_size_t));
/**
* Validate utf-8 string
*
@ -117,6 +119,28 @@ lit_is_utf8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
return true;
} /* lit_is_utf8_string_valid */
/**
 * Test whether a code unit lies in the low surrogate range
 *
 * The range is inclusive on both ends:
 * [LIT_UTF16_LOW_SURROGATE_MIN, LIT_UTF16_LOW_SURROGATE_MAX].
 *
 * @return true - if the code unit is a low surrogate,
 *         false - otherwise
 */
bool
lit_is_code_unit_low_surrogate (ecma_char_t code_unit) /**< code unit to classify */
{
  if (code_unit < LIT_UTF16_LOW_SURROGATE_MIN)
  {
    return false;
  }

  return code_unit <= LIT_UTF16_LOW_SURROGATE_MAX;
} /* lit_is_code_unit_low_surrogate */
/**
 * Test whether a code unit lies in the high surrogate range
 *
 * The range is inclusive on both ends:
 * [LIT_UTF16_HIGH_SURROGATE_MIN, LIT_UTF16_HIGH_SURROGATE_MAX].
 *
 * @return true - if the code unit is a high surrogate,
 *         false - otherwise
 */
bool
lit_is_code_unit_high_surrogate (ecma_char_t code_unit) /**< code unit to classify */
{
  if (code_unit < LIT_UTF16_HIGH_SURROGATE_MIN)
  {
    return false;
  }

  return code_unit <= LIT_UTF16_HIGH_SURROGATE_MAX;
} /* lit_is_code_unit_high_surrogate */
/**
* Initialize iterator for traversing utf-8 string as a string of code units
*
@ -455,6 +479,48 @@ lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *iter_p)
return (iter_p->buf_pos.offset == 0 && iter_p->buf_pos.is_non_bmp_middle == false);
} /* lit_utf8_iterator_is_bos */
/**
 * Report where the iterator currently stands inside its buffer
 *
 * @return offset of the iterator, in bytes, counted from the beginning of the iterated buffer
 */
lit_utf8_size_t
lit_utf8_iterator_get_offset (const lit_utf8_iterator_t *iter_p) /**< iterator */
{
  const lit_utf8_size_t byte_offset = iter_p->buf_pos.offset;

  return byte_offset;
} /* lit_utf8_iterator_get_offset */
/**
 * Reposition the iterator at the specified byte offset
 *
 * The offset is asserted not to exceed the buffer size. When JERRY_NDEBUG is not defined, an offset
 * that lies inside the buffer is additionally asserted not to point at a byte that carries the
 * utf-8 extra byte marker. The non-BMP-middle flag of the position is cleared.
 */
void
lit_utf8_iterator_set_offset (lit_utf8_iterator_t *iter_p, /**< pointer to iterator */
                              lit_utf8_size_t offset) /**< offset from the beginning of the iterated buffer */
{
  JERRY_ASSERT (offset <= iter_p->buf_size);

#ifndef JERRY_NDEBUG
  if (offset < iter_p->buf_size)
  {
    /* offset == buf_size is the end position, so there is no byte to inspect in that case */
    const lit_utf8_byte_t target_byte = iter_p->buf_p[offset];

    JERRY_ASSERT ((target_byte & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER);
  }
#endif

  iter_p->buf_pos.is_non_bmp_middle = false;
  iter_p->buf_pos.offset = offset & LIT_ITERATOR_OFFSET_MASK;
} /* lit_utf8_iterator_set_offset */
/**
* Get pointer to the current utf-8 char which iterator points to
*
* @return: pointer to utf-8 char
*/
lit_utf8_byte_t *
lit_utf8_iterator_get_ptr (const lit_utf8_iterator_t *iter_p) /**< iterator */
{
return (lit_utf8_byte_t *) iter_p->buf_p + iter_p->buf_pos.offset;
} /* lit_utf8_iterator_get_ptr */
/**
* Calculate size of a zero-terminated utf-8 string
*
@ -702,6 +768,28 @@ lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
}
} /* lit_code_point_to_utf8 */
/**
 * Combine a surrogate pair of code units into the code point it encodes
 *
 * The position of the high surrogate within its range supplies the upper bits (shifted left by
 * LIT_UTF16_BITS_IN_SURROGATE), LIT_UTF16_FIRST_SURROGATE_CODE_POINT is added to that, and the
 * position of the low surrogate within its range supplies the lower bits.
 *
 * Both arguments are asserted to belong to their respective surrogate ranges.
 *
 * @return code point
 */
lit_code_point_t
lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code unit */
                                          ecma_char_t low_surrogate) /**< low surrogate code unit */
{
  JERRY_ASSERT (lit_is_code_unit_high_surrogate (high_surrogate));
  JERRY_ASSERT (lit_is_code_unit_low_surrogate (low_surrogate));

  const lit_code_point_t high_part = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN);
  const lit_code_point_t low_part = (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN);

  return ((high_part << LIT_UTF16_BITS_IN_SURROGATE) + LIT_UTF16_FIRST_SURROGATE_CODE_POINT) | low_part;
} /* lit_convert_surrogate_pair_to_code_point */
/**
* Compare utf-8 string to utf-8 string
*
@ -757,3 +845,20 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**<
return (lit_utf8_iterator_is_eos (&iter1) && !lit_utf8_iterator_is_eos (&iter2));
} /* lit_compare_utf8_strings_relational */
/**
 * Write a code unit to standard output
 *
 * Code units not greater than LIT_UTF8_1_BYTE_CODE_POINT_MAX are passed to putchar unchanged;
 * any other code unit is printed as the placeholder character '_'.
 */
void
lit_put_ecma_char (ecma_char_t ecma_char) /**< code unit */
{
  if (ecma_char > LIT_UTF8_1_BYTE_CODE_POINT_MAX)
  {
    FIXME ("Support unicode characters printing.");
    putchar ('_');
    return;
  }

  putchar (ecma_char);
} /* lit_put_ecma_char */

View File

@ -17,7 +17,6 @@
#define LIT_UNICODE_HELPERS_H
#include "jrt.h"
#include "lit-char-helpers.h"
#include "lit-globals.h"
/**
@ -27,7 +26,7 @@
/**
* For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The
* Unicode Standard (http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404, tables 3-6, 3-7).
* Unicode Standard (http://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#G7404).
*/
#define LIT_UNICODE_CODE_POINT_NULL (0x0)
#define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF)
@ -112,6 +111,10 @@ typedef struct
/* validation */
bool lit_is_utf8_string_valid (const lit_utf8_byte_t *, lit_utf8_size_t);
/* checks */
/* true if the code unit is within [LIT_UTF16_LOW_SURROGATE_MIN, LIT_UTF16_LOW_SURROGATE_MAX] */
bool lit_is_code_unit_low_surrogate (ecma_char_t);
/* true if the code unit is within [LIT_UTF16_HIGH_SURROGATE_MIN, LIT_UTF16_HIGH_SURROGATE_MAX] */
bool lit_is_code_unit_high_surrogate (ecma_char_t);
/* iteration */
lit_utf8_iterator_t lit_utf8_iterator_create (const lit_utf8_byte_t *, lit_utf8_size_t);
@ -136,6 +139,11 @@ ecma_char_t lit_utf8_iterator_read_prev (lit_utf8_iterator_t *);
bool lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *);
bool lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *);
/* current offset of the iterator, in bytes, from the beginning of the iterated buffer */
lit_utf8_size_t lit_utf8_iterator_get_offset (const lit_utf8_iterator_t *);
/* move the iterator to the given byte offset (asserted not to exceed the buffer size) */
void lit_utf8_iterator_set_offset (lit_utf8_iterator_t *, lit_utf8_size_t);
/* pointer into the iterated buffer at the iterator's current byte offset */
lit_utf8_byte_t *lit_utf8_iterator_get_ptr (const lit_utf8_iterator_t *);
/* size */
lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *);
@ -152,6 +160,7 @@ lit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (lit_utf8_byte_t);
/* conversion */
lit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t, lit_utf8_byte_t *);
lit_utf8_size_t lit_code_point_to_utf8 (lit_code_point_t, lit_utf8_byte_t *);
/* combine a surrogate pair (high code unit first, low code unit second) into one code point */
lit_code_point_t lit_convert_surrogate_pair_to_code_point (ecma_char_t, ecma_char_t);
/* comparison */
bool lit_compare_utf8_strings (const lit_utf8_byte_t *,
@ -169,4 +178,7 @@ lit_utf8_size_t lit_read_code_point_from_utf8 (const lit_utf8_byte_t *,
lit_utf8_size_t,
lit_code_point_t *);
/* print */
/* write a code unit to standard output; units above LIT_UTF8_1_BYTE_CODE_POINT_MAX are printed as '_' */
void lit_put_ecma_char (ecma_char_t);
#endif /* LIT_UNICODE_HELPERS_H */