From d248d0944cdd673ab90160275ed1e8652de9e3c0 Mon Sep 17 00:00:00 2001 From: Andrey Shitov Date: Fri, 3 Jul 2015 21:41:03 +0300 Subject: [PATCH] Add helper functions for implementing unicode support in lexer. JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com --- .../ecma-builtin-helpers-json.cpp | 1 + .../builtin-objects/ecma-builtin-json.cpp | 1 + jerry-core/lit/lit-literal.cpp | 2 +- jerry-core/lit/lit-strings.cpp | 105 ++++++++++++++++++ jerry-core/lit/lit-strings.h | 16 ++- 5 files changed, 122 insertions(+), 3 deletions(-) diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-helpers-json.cpp b/jerry-core/ecma/builtin-objects/ecma-builtin-helpers-json.cpp index f650f360e..c05b8cd56 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-helpers-json.cpp +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-helpers-json.cpp @@ -24,6 +24,7 @@ #include "ecma-alloc.h" #include "ecma-helpers.h" #include "ecma-builtin-helpers.h" +#include "lit-char-helpers.h" #define LIST_BLOCK_SIZE 256UL diff --git a/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp b/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp index d34717d75..3420096f2 100644 --- a/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp +++ b/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp @@ -28,6 +28,7 @@ #include "ecma-try-catch-macro.h" #include "jrt.h" #include "jrt-libc-includes.h" +#include "lit-char-helpers.h" #ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_JSON_BUILTIN diff --git a/jerry-core/lit/lit-literal.cpp b/jerry-core/lit/lit-literal.cpp index abc9a5ea4..b96a2658b 100644 --- a/jerry-core/lit/lit-literal.cpp +++ b/jerry-core/lit/lit-literal.cpp @@ -594,7 +594,7 @@ lit_charset_record_get_length (literal_t lit) /**< literal */ lit_iter.skip (bytes_to_skip); i += bytes_to_skip; - length++; + length += (bytes_to_skip > LIT_UTF8_MAX_BYTES_IN_CODE_UNIT) ? 
2 : 1; } #ifndef JERRY_NDEBUG diff --git a/jerry-core/lit/lit-strings.cpp b/jerry-core/lit/lit-strings.cpp index 609db1e6e..a1227b136 100644 --- a/jerry-core/lit/lit-strings.cpp +++ b/jerry-core/lit/lit-strings.cpp @@ -17,6 +17,8 @@ #include "jrt-libc-includes.h" +JERRY_STATIC_ASSERT (sizeof (lit_utf8_iterator_pos_t) == sizeof (lit_utf8_size_t)); + /** * Validate utf-8 string * @@ -117,6 +119,28 @@ lit_is_utf8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string * return true; } /* lit_is_utf8_string_valid */ +/** + * Check if the code unit type is low surrogate + * + * @return true / false + */ +bool +lit_is_code_unit_low_surrogate (ecma_char_t code_unit) /**< code unit */ +{ + return LIT_UTF16_LOW_SURROGATE_MIN <= code_unit && code_unit <= LIT_UTF16_LOW_SURROGATE_MAX; +} /* lit_is_code_unit_low_surrogate */ + +/** + * Check if the code unit type is high surrogate + * + * @return true / false + */ +bool +lit_is_code_unit_high_surrogate (ecma_char_t code_unit) /**< code unit */ +{ + return LIT_UTF16_HIGH_SURROGATE_MIN <= code_unit && code_unit <= LIT_UTF16_HIGH_SURROGATE_MAX; +} /* lit_is_code_unit_high_surrogate */ + /** * Initialize iterator for traversing utf-8 string as a string of code units * @@ -455,6 +479,48 @@ lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *iter_p) return (iter_p->buf_pos.offset == 0 && iter_p->buf_pos.is_non_bmp_middle == false); } /* lit_utf8_iterator_is_bos */ +/** + * Get offset of the iterator + * + * @return: current offset in bytes of the iterator from the beginning of buffer + */ +lit_utf8_size_t +lit_utf8_iterator_get_offset (const lit_utf8_iterator_t *iter_p) /**< iterator */ +{ + return iter_p->buf_pos.offset; +} /* lit_utf8_iterator_get_offset */ + +/** + * Set iterator to point to specified offset + */ +void +lit_utf8_iterator_set_offset (lit_utf8_iterator_t *iter_p, /**< pointer to iterator */ + lit_utf8_size_t offset) /**< offset from the beginning of the iterated buffer */ +{ + JERRY_ASSERT (offset <= 
iter_p->buf_size); + +#ifndef JERRY_NDEBUG + if (offset < iter_p->buf_size) + { + JERRY_ASSERT (((*(iter_p->buf_p + offset)) & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER); + } +#endif + + iter_p->buf_pos.offset = (offset) & LIT_ITERATOR_OFFSET_MASK; + iter_p->buf_pos.is_non_bmp_middle = false; +} /* lit_utf8_iterator_set_offset */ + +/** + * Get pointer to the current utf-8 char which iterator points to + * + * @return: pointer to utf-8 char + */ +lit_utf8_byte_t * +lit_utf8_iterator_get_ptr (const lit_utf8_iterator_t *iter_p) /**< iterator */ +{ + return (lit_utf8_byte_t *) iter_p->buf_p + iter_p->buf_pos.offset; +} /* lit_utf8_iterator_get_ptr */ + /** * Calculate size of a zero-terminated utf-8 string * @@ -702,6 +768,28 @@ lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */ } } /* lit_code_unit_to_utf8 */ +/** + * Convert surrogate pair to code point + * + * @return code point + */ +lit_code_point_t +lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code unit */ + ecma_char_t low_surrogate) /**< low surrogate code unit */ +{ + JERRY_ASSERT (lit_is_code_unit_high_surrogate (high_surrogate)); + JERRY_ASSERT (lit_is_code_unit_low_surrogate (low_surrogate)); + + lit_code_point_t code_point; + code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN); + code_point <<= LIT_UTF16_BITS_IN_SURROGATE; + + code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT; + + code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN); + return code_point; +} /* lit_convert_surrogate_pair_to_code_point */ + /** * Compare utf-8 string to utf-8 string * @@ -757,3 +845,20 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**< return (lit_utf8_iterator_is_eos (&iter1) && !lit_utf8_iterator_is_eos (&iter2)); } /* lit_compare_utf8_strings_relational */ + +/** + * Print code unit to standard output + */ +void +lit_put_ecma_char (ecma_char_t ecma_char) /**< code unit */ +{ + 
if (ecma_char <= LIT_UTF8_1_BYTE_CODE_POINT_MAX) + { + putchar (ecma_char); + } + else + { + FIXME ("Support unicode characters printing."); + putchar ('_'); + } +} /* lit_put_ecma_char */ diff --git a/jerry-core/lit/lit-strings.h b/jerry-core/lit/lit-strings.h index 0c3fffa7a..e1bf5bd85 100644 --- a/jerry-core/lit/lit-strings.h +++ b/jerry-core/lit/lit-strings.h @@ -17,7 +17,6 @@ #define LIT_UNICODE_HELPERS_H #include "jrt.h" -#include "lit-char-helpers.h" #include "lit-globals.h" /** @@ -27,7 +26,7 @@ /** * For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The - * Unicode Standard (http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404, tables 3-6, 3-7). + * Unicode Standard (http://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#G7404). */ #define LIT_UNICODE_CODE_POINT_NULL (0x0) #define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF) @@ -112,6 +111,10 @@ typedef struct /* validation */ bool lit_is_utf8_string_valid (const lit_utf8_byte_t *, lit_utf8_size_t); +/* checks */ +bool lit_is_code_unit_low_surrogate (ecma_char_t); +bool lit_is_code_unit_high_surrogate (ecma_char_t); + /* iteration */ lit_utf8_iterator_t lit_utf8_iterator_create (const lit_utf8_byte_t *, lit_utf8_size_t); @@ -136,6 +139,11 @@ ecma_char_t lit_utf8_iterator_read_prev (lit_utf8_iterator_t *); bool lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *); bool lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *); +lit_utf8_size_t lit_utf8_iterator_get_offset (const lit_utf8_iterator_t *); +void lit_utf8_iterator_set_offset (lit_utf8_iterator_t *, lit_utf8_size_t); + +lit_utf8_byte_t *lit_utf8_iterator_get_ptr (const lit_utf8_iterator_t *); + /* size */ lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *); @@ -152,6 +160,7 @@ lit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (lit_utf8_byte_t); /* conversion */ lit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t, lit_utf8_byte_t *); lit_utf8_size_t 
lit_code_point_to_utf8 (lit_code_point_t, lit_utf8_byte_t *); +lit_code_point_t lit_convert_surrogate_pair_to_code_point (ecma_char_t, ecma_char_t); /* comparison */ bool lit_compare_utf8_strings (const lit_utf8_byte_t *, @@ -169,4 +178,7 @@ lit_utf8_size_t lit_read_code_point_from_utf8 (const lit_utf8_byte_t *, lit_utf8_size_t, lit_code_point_t *); +/* print */ +void lit_put_ecma_char (ecma_char_t); + #endif /* LIT_UNICODE_HELPERS_H */