Add helper functions for implementing unicode support in lexer.

JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
2025-12-15 16:29:21 +00:00 · 2015-07-03 21:41:03 +03:00 · 2015-07-03 21:41:03 +03:00 · d248d0944c
commit d248d0944c
parent c21399cd58
5 changed files with 122 additions and 3 deletions
--- a/jerry-core/ecma/builtin-objects/ecma-builtin-helpers-json.cpp
+++ b/jerry-core/ecma/builtin-objects/ecma-builtin-helpers-json.cpp
@ -24,6 +24,7 @@
 #include "ecma-alloc.h"
 #include "ecma-helpers.h"
 #include "ecma-builtin-helpers.h"
+#include "lit-char-helpers.h"

 #define LIST_BLOCK_SIZE 256UL

--- a/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp
+++ b/jerry-core/ecma/builtin-objects/ecma-builtin-json.cpp
@ -28,6 +28,7 @@
 #include "ecma-try-catch-macro.h"
 #include "jrt.h"
 #include "jrt-libc-includes.h"
+#include "lit-char-helpers.h"

 #ifndef CONFIG_ECMA_COMPACT_PROFILE_DISABLE_JSON_BUILTIN

--- a/jerry-core/lit/lit-literal.cpp
+++ b/jerry-core/lit/lit-literal.cpp
@ -594,7 +594,7 @@ lit_charset_record_get_length (literal_t lit) /**< literal */
    lit_iter.skip (bytes_to_skip);
    i += bytes_to_skip;

-    length++;
+    length += (bytes_to_skip > LIT_UTF8_MAX_BYTES_IN_CODE_UNIT) ? 2 : 1;
  }

 #ifndef JERRY_NDEBUG
--- a/jerry-core/lit/lit-strings.cpp
+++ b/jerry-core/lit/lit-strings.cpp
@ -17,6 +17,8 @@

 #include "jrt-libc-includes.h"

+JERRY_STATIC_ASSERT (sizeof (lit_utf8_iterator_pos_t) == sizeof (lit_utf8_size_t));
+
 /**
 * Validate utf-8 string
 *
@ -117,6 +119,28 @@ lit_is_utf8_string_valid (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
  return true;
 } /* lit_is_utf8_string_valid */

+/**
+ * Check if the code unit is a low surrogate
+ *
+ * @return true if code unit is in [LIT_UTF16_LOW_SURROGATE_MIN, LIT_UTF16_LOW_SURROGATE_MAX], false otherwise
+ */
+bool
+lit_is_code_unit_low_surrogate (ecma_char_t code_unit) /**< code unit */
+{
+  return LIT_UTF16_LOW_SURROGATE_MIN <= code_unit && code_unit <= LIT_UTF16_LOW_SURROGATE_MAX;
+} /* lit_is_code_unit_low_surrogate */
+
+/**
+ * Check if the code unit is a high surrogate
+ *
+ * @return true if code unit is in [LIT_UTF16_HIGH_SURROGATE_MIN, LIT_UTF16_HIGH_SURROGATE_MAX], false otherwise
+ */
+bool
+lit_is_code_unit_high_surrogate (ecma_char_t code_unit) /**< code unit */
+{
+  return LIT_UTF16_HIGH_SURROGATE_MIN <= code_unit && code_unit <= LIT_UTF16_HIGH_SURROGATE_MAX;
+} /* lit_is_code_unit_high_surrogate */
+
 /**
 * Initialize iterator for traversing utf-8 string as a string of code units
 *
@ -455,6 +479,48 @@ lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *iter_p)
  return (iter_p->buf_pos.offset == 0 && iter_p->buf_pos.is_non_bmp_middle == false);
 } /* lit_utf8_iterator_is_bos */

+/**
+ * Get offset of the iterator
+ *
+ * @return current offset in bytes of the iterator from the beginning of the buffer
+ */
+lit_utf8_size_t
+lit_utf8_iterator_get_offset (const lit_utf8_iterator_t *iter_p) /**< iterator */
+{
+  return iter_p->buf_pos.offset;
+} /* lit_utf8_iterator_get_offset */
+
+/**
+ * Set iterator to point to specified offset (the is_non_bmp_middle flag of the position is reset to false)
+ */
+void
+lit_utf8_iterator_set_offset (lit_utf8_iterator_t *iter_p, /**< pointer to iterator */
+                              lit_utf8_size_t offset) /**< offset from the beginning of the iterated buffer */
+{
+  JERRY_ASSERT (offset <= iter_p->buf_size);

+#ifndef JERRY_NDEBUG
+  if (offset < iter_p->buf_size) /* offset == buf_size: there is no byte at that position to check */
+  {
+    JERRY_ASSERT (((*(iter_p->buf_p + offset)) & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER);
+  }
+#endif

+  iter_p->buf_pos.offset = (offset) & LIT_ITERATOR_OFFSET_MASK;
+  iter_p->buf_pos.is_non_bmp_middle = false;
+} /* lit_utf8_iterator_set_offset */
+
+/**
+ * Get pointer to the byte of the iterated buffer at the iterator's current offset
+ *
+ * @return pointer into the buffer (buf_p + current offset), with the const qualifier of the buffer cast away
+ */
+lit_utf8_byte_t *
+lit_utf8_iterator_get_ptr (const lit_utf8_iterator_t *iter_p) /**< iterator */
+{
+  return (lit_utf8_byte_t *) iter_p->buf_p + iter_p->buf_pos.offset;
+} /* lit_utf8_iterator_get_ptr */
+
 /**
 * Calculate size of a zero-terminated utf-8 string
 *
@ -702,6 +768,28 @@ lit_code_point_to_utf8 (lit_code_point_t code_point, /**< code point */
  }
 } /* lit_code_unit_to_utf8 */

+/**
+ * Convert surrogate pair (high and low surrogate code units) to code point
+ *
+ * @return code point
+ */
+lit_code_point_t
+lit_convert_surrogate_pair_to_code_point (ecma_char_t high_surrogate, /**< high surrogate code unit */
+                                          ecma_char_t low_surrogate) /**< low surrogate code unit */
+{
+  JERRY_ASSERT (lit_is_code_unit_high_surrogate (high_surrogate));
+  JERRY_ASSERT (lit_is_code_unit_low_surrogate (low_surrogate));

+  lit_code_point_t code_point;
+  code_point = (uint16_t) (high_surrogate - LIT_UTF16_HIGH_SURROGATE_MIN);
+  code_point <<= LIT_UTF16_BITS_IN_SURROGATE;

+  code_point += LIT_UTF16_FIRST_SURROGATE_CODE_POINT;

+  code_point |= (uint16_t) (low_surrogate - LIT_UTF16_LOW_SURROGATE_MIN);
+  return code_point;
+} /* lit_convert_surrogate_pair_to_code_point */
+
 /**
 * Compare utf-8 string to utf-8 string
 *
@ -757,3 +845,20 @@ bool lit_compare_utf8_strings_relational (const lit_utf8_byte_t *string1_p, /**<

  return (lit_utf8_iterator_is_eos (&iter1) && !lit_utf8_iterator_is_eos (&iter2));
 } /* lit_compare_utf8_strings_relational */
+
+/**
+ * Print code unit to standard output ('_' is printed for code units above LIT_UTF8_1_BYTE_CODE_POINT_MAX)
+ */
+void
+lit_put_ecma_char (ecma_char_t ecma_char) /**< code unit */
+{
+  if (ecma_char <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
+  {
+    putchar (ecma_char);
+  }
+  else
+  {
+    FIXME ("Support unicode characters printing.");
+    putchar ('_');
+  }
+} /* lit_put_ecma_char */
--- a/jerry-core/lit/lit-strings.h
+++ b/jerry-core/lit/lit-strings.h
@ -17,7 +17,6 @@
 #define LIT_UNICODE_HELPERS_H

 #include "jrt.h"
-#include "lit-char-helpers.h"
 #include "lit-globals.h"

 /**
@ -27,7 +26,7 @@

 /**
 * For the formal definition of Unicode transformation formats (UTF) see Section 3.9, Unicode Encoding Forms in The
- * Unicode Standard (http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404, tables 3-6, 3-7).
+ * Unicode Standard (http://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#G7404).
 */
 #define LIT_UNICODE_CODE_POINT_NULL (0x0)
 #define LIT_UNICODE_CODE_POINT_MAX (0x10FFFF)
@ -112,6 +111,10 @@ typedef struct
 /* validation */
 bool lit_is_utf8_string_valid (const lit_utf8_byte_t *, lit_utf8_size_t);

+/* checks */
+bool lit_is_code_unit_low_surrogate (ecma_char_t);
+bool lit_is_code_unit_high_surrogate (ecma_char_t);
+
 /* iteration */
 lit_utf8_iterator_t lit_utf8_iterator_create (const lit_utf8_byte_t *, lit_utf8_size_t);

@ -136,6 +139,11 @@ ecma_char_t lit_utf8_iterator_read_prev (lit_utf8_iterator_t *);
 bool lit_utf8_iterator_is_eos (const lit_utf8_iterator_t *);
 bool lit_utf8_iterator_is_bos (const lit_utf8_iterator_t *);

+lit_utf8_size_t lit_utf8_iterator_get_offset (const lit_utf8_iterator_t *);
+void lit_utf8_iterator_set_offset (lit_utf8_iterator_t *, lit_utf8_size_t);
+
+lit_utf8_byte_t *lit_utf8_iterator_get_ptr (const lit_utf8_iterator_t *);
+
 /* size */
 lit_utf8_size_t lit_zt_utf8_string_size (const lit_utf8_byte_t *);

@ -152,6 +160,7 @@ lit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (lit_utf8_byte_t);
 /* conversion */
 lit_utf8_size_t lit_code_unit_to_utf8 (ecma_char_t, lit_utf8_byte_t *);
 lit_utf8_size_t lit_code_point_to_utf8 (lit_code_point_t, lit_utf8_byte_t *);
+lit_code_point_t lit_convert_surrogate_pair_to_code_point (ecma_char_t, ecma_char_t);

 /* comparison */
 bool lit_compare_utf8_strings (const lit_utf8_byte_t *,
@ -169,4 +178,7 @@ lit_utf8_size_t lit_read_code_point_from_utf8 (const lit_utf8_byte_t *,
                                               lit_utf8_size_t,
                                               lit_code_point_t *);

+/* print */
+void lit_put_ecma_char (ecma_char_t);
+
 #endif /* LIT_UNICODE_HELPERS_H */