Use code unit instead of code point

JerryScript-DCO-1.0-Signed-off-by: László Langó llango.u-szeged@partner.samsung.com
2025-12-15 16:29:21 +00:00 · 2016-03-11 09:44:23 +01:00 · 2016-03-11 09:44:23 +01:00 · e1f20ad474
commit e1f20ad474
parent 005f73a6f0
7 changed files with 70 additions and 72 deletions
--- a/jerry-core/ecma/builtin-objects/ecma-builtin-global.c
+++ b/jerry-core/ecma/builtin-objects/ecma-builtin-global.c
@@ -1,4 +1,4 @@
-/* Copyright 2014-2015 Samsung Electronics Co., Ltd.
+/* Copyright 2014-2016 Samsung Electronics Co., Ltd.
 * Copyright 2015-2016 University of Szeged.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -96,25 +96,25 @@ ecma_builtin_global_object_print (ecma_value_t this_arg __attr_unused___, /**< t

    while (utf8_str_curr_p < utf8_str_end_p)
    {
-      ecma_char_t code_point = lit_utf8_read_next (&utf8_str_curr_p);
+      ecma_char_t code_unit = lit_utf8_read_next (&utf8_str_curr_p);

-      if (code_point == LIT_CHAR_NULL)
+      if (code_unit == LIT_CHAR_NULL)
      {
        printf ("\\u0000");
      }
-      else if (code_point <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
+      else if (code_unit <= LIT_UTF8_1_BYTE_CODE_POINT_MAX)
      {
-        printf ("%c", (char) code_point);
+        printf ("%c", (char) code_unit);
      }
      else
      {
-        JERRY_STATIC_ASSERT (sizeof (code_point) == 2,
+        JERRY_STATIC_ASSERT (sizeof (code_unit) == 2,
                             size_of_code_point_must_be_equal_to_2_bytes);

-        uint32_t byte_high = (uint32_t) JRT_EXTRACT_BIT_FIELD (ecma_char_t, code_point,
+        uint32_t byte_high = (uint32_t) JRT_EXTRACT_BIT_FIELD (ecma_char_t, code_unit,
                                                               JERRY_BITSINBYTE,
                                                               JERRY_BITSINBYTE);
-        uint32_t byte_low = (uint32_t) JRT_EXTRACT_BIT_FIELD (ecma_char_t, code_point,
+        uint32_t byte_low = (uint32_t) JRT_EXTRACT_BIT_FIELD (ecma_char_t, code_unit,
                                                              0,
                                                              JERRY_BITSINBYTE);

@@ -801,9 +801,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
      continue;
    }

-    lit_code_point_t decoded_byte;
+    ecma_char_t decoded_byte;

-    if (!lit_read_code_point_from_hex (input_char_p + 1, 2, &decoded_byte))
+    if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
    {
      ret_value = ecma_raise_uri_error (ECMA_ERR_MSG (""));
      break;
@@ -857,9 +857,9 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
        continue;
      }

-      lit_code_point_t decoded_byte;
+      ecma_char_t decoded_byte;

-      if (!lit_read_code_point_from_hex (input_char_p + 1, 2, &decoded_byte))
+      if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &decoded_byte))
      {
        ret_value = ecma_raise_uri_error (ECMA_ERR_MSG (""));
        break;
@@ -916,16 +916,16 @@ ecma_builtin_global_object_decode_uri_helper (ecma_value_t uri __attr_unused___,
          }
          else
          {
-            lit_code_point_t cp;
+            ecma_char_t chr;

-            if (!lit_read_code_point_from_hex (input_char_p + 1, 2, &cp)
-                || ((cp & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER))
+            if (!lit_read_code_unit_from_hex (input_char_p + 1, 2, &chr)
+                || ((chr & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER))
            {
              is_valid = false;
              break;
            }

-            octets[i] = (lit_utf8_byte_t) cp;
+            octets[i] = (lit_utf8_byte_t) chr;
            input_char_p += URI_ENCODED_BYTE_SIZE;
          }
        }
--- a/jerry-core/ecma/builtin-objects/ecma-builtin-json.c
+++ b/jerry-core/ecma/builtin-objects/ecma-builtin-json.c
@@ -178,15 +178,15 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
        }
        case LIT_CHAR_LOWERCASE_U:
        {
-          lit_code_point_t code_point;
+          ecma_char_t code_unit;

-          if (!(lit_read_code_point_from_hex (current_p + 1, 4, &code_point)))
+          if (!(lit_read_code_unit_from_hex (current_p + 1, 4, &code_unit)))
          {
            return;
          }

          current_p += 5;
-          write_p += lit_code_point_to_cesu8 (code_point, write_p);
+          write_p += lit_code_unit_to_utf8 (code_unit, write_p);
          continue;
        }
        default:
--- a/jerry-core/lit/lit-char-helpers.c
+++ b/jerry-core/lit/lit-char-helpers.c
@@ -1,4 +1,5 @@
-/* Copyright 2015 Samsung Electronics Co., Ltd.
+/* Copyright 2015-2016 Samsung Electronics Co., Ltd.
+ * Copyright 2016 University of Szeged.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -283,38 +284,38 @@ lit_char_hex_to_int (ecma_char_t c) /**< code unit, corresponding to

 /**
 * Parse the next number_of_characters hexadecimal character,
- * and construct a code point from them. The buffer must
+ * and construct a code unit from them. The buffer must
 * be zero terminated.
 *
 * @return true if decoding was successful, false otherwise
 */
 bool
-lit_read_code_point_from_hex (lit_utf8_byte_t *buf_p, /**< buffer with characters */
-                              lit_utf8_size_t number_of_characters, /**< number of characters to be read */
-                              lit_code_point_t *out_code_point_p) /**< [out] decoded result */
+lit_read_code_unit_from_hex (lit_utf8_byte_t *buf_p, /**< buffer with characters */
+                             lit_utf8_size_t number_of_characters, /**< number of characters to be read */
+                             ecma_char_ptr_t out_code_unit_p) /**< [out] decoded result */
 {
-  lit_code_point_t code_point = 0;
+  ecma_char_t code_unit = LIT_CHAR_NULL;

  JERRY_ASSERT (number_of_characters >= 2 && number_of_characters <= 4);

  for (lit_utf8_size_t i = 0; i < number_of_characters; i++)
  {
-    code_point <<= 4;
+    code_unit = (ecma_char_t) (code_unit << 4u);

    if (*buf_p >= LIT_CHAR_ASCII_DIGITS_BEGIN
        && *buf_p <= LIT_CHAR_ASCII_DIGITS_END)
    {
-      code_point |= (uint32_t) (*buf_p - LIT_CHAR_ASCII_DIGITS_BEGIN);
+      code_unit |= (ecma_char_t) (*buf_p - LIT_CHAR_ASCII_DIGITS_BEGIN);
    }
    else if (*buf_p >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN
             && *buf_p <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_END)
    {
-      code_point |= (uint32_t) (*buf_p - (LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN - 10));
+      code_unit |= (ecma_char_t) (*buf_p - (LIT_CHAR_ASCII_LOWERCASE_LETTERS_HEX_BEGIN - 10));
    }
    else if (*buf_p >= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN
             && *buf_p <= LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_END)
    {
-      code_point |= (uint32_t) (*buf_p - (LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN - 10));
+      code_unit |= (ecma_char_t) (*buf_p - (LIT_CHAR_ASCII_UPPERCASE_LETTERS_HEX_BEGIN - 10));
    }
    else
    {
@@ -324,9 +325,9 @@ lit_read_code_point_from_hex (lit_utf8_byte_t *buf_p, /**< buffer with character
    buf_p++;
  }

-  *out_code_point_p = code_point;
+  *out_code_unit_p = code_unit;
  return true;
-} /* lit_read_code_point_from_hex */
+} /* lit_read_code_unit_from_hex */

 /**
 * Check if specified character is a word character (part of IsWordChar abstract operation)
--- a/jerry-core/lit/lit-char-helpers.h
+++ b/jerry-core/lit/lit-char-helpers.h
@@ -1,4 +1,5 @@
 /* Copyright 2015-2016 Samsung Electronics Co., Ltd.
+ * Copyright 2016 University of Szeged.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,6 +19,8 @@

 #include "lit-globals.h"

+#define LIT_CHAR_UNDEF ((ecma_char_t) 0xFFFF) /* undefined character */
+
 /*
 * Format control characters (ECMA-262 v5, Table 1)
 */
@@ -213,7 +216,7 @@ extern bool lit_char_is_hex_digit (ecma_char_t);
 extern uint32_t lit_char_hex_to_int (ecma_char_t);

 /* read a hex encoded code point from a zero terminated buffer */
-bool lit_read_code_point_from_hex (lit_utf8_byte_t *, lit_utf8_size_t, lit_code_point_t *);
+bool lit_read_code_unit_from_hex (lit_utf8_byte_t *, lit_utf8_size_t, ecma_char_ptr_t);

 /**
 * Null character
--- a/jerry-core/parser/regexp/re-compiler.c
+++ b/jerry-core/parser/regexp/re-compiler.c
@@ -41,12 +41,12 @@
 */
 static void
 re_append_char_class (void *re_ctx_p, /**< RegExp compiler context */
-                      uint32_t start, /**< character class range from */
-                      uint32_t end) /**< character class range to */
+                      ecma_char_t start, /**< character class range from */
+                      ecma_char_t end) /**< character class range to */
 {
  re_compiler_ctx_t *ctx_p = (re_compiler_ctx_t *) re_ctx_p;
-  re_append_char (ctx_p->bytecode_ctx_p, (ecma_char_t) start);
-  re_append_char (ctx_p->bytecode_ctx_p, (ecma_char_t) end);
+  re_append_char (ctx_p->bytecode_ctx_p, start);
+  re_append_char (ctx_p->bytecode_ctx_p, end);
  ctx_p->parser_ctx_p->num_of_classes++;
 } /* re_append_char_class */

--- a/jerry-core/parser/regexp/re-parser.c
+++ b/jerry-core/parser/regexp/re-parser.c
@@ -315,7 +315,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
 {
  re_token_type_t token_type = ((re_compiler_ctx_t *) re_ctx_p)->current_token.type;
  out_token_p->qmax = out_token_p->qmin = 1;
-  uint32_t start = RE_CHAR_UNDEF;
+  ecma_char_t start = LIT_CHAR_UNDEF;
  bool is_range = false;
  parser_ctx_p->num_of_classes = 0;

@@ -332,11 +332,11 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
      return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string"));
    }

-    uint32_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p);
+    ecma_char_t ch = lit_utf8_read_next (&parser_ctx_p->input_curr_p);

    if (ch == LIT_CHAR_RIGHT_SQUARE)
    {
-      if (start != RE_CHAR_UNDEF)
+      if (start != LIT_CHAR_UNDEF)
      {
        append_char_class (re_ctx_p, start, start);
      }
@@ -349,7 +349,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
        return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '-'"));
      }

-      if (start != RE_CHAR_UNDEF
+      if (start != LIT_CHAR_UNDEF
          && !is_range
          && *parser_ctx_p->input_curr_p != LIT_CHAR_RIGHT_SQUARE)
      {
@@ -412,40 +412,40 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
      }
      else if (ch == LIT_CHAR_LOWERCASE_X)
      {
-        lit_code_point_t code_point;
+        ecma_char_t code_unit;

-        if (!lit_read_code_point_from_hex (parser_ctx_p->input_curr_p, 2, &code_point))
+        if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 2, &code_unit))
        {
          return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\x'"));
        }

        parser_ctx_p->input_curr_p += 2;
-        append_char_class (re_ctx_p, code_point, code_point);
+        append_char_class (re_ctx_p, code_unit, code_unit);
      }
      else if (ch == LIT_CHAR_LOWERCASE_U)
      {
-        lit_code_point_t code_point;
+        ecma_char_t code_unit;

-        if (!lit_read_code_point_from_hex (parser_ctx_p->input_curr_p, 4, &code_point))
+        if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 4, &code_unit))
        {
          return ecma_raise_syntax_error (ECMA_ERR_MSG ("invalid character class, end of string after '\\u'"));
        }

        parser_ctx_p->input_curr_p += 4;
-        append_char_class (re_ctx_p, code_point, code_point);
+        append_char_class (re_ctx_p, code_unit, code_unit);
      }
      else if (ch == LIT_CHAR_LOWERCASE_D)
      {
        /* See ECMA-262 v5, 15.10.2.12 */
        append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_BEGIN, LIT_CHAR_ASCII_DIGITS_END);
-        ch = RE_CHAR_UNDEF;
+        ch = LIT_CHAR_UNDEF;
      }
      else if (ch == LIT_CHAR_UPPERCASE_D)
      {
        /* See ECMA-262 v5, 15.10.2.12 */
        append_char_class (re_ctx_p, LIT_CHAR_NULL, LIT_CHAR_ASCII_DIGITS_BEGIN - 1);
        append_char_class (re_ctx_p, LIT_CHAR_ASCII_DIGITS_END + 1, LIT_UTF16_CODE_UNIT_MAX);
-        ch = RE_CHAR_UNDEF;
+        ch = LIT_CHAR_UNDEF;
      }
      else if (ch == LIT_CHAR_LOWERCASE_S)
      {
@@ -461,7 +461,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
        append_char_class (re_ctx_p, 0x205FUL, 0x205FUL); /* Medium Mathematical Space */
        append_char_class (re_ctx_p, 0x3000UL, 0x3000UL); /* Ideographic Space */
        append_char_class (re_ctx_p, LIT_CHAR_BOM, LIT_CHAR_BOM);
-        ch = RE_CHAR_UNDEF;
+        ch = LIT_CHAR_UNDEF;
      }
      else if (ch == LIT_CHAR_UPPERCASE_S)
      {
@@ -478,7 +478,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
        append_char_class (re_ctx_p, 0x2060UL, 0x2FFFUL);
        append_char_class (re_ctx_p, 0x3001UL, LIT_CHAR_BOM - 1);
        append_char_class (re_ctx_p, LIT_CHAR_BOM + 1, LIT_UTF16_CODE_UNIT_MAX);
-        ch = RE_CHAR_UNDEF;
+        ch = LIT_CHAR_UNDEF;
      }
      else if (ch == LIT_CHAR_LOWERCASE_W)
      {
@@ -487,7 +487,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
        append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_A, LIT_CHAR_UPPERCASE_Z);
        append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE, LIT_CHAR_UNDERSCORE);
        append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_A, LIT_CHAR_LOWERCASE_Z);
-        ch = RE_CHAR_UNDEF;
+        ch = LIT_CHAR_UNDEF;
      }
      else if (ch == LIT_CHAR_UPPERCASE_W)
      {
@@ -497,20 +497,19 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
        append_char_class (re_ctx_p, LIT_CHAR_UPPERCASE_Z + 1, LIT_CHAR_UNDERSCORE - 1);
        append_char_class (re_ctx_p, LIT_CHAR_UNDERSCORE + 1, LIT_CHAR_LOWERCASE_A - 1);
        append_char_class (re_ctx_p, LIT_CHAR_LOWERCASE_Z + 1, LIT_UTF16_CODE_UNIT_MAX);
-        ch = RE_CHAR_UNDEF;
+        ch = LIT_CHAR_UNDEF;
      }
-      else if (ch <= LIT_UTF16_CODE_UNIT_MAX
-               && lit_char_is_octal_digit ((ecma_char_t) ch)
+      else if (lit_char_is_octal_digit ((ecma_char_t) ch)
               && ch != LIT_CHAR_0)
      {
        parser_ctx_p->input_curr_p--;
-        ch = re_parse_octal (parser_ctx_p);
+        ch = (ecma_char_t) re_parse_octal (parser_ctx_p);
      }
    } /* ch == LIT_CHAR_BACKSLASH */

-    if (ch == RE_CHAR_UNDEF)
+    if (ch == LIT_CHAR_UNDEF)
    {
-      if (start != RE_CHAR_UNDEF)
+      if (start != LIT_CHAR_UNDEF)
      {
        if (is_range)
        {
@@ -519,13 +518,13 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
        else
        {
          append_char_class (re_ctx_p, start, start);
-          start = RE_CHAR_UNDEF;
+          start = LIT_CHAR_UNDEF;
        }
      }
    }
    else
    {
-      if (start != RE_CHAR_UNDEF)
+      if (start != LIT_CHAR_UNDEF)
      {
        if (is_range)
        {
@@ -536,7 +535,7 @@ re_parse_char_class (re_parser_ctx_t *parser_ctx_p, /**< number of classes */
          else
          {
            append_char_class (re_ctx_p, start, ch);
-            start = RE_CHAR_UNDEF;
+            start = LIT_CHAR_UNDEF;
            is_range = false;
          }
        }
@@ -667,28 +666,28 @@ re_parse_next_token (re_parser_ctx_t *parser_ctx_p, /**< RegExp parser context *
      else if (ch == LIT_CHAR_LOWERCASE_X
               && re_hex_lookup (parser_ctx_p, 2))
      {
-        lit_code_point_t code_point;
+        ecma_char_t code_unit;

-        if (!lit_read_code_point_from_hex (parser_ctx_p->input_curr_p, 2, &code_point))
+        if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 2, &code_unit))
        {
          return ecma_raise_syntax_error (ECMA_ERR_MSG ("decode error"));
        }

        parser_ctx_p->input_curr_p += 2;
-        out_token_p->value = code_point;
+        out_token_p->value = code_unit;
      }
      else if (ch == LIT_CHAR_LOWERCASE_U
               && re_hex_lookup (parser_ctx_p, 4))
      {
-        lit_code_point_t code_point;
+        ecma_char_t code_unit;

-        if (!lit_read_code_point_from_hex (parser_ctx_p->input_curr_p, 4, &code_point))
+        if (!lit_read_code_unit_from_hex (parser_ctx_p->input_curr_p, 4, &code_unit))
        {
          return ecma_raise_syntax_error (ECMA_ERR_MSG ("decode error"));
        }

        parser_ctx_p->input_curr_p += 4;
-        out_token_p->value = code_point;
+        out_token_p->value = code_unit;
      }
      else if (ch == LIT_CHAR_LOWERCASE_D)
      {
--- a/jerry-core/parser/regexp/re-parser.h
+++ b/jerry-core/parser/regexp/re-parser.h
@@ -75,11 +75,6 @@ typedef enum
 */
 #define RE_MAX_RE_DECESC_DIGITS 9

-/**
- * Undefined character (out of the range of the codeunit)
- */
-#define RE_CHAR_UNDEF 0xFFFFFFFF
-
 /**
 * RegExp token type
 */
@@ -104,7 +99,7 @@ typedef struct
  uint32_t num_of_classes;        /**< number of character classes */
 } re_parser_ctx_t;

-typedef void (*re_char_class_callback) (void *re_ctx_p, uint32_t start, uint32_t end);
+typedef void (*re_char_class_callback) (void *re_ctx_p, ecma_char_t start, ecma_char_t end);

 ecma_value_t
 re_parse_char_class (re_parser_ctx_t *, re_char_class_callback, void *, re_token_t *);