Fix concatenating of surrogates.

JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
2025-12-15 16:29:21 +00:00 · 2015-07-09 18:27:23 +03:00 · 2015-07-09 18:27:23 +03:00 · dbf3708255
commit dbf3708255
parent 9fc1958903
4 changed files with 104 additions and 11 deletions
--- a/jerry-core/ecma/base/ecma-globals.h
+++ b/jerry-core/ecma/base/ecma-globals.h
@ -830,6 +830,12 @@ typedef struct ecma_string_t
    {
      mem_cpointer_t string1_cp : ECMA_POINTER_FIELD_WIDTH;
      mem_cpointer_t string2_cp : ECMA_POINTER_FIELD_WIDTH;
+
+      /**
+       * Flag indicating that the last code unit of the first string in the concatenation is a high surrogate
+       * and the first code unit of the second string is a low surrogate
+       */
+      unsigned int is_surrogate_pair_sliced : 1;
    } concatenation;

    /** Identifier of magic string */
--- a/jerry-core/ecma/base/ecma-helpers-string.cpp
+++ b/jerry-core/ecma/base/ecma-helpers-string.cpp
@ -658,16 +658,47 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */
  string1_p = ecma_copy_or_ref_ecma_string (string1_p);
  string2_p = ecma_copy_or_ref_ecma_string (string2_p);

+  ecma_char_t str1_last_code_unit = ecma_string_get_char_at_pos (string1_p, ecma_string_get_length (string1_p) - 1);
+  ecma_char_t str2_first_code_unit = ecma_string_get_char_at_pos (string2_p, 0);
+
+  string_desc_p->u.concatenation.is_surrogate_pair_sliced = (lit_is_code_unit_high_surrogate (str1_last_code_unit)
+                                                             && lit_is_code_unit_low_surrogate (str2_first_code_unit));
+
  ECMA_SET_NON_NULL_POINTER (string_desc_p->u.concatenation.string1_cp, string1_p);
  ECMA_SET_NON_NULL_POINTER (string_desc_p->u.concatenation.string2_cp, string2_p);

+  JERRY_STATIC_ASSERT (LIT_STRING_HASH_LAST_BYTES_COUNT == 2);
+
  if (str2_size >= LIT_STRING_HASH_LAST_BYTES_COUNT)
  {
-    string_desc_p->hash = string2_p->hash;
+    if (str2_size >= LIT_UTF8_MAX_BYTES_IN_CODE_UNIT + LIT_STRING_HASH_LAST_BYTES_COUNT
+        || !string_desc_p->u.concatenation.is_surrogate_pair_sliced)
+    {
+      string_desc_p->hash = string2_p->hash; /* trailing hash bytes are not part of a merged surrogate pair */
+    }
+    else
+    {
+      const lit_utf8_size_t bytes_buf_size = str2_size + 1; /* merged code point replaces string2's first unit */
+      lit_utf8_byte_t bytes_buf[LIT_UTF8_MAX_BYTES_IN_CODE_POINT + 1];
+
+      lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (str1_last_code_unit,
+                                                                              str2_first_code_unit);
+      lit_utf8_size_t idx = lit_code_point_to_utf8 (code_point, bytes_buf);
+      JERRY_ASSERT (idx == LIT_UTF8_MAX_BYTES_IN_CODE_POINT); /* a surrogate pair yields a max-length code point */
+
+      if (str2_size > LIT_UTF8_MAX_BYTES_IN_CODE_UNIT) /* string2 continues past its leading low surrogate */
+      {
+        bytes_buf[idx] = ecma_string_get_byte_at_pos (string2_p, LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
+      }
+
+      string_desc_p->hash = lit_utf8_string_calc_hash_last_bytes (bytes_buf + bytes_buf_size -
+                                                                  LIT_STRING_HASH_LAST_BYTES_COUNT,
+                                                                  LIT_STRING_HASH_LAST_BYTES_COUNT);
+    }
+
  }
  else
  {
-    JERRY_STATIC_ASSERT (LIT_STRING_HASH_LAST_BYTES_COUNT == 2);
    JERRY_ASSERT (str2_size == 1);

    lit_utf8_byte_t bytes_buf[LIT_STRING_HASH_LAST_BYTES_COUNT] =
@ -965,7 +996,7 @@ ecma_string_to_number (const ecma_string_t *str_p) /**< ecma-string */
 * Convert ecma-string's contents to a utf-8 string and put it to the buffer.
 *
 * @return number of bytes, actually copied to the buffer - if string's content was copied successfully;
- *         otherwise (in case size of buffer is insuficcient) - negative number, which is calculated
+ *         otherwise (in case size of buffer is insufficient) - negative number, which is calculated
 *         as negation of buffer size, that is required to hold the string's content.
 */
 ssize_t
@ -1039,13 +1070,37 @@ ecma_string_to_utf8_string (const ecma_string_t *string_desc_p, /**< ecma-string
      bytes_copied1 = ecma_string_to_utf8_string (string1_p, dest_p, buffer_size);
      JERRY_ASSERT (bytes_copied1 > 0);

-      /* one character, which is the null character at end of string, will be overwritten */
      dest_p += ecma_string_get_size (string1_p);

-      bytes_copied2 = ecma_string_to_utf8_string (string2_p, dest_p, buffer_size - bytes_copied1);
-      JERRY_ASSERT (bytes_copied2 > 0);
+      if (!string_desc_p->u.concatenation.is_surrogate_pair_sliced) /* plain case: append string2 as is */
+      {
+        bytes_copied2 = ecma_string_to_utf8_string (string2_p, dest_p, buffer_size - bytes_copied1);
+        JERRY_ASSERT (bytes_copied2 > 0);
+      }
+      else
+      {
+        dest_p -= LIT_UTF8_MAX_BYTES_IN_CODE_UNIT; /* step back onto the trailing code unit of string1 */

-      JERRY_ASSERT (required_buffer_size == bytes_copied1 + bytes_copied2);
+        ecma_char_t high_surrogate = lit_utf8_string_code_unit_at (dest_p, LIT_UTF8_MAX_BYTES_IN_CODE_UNIT, 0);
+        JERRY_ASSERT (lit_is_code_unit_high_surrogate (high_surrogate));
+
+        bytes_copied2 = ecma_string_to_utf8_string (string2_p,
+                                                    dest_p + 1, /* string2 overlaps the tail of string1 */
+                                                    buffer_size - bytes_copied1 + LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
+        JERRY_ASSERT (bytes_copied2 > 0); /* NOTE(review): size passed above looks 1 byte larger than the space after dest_p + 1 - confirm */
+
+        ecma_char_t low_surrogate = lit_utf8_string_code_unit_at (dest_p + 1, LIT_UTF8_MAX_BYTES_IN_CODE_UNIT, 0);
+        JERRY_ASSERT (lit_is_code_unit_low_surrogate (low_surrogate));
+
+        lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (high_surrogate,
+                                                                                          low_surrogate);
+        lit_code_point_to_utf8 (surrogate_code_point, dest_p); /* replaces the surrogate halves in place */
+      }
+
+      JERRY_ASSERT (required_buffer_size == (bytes_copied1 + bytes_copied2 -
+                                             (string_desc_p->u.concatenation.is_surrogate_pair_sliced
+                                              ? LIT_UTF8_CESU8_SURROGATE_SIZE_DIF
+                                              : 0)));

      break;
    }
@ -1434,8 +1489,8 @@ ecma_string_get_length (const ecma_string_t *string_p) /**< ecma-string */
    string1_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, string_p->u.concatenation.string1_cp);
    string2_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, string_p->u.concatenation.string2_cp);

-    TODO ("Check surrogate code units on strings boundaries");
-    return ecma_string_get_length (string1_p) + ecma_string_get_length (string2_p);
+    return (ecma_string_get_length (string1_p) + ecma_string_get_length (string2_p) -
+            string_p->u.concatenation.is_surrogate_pair_sliced);
  }
 } /* ecma_string_get_length */

@ -1443,7 +1498,7 @@ ecma_string_get_length (const ecma_string_t *string_p) /**< ecma-string */
 /**
 * Get size of ecma-string
 *
- * @return number of bytes in the string
+ * @return number of bytes in the buffer needed to represent the string
 */
 lit_utf8_size_t
 ecma_string_get_size (const ecma_string_t *string_p) /**< ecma-string */
@ -1520,7 +1575,10 @@ ecma_string_get_size (const ecma_string_t *string_p) /**< ecma-string */
    string1_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, string_p->u.concatenation.string1_cp);
    string2_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, string_p->u.concatenation.string2_cp);

-    return ecma_string_get_size (string1_p) + ecma_string_get_size (string2_p);
+    return (ecma_string_get_size (string1_p) + ecma_string_get_size (string2_p) -
+           (lit_utf8_size_t) (string_p->u.concatenation.is_surrogate_pair_sliced
+                              ? LIT_UTF8_CESU8_SURROGATE_SIZE_DIF
+                              : 0));
  }
 } /* ecma_string_get_size */

--- a/jerry-core/lit/lit-strings.h
+++ b/jerry-core/lit/lit-strings.h
@ -73,6 +73,12 @@
 #define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x10000)
 #define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX)

+/**
+ * Difference between the number of bytes needed to represent a code point greater than 0xFFFF
+ * in CESU-8 (6 bytes required) and in common UTF-8 (4 bytes required)
+ */
+#define LIT_UTF8_CESU8_SURROGATE_SIZE_DIF (2 * LIT_UTF8_MAX_BYTES_IN_CODE_UNIT - LIT_UTF8_MAX_BYTES_IN_CODE_POINT)
+
 /**
 * Width of the offset field in lit_utf8_iterator_pos_t structure
 */
--- a/tests/jerry/string-surrogates-concat.js
+++ b/tests/jerry/string-surrogates-concat.js
@ -0,0 +1,23 @@
+// Copyright 2015 Samsung Electronics Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+var str0 = "𐐀"; // U+10400 written as a literal character
+var str1 = "\ud801\udc00"; // the same code point written as an escaped surrogate pair
+var str2 = "\ud801"; // high surrogate only
+var str3 = "\udc00"; // low surrogate only
+
+var str_concat = str2 + str3; // the surrogate halves meet at the concatenation boundary
+
+assert(str0 == str_concat);
+assert(str1 == str_concat);