diff --git a/jerry-core/ecma/base/ecma-globals.h b/jerry-core/ecma/base/ecma-globals.h index 97e8b771a..972b969ae 100644 --- a/jerry-core/ecma/base/ecma-globals.h +++ b/jerry-core/ecma/base/ecma-globals.h @@ -830,6 +830,12 @@ typedef struct ecma_string_t { mem_cpointer_t string1_cp : ECMA_POINTER_FIELD_WIDTH; mem_cpointer_t string2_cp : ECMA_POINTER_FIELD_WIDTH; + + /** + * Flag indicating that last code_unit of first string in concatenation is high surrogate + * and first code_unit of second string is low surrogate + */ + unsigned int is_surrogate_pair_sliced : 1; } concatenation; /** Identifier of magic string */ diff --git a/jerry-core/ecma/base/ecma-helpers-string.cpp b/jerry-core/ecma/base/ecma-helpers-string.cpp index 8969116c6..4e05d2ddd 100644 --- a/jerry-core/ecma/base/ecma-helpers-string.cpp +++ b/jerry-core/ecma/base/ecma-helpers-string.cpp @@ -658,16 +658,47 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */ string1_p = ecma_copy_or_ref_ecma_string (string1_p); string2_p = ecma_copy_or_ref_ecma_string (string2_p); + ecma_char_t str1_last_code_unit = ecma_string_get_char_at_pos (string1_p, ecma_string_get_length (string1_p) - 1); + ecma_char_t str2_first_code_unit = ecma_string_get_char_at_pos (string2_p, 0); + + string_desc_p->u.concatenation.is_surrogate_pair_sliced = (lit_is_code_unit_high_surrogate (str1_last_code_unit) + && lit_is_code_unit_low_surrogate (str2_first_code_unit)); + ECMA_SET_NON_NULL_POINTER (string_desc_p->u.concatenation.string1_cp, string1_p); ECMA_SET_NON_NULL_POINTER (string_desc_p->u.concatenation.string2_cp, string2_p); + JERRY_STATIC_ASSERT (LIT_STRING_HASH_LAST_BYTES_COUNT == 2); + if (str2_size >= LIT_STRING_HASH_LAST_BYTES_COUNT) { - string_desc_p->hash = string2_p->hash; + if (str2_size >= LIT_UTF8_MAX_BYTES_IN_CODE_UNIT + LIT_STRING_HASH_LAST_BYTES_COUNT + || !string_desc_p->u.concatenation.is_surrogate_pair_sliced) + { + string_desc_p->hash = string2_p->hash; + } + else + { + const lit_utf8_size_t bytes_buf_size = str2_size + 1; + lit_utf8_byte_t bytes_buf[LIT_UTF8_MAX_BYTES_IN_CODE_POINT + 1]; + + lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (str1_last_code_unit, + str2_first_code_unit); + lit_utf8_size_t idx = lit_code_point_to_utf8 (code_point, bytes_buf); + JERRY_ASSERT (idx = LIT_UTF8_MAX_BYTES_IN_CODE_POINT); + + if (str2_size > LIT_UTF8_MAX_BYTES_IN_CODE_UNIT) + { + bytes_buf[idx] = ecma_string_get_byte_at_pos (string2_p, LIT_UTF8_MAX_BYTES_IN_CODE_UNIT); + } + + string_desc_p->hash = lit_utf8_string_calc_hash_last_bytes (bytes_buf + bytes_buf_size - + LIT_STRING_HASH_LAST_BYTES_COUNT, + LIT_STRING_HASH_LAST_BYTES_COUNT); + } + } else { - JERRY_STATIC_ASSERT (LIT_STRING_HASH_LAST_BYTES_COUNT == 2); JERRY_ASSERT (str2_size == 1); lit_utf8_byte_t bytes_buf[LIT_STRING_HASH_LAST_BYTES_COUNT] = @@ -965,7 +996,7 @@ ecma_string_to_number (const ecma_string_t *str_p) /**< ecma-string */ * Convert ecma-string's contents to a utf-8 string and put it to the buffer. * * @return number of bytes, actually copied to the buffer - if string's content was copied successfully; - * otherwise (in case size of buffer is insuficcient) - negative number, which is calculated + * otherwise (in case size of buffer is insufficient) - negative number, which is calculated * as negation of buffer size, that is required to hold the string's content. */ ssize_t @@ -1039,13 +1070,37 @@ ecma_string_to_utf8_string (const ecma_string_t *string_desc_p, /**< ecma-string bytes_copied1 = ecma_string_to_utf8_string (string1_p, dest_p, buffer_size); JERRY_ASSERT (bytes_copied1 > 0); - /* one character, which is the null character at end of string, will be overwritten */ dest_p += ecma_string_get_size (string1_p); - bytes_copied2 = ecma_string_to_utf8_string (string2_p, dest_p, buffer_size - bytes_copied1); - JERRY_ASSERT (bytes_copied2 > 0); + if (!string_desc_p->u.concatenation.is_surrogate_pair_sliced) + { + bytes_copied2 = ecma_string_to_utf8_string (string2_p, dest_p, buffer_size - bytes_copied1); + JERRY_ASSERT (bytes_copied2 > 0); + } + else + { + dest_p -= LIT_UTF8_MAX_BYTES_IN_CODE_UNIT; - JERRY_ASSERT (required_buffer_size == bytes_copied1 + bytes_copied2); + ecma_char_t high_surrogate = lit_utf8_string_code_unit_at (dest_p, LIT_UTF8_MAX_BYTES_IN_CODE_UNIT, 0); + JERRY_ASSERT (lit_is_code_unit_high_surrogate (high_surrogate)); + + bytes_copied2 = ecma_string_to_utf8_string (string2_p, + dest_p + 1, + buffer_size - bytes_copied1 + LIT_UTF8_MAX_BYTES_IN_CODE_UNIT); + JERRY_ASSERT (bytes_copied2 > 0); + + ecma_char_t low_surrogate = lit_utf8_string_code_unit_at (dest_p + 1, LIT_UTF8_MAX_BYTES_IN_CODE_UNIT, 0); + JERRY_ASSERT (lit_is_code_unit_low_surrogate (low_surrogate)); + + lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (high_surrogate, + low_surrogate); + lit_code_point_to_utf8 (surrogate_code_point, dest_p); + } + + JERRY_ASSERT (required_buffer_size == (bytes_copied1 + bytes_copied2 - + (string_desc_p->u.concatenation.is_surrogate_pair_sliced + ? LIT_UTF8_CESU8_SURROGATE_SIZE_DIF + : 0))); break; } @@ -1434,8 +1489,8 @@ ecma_string_get_length (const ecma_string_t *string_p) /**< ecma-string */ string1_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, string_p->u.concatenation.string1_cp); string2_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, string_p->u.concatenation.string2_cp); - TODO ("Check surrogate code units on strings boundaries"); - return ecma_string_get_length (string1_p) + ecma_string_get_length (string2_p); + return (ecma_string_get_length (string1_p) + ecma_string_get_length (string2_p) - + string_p->u.concatenation.is_surrogate_pair_sliced); } } /* ecma_string_get_length */ @@ -1443,7 +1498,7 @@ ecma_string_get_length (const ecma_string_t *string_p) /**< ecma-string */ /** * Get size of ecma-string * - * @return number of bytes in the string + * @return number of bytes in the buffer needed to represent the string */ lit_utf8_size_t ecma_string_get_size (const ecma_string_t *string_p) /**< ecma-string */ @@ -1520,7 +1575,10 @@ ecma_string_get_size (const ecma_string_t *string_p) /**< ecma-string */ string1_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, string_p->u.concatenation.string1_cp); string2_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, string_p->u.concatenation.string2_cp); - return ecma_string_get_size (string1_p) + ecma_string_get_size (string2_p); + return (ecma_string_get_size (string1_p) + ecma_string_get_size (string2_p) - + (lit_utf8_size_t) (string_p->u.concatenation.is_surrogate_pair_sliced + ? LIT_UTF8_CESU8_SURROGATE_SIZE_DIF + : 0)); } } /* ecma_string_get_size */ diff --git a/jerry-core/lit/lit-strings.h b/jerry-core/lit/lit-strings.h index 03584a3a5..cf846f563 100644 --- a/jerry-core/lit/lit-strings.h +++ b/jerry-core/lit/lit-strings.h @@ -73,6 +73,12 @@ #define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x10000) #define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX) +/** + * Differnce between byte count needed to represent code point greater than 0xFFFF + * in common UTF-8 (4 bytes required) and CESU-8 (6 bytes required) + */ +#define LIT_UTF8_CESU8_SURROGATE_SIZE_DIF (2 * LIT_UTF8_MAX_BYTES_IN_CODE_UNIT - LIT_UTF8_MAX_BYTES_IN_CODE_POINT) + /** * Width of the offset field in lit_utf8_iterator_pos_t structure */ diff --git a/tests/jerry/string-surrogates-concat.js b/tests/jerry/string-surrogates-concat.js new file mode 100644 index 000000000..727297c86 --- /dev/null +++ b/tests/jerry/string-surrogates-concat.js @@ -0,0 +1,23 @@ +// Copyright 2015 Samsung Electronics Co., Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +var str0 = "𐐀"; +var str1 = "\ud801\udc00"; +var str2 = "\ud801"; +var str3 = "\udc00"; + +var str_concat = str2 + str3; + +assert(str0 == str_concat); +assert(str1 == str_concat);