Fix concatenating of surrogates.

JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
This commit is contained in:
Andrey Shitov 2015-07-09 18:27:23 +03:00 committed by Evgeny Gavrin
parent 9fc1958903
commit dbf3708255
4 changed files with 104 additions and 11 deletions

View File

@ -830,6 +830,12 @@ typedef struct ecma_string_t
{
mem_cpointer_t string1_cp : ECMA_POINTER_FIELD_WIDTH;
mem_cpointer_t string2_cp : ECMA_POINTER_FIELD_WIDTH;
/**
* Flag indicating that last code_unit of first string in concatenation is high surrogate
* and first code_unit of second string is low surrogate
*/
unsigned int is_surrogate_pair_sliced : 1;
} concatenation;
/** Identifier of magic string */

View File

@ -658,16 +658,47 @@ ecma_concat_ecma_strings (ecma_string_t *string1_p, /**< first ecma-string */
string1_p = ecma_copy_or_ref_ecma_string (string1_p);
string2_p = ecma_copy_or_ref_ecma_string (string2_p);
ecma_char_t str1_last_code_unit = ecma_string_get_char_at_pos (string1_p, ecma_string_get_length (string1_p) - 1);
ecma_char_t str2_first_code_unit = ecma_string_get_char_at_pos (string2_p, 0);
string_desc_p->u.concatenation.is_surrogate_pair_sliced = (lit_is_code_unit_high_surrogate (str1_last_code_unit)
&& lit_is_code_unit_low_surrogate (str2_first_code_unit));
ECMA_SET_NON_NULL_POINTER (string_desc_p->u.concatenation.string1_cp, string1_p);
ECMA_SET_NON_NULL_POINTER (string_desc_p->u.concatenation.string2_cp, string2_p);
JERRY_STATIC_ASSERT (LIT_STRING_HASH_LAST_BYTES_COUNT == 2);
if (str2_size >= LIT_STRING_HASH_LAST_BYTES_COUNT)
{
string_desc_p->hash = string2_p->hash;
if (str2_size >= LIT_UTF8_MAX_BYTES_IN_CODE_UNIT + LIT_STRING_HASH_LAST_BYTES_COUNT
|| !string_desc_p->u.concatenation.is_surrogate_pair_sliced)
{
string_desc_p->hash = string2_p->hash;
}
else
{
const lit_utf8_size_t bytes_buf_size = str2_size + 1;
lit_utf8_byte_t bytes_buf[LIT_UTF8_MAX_BYTES_IN_CODE_POINT + 1];
lit_code_point_t code_point = lit_convert_surrogate_pair_to_code_point (str1_last_code_unit,
str2_first_code_unit);
lit_utf8_size_t idx = lit_code_point_to_utf8 (code_point, bytes_buf);
JERRY_ASSERT (idx = LIT_UTF8_MAX_BYTES_IN_CODE_POINT);
if (str2_size > LIT_UTF8_MAX_BYTES_IN_CODE_UNIT)
{
bytes_buf[idx] = ecma_string_get_byte_at_pos (string2_p, LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
}
string_desc_p->hash = lit_utf8_string_calc_hash_last_bytes (bytes_buf + bytes_buf_size -
LIT_STRING_HASH_LAST_BYTES_COUNT,
LIT_STRING_HASH_LAST_BYTES_COUNT);
}
}
else
{
JERRY_STATIC_ASSERT (LIT_STRING_HASH_LAST_BYTES_COUNT == 2);
JERRY_ASSERT (str2_size == 1);
lit_utf8_byte_t bytes_buf[LIT_STRING_HASH_LAST_BYTES_COUNT] =
@ -965,7 +996,7 @@ ecma_string_to_number (const ecma_string_t *str_p) /**< ecma-string */
* Convert ecma-string's contents to a utf-8 string and put it to the buffer.
*
* @return number of bytes, actually copied to the buffer - if string's content was copied successfully;
* otherwise (in case size of buffer is insuficcient) - negative number, which is calculated
* otherwise (in case size of buffer is insufficient) - negative number, which is calculated
* as negation of buffer size, that is required to hold the string's content.
*/
ssize_t
@ -1039,13 +1070,37 @@ ecma_string_to_utf8_string (const ecma_string_t *string_desc_p, /**< ecma-string
bytes_copied1 = ecma_string_to_utf8_string (string1_p, dest_p, buffer_size);
JERRY_ASSERT (bytes_copied1 > 0);
/* one character, which is the null character at end of string, will be overwritten */
dest_p += ecma_string_get_size (string1_p);
bytes_copied2 = ecma_string_to_utf8_string (string2_p, dest_p, buffer_size - bytes_copied1);
JERRY_ASSERT (bytes_copied2 > 0);
if (!string_desc_p->u.concatenation.is_surrogate_pair_sliced)
{
bytes_copied2 = ecma_string_to_utf8_string (string2_p, dest_p, buffer_size - bytes_copied1);
JERRY_ASSERT (bytes_copied2 > 0);
}
else
{
dest_p -= LIT_UTF8_MAX_BYTES_IN_CODE_UNIT;
JERRY_ASSERT (required_buffer_size == bytes_copied1 + bytes_copied2);
ecma_char_t high_surrogate = lit_utf8_string_code_unit_at (dest_p, LIT_UTF8_MAX_BYTES_IN_CODE_UNIT, 0);
JERRY_ASSERT (lit_is_code_unit_high_surrogate (high_surrogate));
bytes_copied2 = ecma_string_to_utf8_string (string2_p,
dest_p + 1,
buffer_size - bytes_copied1 + LIT_UTF8_MAX_BYTES_IN_CODE_UNIT);
JERRY_ASSERT (bytes_copied2 > 0);
ecma_char_t low_surrogate = lit_utf8_string_code_unit_at (dest_p + 1, LIT_UTF8_MAX_BYTES_IN_CODE_UNIT, 0);
JERRY_ASSERT (lit_is_code_unit_low_surrogate (low_surrogate));
lit_code_point_t surrogate_code_point = lit_convert_surrogate_pair_to_code_point (high_surrogate,
low_surrogate);
lit_code_point_to_utf8 (surrogate_code_point, dest_p);
}
JERRY_ASSERT (required_buffer_size == (bytes_copied1 + bytes_copied2 -
(string_desc_p->u.concatenation.is_surrogate_pair_sliced
? LIT_UTF8_CESU8_SURROGATE_SIZE_DIF
: 0)));
break;
}
@ -1434,8 +1489,8 @@ ecma_string_get_length (const ecma_string_t *string_p) /**< ecma-string */
string1_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, string_p->u.concatenation.string1_cp);
string2_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, string_p->u.concatenation.string2_cp);
TODO ("Check surrogate code units on strings boundaries");
return ecma_string_get_length (string1_p) + ecma_string_get_length (string2_p);
return (ecma_string_get_length (string1_p) + ecma_string_get_length (string2_p) -
string_p->u.concatenation.is_surrogate_pair_sliced);
}
} /* ecma_string_get_length */
@ -1443,7 +1498,7 @@ ecma_string_get_length (const ecma_string_t *string_p) /**< ecma-string */
/**
* Get size of ecma-string
*
* @return number of bytes in the string
* @return number of bytes in the buffer needed to represent the string
*/
lit_utf8_size_t
ecma_string_get_size (const ecma_string_t *string_p) /**< ecma-string */
@ -1520,7 +1575,10 @@ ecma_string_get_size (const ecma_string_t *string_p) /**< ecma-string */
string1_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, string_p->u.concatenation.string1_cp);
string2_p = ECMA_GET_NON_NULL_POINTER (ecma_string_t, string_p->u.concatenation.string2_cp);
return ecma_string_get_size (string1_p) + ecma_string_get_size (string2_p);
return (ecma_string_get_size (string1_p) + ecma_string_get_size (string2_p) -
(lit_utf8_size_t) (string_p->u.concatenation.is_surrogate_pair_sliced
? LIT_UTF8_CESU8_SURROGATE_SIZE_DIF
: 0));
}
} /* ecma_string_get_size */

View File

@ -73,6 +73,12 @@
#define LIT_UTF8_4_BYTE_CODE_POINT_MIN (0x10000)
#define LIT_UTF8_4_BYTE_CODE_POINT_MAX (LIT_UNICODE_CODE_POINT_MAX)
/**
* Differnce between byte count needed to represent code point greater than 0xFFFF
* in common UTF-8 (4 bytes required) and CESU-8 (6 bytes required)
*/
#define LIT_UTF8_CESU8_SURROGATE_SIZE_DIF (2 * LIT_UTF8_MAX_BYTES_IN_CODE_UNIT - LIT_UTF8_MAX_BYTES_IN_CODE_POINT)
/**
* Width of the offset field in lit_utf8_iterator_pos_t structure
*/

View File

@ -0,0 +1,23 @@
// Copyright 2015 Samsung Electronics Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
var str0 = "𐐀";
var str1 = "\ud801\udc00";
var str2 = "\ud801";
var str3 = "\udc00";
var str_concat = str2 + str3;
assert(str0 == str_concat);
assert(str1 == str_concat);