From ffaca583f38451896d70039e888eb75bf177b17c Mon Sep 17 00:00:00 2001
From: Robert Sipka
Date: Wed, 16 Nov 2016 10:34:12 +0100
Subject: [PATCH] Add API functions to create string from a valid UTF-8 string. (#1430)

JerryScript-DCO-1.0-Signed-off-by: Robert Sipka rsipka.uszeged@partner.samsung.com
---
 jerry-core/ecma/base/ecma-helpers-string.c | 123 +++++++++++++++++++++
 jerry-core/ecma/base/ecma-helpers.h        |   1 +
 jerry-core/jerry-api.h                     |   2 +
 jerry-core/jerry.c                         |  34 ++++++
 4 files changed, 160 insertions(+)

diff --git a/jerry-core/ecma/base/ecma-helpers-string.c b/jerry-core/ecma/base/ecma-helpers-string.c
index b890f2640..181e70962 100644
--- a/jerry-core/ecma/base/ecma-helpers-string.c
+++ b/jerry-core/ecma/base/ecma-helpers-string.c
@@ -215,6 +215,129 @@ ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *string_p, /**< utf-8 stri
   return string_desc_p;
 } /* ecma_new_ecma_string_from_utf8 */
 
+/**
+ * Allocate a new ecma-string and initialize it from the utf8 string argument.
+ * All 4-bytes long unicode sequences are converted into two 3-bytes long sequences.
+ *
+ * Note:
+ *      the input is expected to be a valid utf-8 string, which is only verified by assertions.
+ *
+ * @return pointer to ecma-string descriptor
+ */
+ecma_string_t *
+ecma_new_ecma_string_from_utf8_converted_to_cesu8 (const lit_utf8_byte_t *string_p, /**< utf-8 string */
+                                                   lit_utf8_size_t string_size) /**< utf-8 string size */
+{
+  JERRY_ASSERT (string_p != NULL || string_size == 0);
+
+  ecma_string_t *string_desc_p = NULL;
+
+  ecma_length_t string_length = 0;
+  lit_utf8_size_t converted_string_size = 0;
+  lit_utf8_size_t pos = 0;
+
+  /* Calculate the required length and size information of the converted cesu-8 encoded string */
+  while (pos < string_size)
+  {
+    if ((string_p[pos] & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
+    {
+      pos++;
+    }
+    else if ((string_p[pos] & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
+    {
+      pos += 2;
+    }
+    else if ((string_p[pos] & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
+    {
+      pos += 3;
+    }
+    else
+    {
+      JERRY_ASSERT ((string_p[pos] & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER);
+      pos += 4;
+      converted_string_size += 2;
+      /* The sequence becomes a surrogate pair, so it counts as two characters in the string length. */
+      string_length++;
+    }
+
+    string_length++;
+  }
+
+  JERRY_ASSERT (pos == string_size);
+
+  if (converted_string_size == 0)
+  {
+    return ecma_new_ecma_string_from_utf8 (string_p, string_size);
+  }
+  else
+  {
+    converted_string_size += string_size;
+
+    JERRY_ASSERT (lit_is_utf8_string_valid (string_p, string_size));
+
+    lit_utf8_byte_t *data_p;
+
+    /* The short descriptor stores the size in 16 bits, so the converted size is the one to check. */
+    if (likely (converted_string_size <= UINT16_MAX))
+    {
+      string_desc_p = jmem_heap_alloc_block (sizeof (ecma_string_t) + converted_string_size);
+
+      string_desc_p->refs_and_container = ECMA_STRING_CONTAINER_HEAP_UTF8_STRING | ECMA_STRING_REF_ONE;
+      string_desc_p->u.common_field = 0;
+      string_desc_p->u.utf8_string.size = (uint16_t) converted_string_size;
+      string_desc_p->u.utf8_string.length = (uint16_t) string_length;
+
+      data_p = (lit_utf8_byte_t *) (string_desc_p + 1);
+    }
+    else
+    {
+      string_desc_p = jmem_heap_alloc_block (sizeof (ecma_long_string_t) + converted_string_size);
+
+      string_desc_p->refs_and_container = ECMA_STRING_CONTAINER_HEAP_LONG_UTF8_STRING | ECMA_STRING_REF_ONE;
+      string_desc_p->u.common_field = 0;
+      string_desc_p->u.long_utf8_string_size = converted_string_size;
+
+      ecma_long_string_t *long_string_desc_p = (ecma_long_string_t *) string_desc_p;
+      long_string_desc_p->long_utf8_string_length = string_length;
+
+      data_p = (lit_utf8_byte_t *) (long_string_desc_p + 1);
+    }
+
+    /* data_p is advanced while writing, so remember where the character data starts for hashing. */
+    const lit_utf8_byte_t *const begin_data_p = data_p;
+    pos = 0;
+
+    while (pos < string_size)
+    {
+      if ((string_p[pos] & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
+      {
+        /* Processing 4 byte unicode sequence. Always converted to two 3 byte long sequence. */
+        uint32_t character = ((((uint32_t) string_p[pos++]) & 0x7) << 18);
+        character |= ((((uint32_t) string_p[pos++]) & LIT_UTF8_LAST_6_BITS_MASK) << 12);
+        character |= ((((uint32_t) string_p[pos++]) & LIT_UTF8_LAST_6_BITS_MASK) << 6);
+        character |= (((uint32_t) string_p[pos++]) & LIT_UTF8_LAST_6_BITS_MASK);
+
+        JERRY_ASSERT (character >= 0x10000);
+        character -= 0x10000;
+
+        data_p += lit_char_to_utf8_bytes (data_p, (ecma_char_t) (0xd800 | (character >> 10)));
+        data_p += lit_char_to_utf8_bytes (data_p, (ecma_char_t) (0xdc00 | (character & LIT_UTF16_LAST_10_BITS_MASK)));
+      }
+      else
+      {
+        *data_p++ = string_p[pos++];
+      }
+    }
+
+    JERRY_ASSERT (pos == string_size);
+    JERRY_ASSERT (data_p == begin_data_p + converted_string_size);
+
+    string_desc_p->hash = lit_utf8_string_calc_hash (begin_data_p, converted_string_size);
+  }
+
+  return string_desc_p;
+} /* ecma_new_ecma_string_from_utf8_converted_to_cesu8 */
+
 /**
  * Allocate new ecma-string and fill it with cesu-8 character which represents specified code unit
  *
diff --git a/jerry-core/ecma/base/ecma-helpers.h b/jerry-core/ecma/base/ecma-helpers.h
index 881d0bb4a..fa2829c94 100644
--- a/jerry-core/ecma/base/ecma-helpers.h
+++ b/jerry-core/ecma/base/ecma-helpers.h
@@ -164,6 +164,7 @@ extern void ecma_free_value_if_not_object (ecma_value_t);
 
 /* ecma-helpers-string.c */
 extern ecma_string_t *ecma_new_ecma_string_from_utf8 (const lit_utf8_byte_t *, lit_utf8_size_t);
+extern ecma_string_t *ecma_new_ecma_string_from_utf8_converted_to_cesu8 (const lit_utf8_byte_t *, lit_utf8_size_t);
 extern ecma_string_t *ecma_new_ecma_string_from_code_unit (ecma_char_t);
 extern ecma_string_t *ecma_new_ecma_string_from_uint32 (uint32_t);
 extern ecma_string_t *ecma_new_ecma_string_from_number (ecma_number_t);
diff --git a/jerry-core/jerry-api.h b/jerry-core/jerry-api.h
index 378dfd36b..8037fab7e 100644
--- a/jerry-core/jerry-api.h
+++ b/jerry-core/jerry-api.h
@@ -248,6 +248,8 @@ jerry_value_t jerry_create_number_infinity (bool);
 jerry_value_t jerry_create_number_nan (void);
 jerry_value_t jerry_create_null (void);
 jerry_value_t jerry_create_object (void);
+jerry_value_t jerry_create_string_from_utf8 (const jerry_char_t *);
+jerry_value_t jerry_create_string_sz_from_utf8 (const jerry_char_t *, jerry_size_t);
 jerry_value_t jerry_create_string (const jerry_char_t *);
 jerry_value_t jerry_create_string_sz (const jerry_char_t *, jerry_size_t);
 jerry_value_t jerry_create_undefined (void);
diff --git a/jerry-core/jerry.c b/jerry-core/jerry.c
index 926508969..36f22e3e9 100644
--- a/jerry-core/jerry.c
+++ b/jerry-core/jerry.c
@@ -913,6 +913,40 @@ jerry_create_object (void)
   return ecma_make_object_value (ecma_op_create_object_object_noarg ());
 } /* jerry_create_object */
 
+/**
+ * Create string from a valid UTF8 string
+ *
+ * Note:
+ *      returned value must be freed with jerry_release_value when it is no longer needed.
+ *
+ * @return value of the created string
+ */
+jerry_value_t
+jerry_create_string_from_utf8 (const jerry_char_t *str_p) /**< pointer to string */
+{
+  return jerry_create_string_sz_from_utf8 (str_p, lit_zt_utf8_string_size ((lit_utf8_byte_t *) str_p));
+} /* jerry_create_string_from_utf8 */
+
+/**
+ * Create string from a valid UTF8 string
+ *
+ * Note:
+ *      returned value must be freed with jerry_release_value when it is no longer needed.
+ *
+ * @return value of the created string
+ */
+jerry_value_t
+jerry_create_string_sz_from_utf8 (const jerry_char_t *str_p, /**< pointer to string */
+                                  jerry_size_t str_size) /**< string size */
+{
+  jerry_assert_api_available ();
+
+  ecma_string_t *ecma_str_p = ecma_new_ecma_string_from_utf8_converted_to_cesu8 ((lit_utf8_byte_t *) str_p,
+                                                                                 (lit_utf8_size_t) str_size);
+
+  return ecma_make_string_value (ecma_str_p);
+} /* jerry_create_string_sz_from_utf8 */
+
 /**
  * Create string from a valid CESU8 string
  *