Add an API function to calculate the UTF-8 encoded string length from Jerry string. (#1460)

JerryScript-DCO-1.0-Signed-off-by: Robert Sipka rsipka.uszeged@partner.samsung.com
2025-12-15 16:29:21 +00:00 · 2016-12-01 10:51:23 +01:00 · 2016-12-01 10:51:23 +01:00 · 23cf7fd177
commit 23cf7fd177
parent 958344ee16
8 changed files with 160 additions and 11 deletions
--- a/docs/02.API-REFERENCE.md
+++ b/docs/02.API-REFERENCE.md
@ -1186,6 +1186,7 @@ jerry_get_utf8_string_size (const jerry_value_t value);
 **See also**

 - [jerry_create_string_from_utf8](#jerry_create_string_from_utf8)
+- [jerry_get_utf8_string_length](#jerry_get_utf8_string_length)

 ## jerry_get_string_length

@ -1223,6 +1224,44 @@ jerry_get_string_length (const jerry_value_t value);
 - [jerry_create_string](#jerry_create_string)
 - [jerry_get_string_size](#jerry_get_string_size)

+## jerry_get_utf8_string_length
+
+**Summary**
+
+Get the length of a UTF-8 encoded string. Returns zero if the value parameter is not a string.
+
+*Note*: Unlike [jerry_get_string_length](#jerry_get_string_length), this function
+returns the UTF-8 string length instead of the CESU-8 string length.
+
+**Prototype**
+
+```c
+jerry_length_t
+jerry_get_utf8_string_length (const jerry_value_t value);
+```
+
+- `value` - input string value
+- return value - number of characters in the string
+
+**Example**
+
+```c
+{
+  const jerry_char_t char_array[] = "a string";
+  jerry_value_t string = jerry_create_string_from_utf8 (char_array);
+
+  jerry_length_t string_length = jerry_get_utf8_string_length (string);
+
+  ... // usage of string_length
+
+  jerry_release_value (string);
+}
+```
+
+**See also**
+
+- [jerry_create_string_from_utf8](#jerry_create_string_from_utf8)
+- [jerry_get_utf8_string_size](#jerry_get_utf8_string_size)

 ## jerry_string_to_char_buffer

--- a/jerry-core/ecma/base/ecma-helpers-string.c
+++ b/jerry-core/ecma/base/ecma-helpers-string.c
@ -1479,6 +1479,57 @@ ecma_string_get_length (const ecma_string_t *string_p) /**< ecma-string */
  }
 } /* ecma_string_get_length */

+/**
+ * Get the UTF-8 character length of an ecma-string (a CESU-8 surrogate pair counts as one character)
+ *
+ * @return number of characters in the UTF-8 encoded string
+ */
+ecma_length_t
+ecma_string_get_utf8_length (const ecma_string_t *string_p) /**< ecma-string */
+{
+  switch (ECMA_STRING_GET_CONTAINER (string_p))
+  {
+    case ECMA_STRING_CONTAINER_HEAP_UTF8_STRING:
+    {
+      if (string_p->u.utf8_string.size == (lit_utf8_size_t) string_p->u.utf8_string.length)
+      {
+        return (ecma_length_t) (string_p->u.utf8_string.length);
+      }
+
+      return lit_get_utf8_length_of_cesu8_string ((const lit_utf8_byte_t *) (string_p + 1),
+                                                  (lit_utf8_size_t) string_p->u.utf8_string.size);
+    }
+    case ECMA_STRING_CONTAINER_HEAP_LONG_UTF8_STRING:
+    {
+      ecma_long_string_t *long_string_p = (ecma_long_string_t *) string_p;
+      if (string_p->u.long_utf8_string_size == (lit_utf8_size_t) long_string_p->long_utf8_string_length)
+      {
+        return (ecma_length_t) (long_string_p->long_utf8_string_length);
+      }
+      /* The long descriptor also holds the length field, so the string bytes start after it. */
+      return lit_get_utf8_length_of_cesu8_string ((const lit_utf8_byte_t *) (long_string_p + 1),
+                                                  (lit_utf8_size_t) string_p->u.long_utf8_string_size);
+    }
+    case ECMA_STRING_CONTAINER_UINT32_IN_DESC:
+    {
+      return ecma_string_get_number_in_desc_size (string_p->u.uint32_number);
+    }
+    case ECMA_STRING_CONTAINER_MAGIC_STRING:
+    {
+      JERRY_ASSERT (ECMA_STRING_IS_ASCII (lit_get_magic_string_utf8 (string_p->u.magic_string_id),
+                                          lit_get_magic_string_size (string_p->u.magic_string_id)));
+      return lit_get_magic_string_size (string_p->u.magic_string_id);
+    }
+    default:
+    {
+      JERRY_ASSERT (ECMA_STRING_GET_CONTAINER (string_p) == ECMA_STRING_CONTAINER_MAGIC_STRING_EX);
+
+      return lit_get_utf8_length_of_cesu8_string (lit_get_magic_string_ex_utf8 (string_p->u.magic_string_ex_id),
+                                                  lit_get_magic_string_ex_size (string_p->u.magic_string_ex_id));
+    }
+  }
+} /* ecma_string_get_utf8_length */
+
 /**
 * Get size of ecma-string
 *
--- a/jerry-core/ecma/base/ecma-helpers.h
+++ b/jerry-core/ecma/base/ecma-helpers.h
@ -195,6 +195,7 @@ extern bool ecma_string_compare_to_property_name (ecma_property_t, jmem_cpointer
 extern bool ecma_compare_ecma_strings (const ecma_string_t *, const ecma_string_t *);
 extern bool ecma_compare_ecma_strings_relational (const ecma_string_t *, const ecma_string_t *);
 extern ecma_length_t ecma_string_get_length (const ecma_string_t *);
+extern ecma_length_t ecma_string_get_utf8_length (const ecma_string_t *);
 extern lit_utf8_size_t ecma_string_get_size (const ecma_string_t *);
 extern lit_utf8_size_t ecma_string_get_utf8_size (const ecma_string_t *);
 extern ecma_char_t ecma_string_get_char_at_pos (const ecma_string_t *, ecma_length_t);
--- a/jerry-core/jerry-api.h
+++ b/jerry-core/jerry-api.h
@ -210,6 +210,7 @@ double jerry_get_number_value (const jerry_value_t);
 jerry_size_t jerry_get_string_size (const jerry_value_t);
 jerry_size_t jerry_get_utf8_string_size (const jerry_value_t);
 jerry_length_t jerry_get_string_length (const jerry_value_t);
+jerry_length_t jerry_get_utf8_string_length (const jerry_value_t);
 jerry_size_t jerry_string_to_char_buffer (const jerry_value_t, jerry_char_t *, jerry_size_t);

 /**
--- a/jerry-core/jerry.c
+++ b/jerry-core/jerry.c
@ -1086,6 +1086,27 @@ jerry_get_string_length (const jerry_value_t value) /**< input string */
  return ecma_string_get_length (ecma_get_string_from_value (value));
 } /* jerry_get_string_length */

+/**
+ * Query the length of a Jerry string value measured in UTF-8 characters
+ *
+ * Note:
+ *      A value that is not a string yields 0.
+ *
+ * @return number of characters in the string
+ */
+jerry_length_t
+jerry_get_utf8_string_length (const jerry_value_t value) /**< input string */
+{
+  jerry_assert_api_available ();
+
+  jerry_length_t utf8_length = 0;
+  if (ecma_is_value_string (value))
+  {
+    utf8_length = ecma_string_get_utf8_length (ecma_get_string_from_value (value));
+  }
+  return utf8_length;
+} /* jerry_get_utf8_string_length */
+
 /**
 * Copy the characters of a string into a specified buffer.
 *
--- a/jerry-core/lit/lit-strings.c
+++ b/jerry-core/lit/lit-strings.c
@ -292,22 +292,19 @@ lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu
 {
  lit_utf8_size_t offset = 0;
  lit_utf8_size_t utf8_buf_size = cesu8_buf_size;
+  ecma_char_t prev_ch = 0;

  while (offset < cesu8_buf_size)
  {
    ecma_char_t ch;
    offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &ch);

-    if (lit_is_code_point_utf16_high_surrogate (ch) && (offset < cesu8_buf_size))
+    if (lit_is_code_point_utf16_low_surrogate (ch) && lit_is_code_point_utf16_high_surrogate (prev_ch))
    {
-      ecma_char_t next_ch;
-      offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &next_ch);
-
-      if (lit_is_code_point_utf16_low_surrogate (next_ch))
-      {
-        utf8_buf_size -= 2;
-      }
+      utf8_buf_size -= 2;
    }
+
+    prev_ch = ch;
  }

  JERRY_ASSERT (offset == cesu8_buf_size);
@ -315,6 +312,37 @@ lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu
  return utf8_buf_size;
 } /* lit_get_utf8_size_of_cesu8_string */

+/**
+ * Count the characters a cesu-8 encoded string would have when encoded as utf-8
+ *
+ * @return number of characters of the utf-8 encoded string
+ */
+ecma_length_t
+lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *cesu8_buf_p, /**< cesu-8 string */
+                                     lit_utf8_size_t cesu8_buf_size) /**< string size */
+{
+  ecma_length_t char_count = 0;
+  ecma_char_t last_unit = 0;
+  lit_utf8_size_t offset = 0;
+
+  while (offset < cesu8_buf_size)
+  {
+    ecma_char_t unit;
+    offset += lit_read_code_unit_from_utf8 (cesu8_buf_p + offset, &unit);
+    /* A low surrogate right after a high surrogate completes a pair that was already counted. */
+    if (!(lit_is_code_point_utf16_low_surrogate (unit) && lit_is_code_point_utf16_high_surrogate (last_unit)))
+    {
+      char_count++;
+    }
+
+    last_unit = unit;
+  }
+
+  JERRY_ASSERT (offset == cesu8_buf_size);
+
+  return char_count;
+} /* lit_get_utf8_length_of_cesu8_string */
+
 /**
 * Decodes a unicode code point from non-empty utf-8-encoded buffer
 *
--- a/jerry-core/lit/lit-strings.h
+++ b/jerry-core/lit/lit-strings.h
@ -99,6 +99,7 @@ lit_utf8_size_t lit_get_utf8_size_of_cesu8_string (const lit_utf8_byte_t *, lit_

 /* length */
 ecma_length_t lit_utf8_string_length (const lit_utf8_byte_t *, lit_utf8_size_t);
+ecma_length_t lit_get_utf8_length_of_cesu8_string (const lit_utf8_byte_t *, lit_utf8_size_t);

 /* hash */
 lit_string_hash_t lit_utf8_string_calc_hash (const lit_utf8_byte_t *, lit_utf8_size_t);
--- a/tests/unit/test-api.c
+++ b/tests/unit/test-api.c
@ -305,7 +305,7 @@ main (void)

  bool is_ok;
  jerry_size_t sz, utf8_sz, cesu8_sz;
-  jerry_length_t cesu8_length;
+  jerry_length_t cesu8_length, utf8_length;
  jerry_value_t val_t, val_foo, val_bar, val_A, val_A_prototype, val_a, val_a_foo, val_value_field, val_p, val_np;
  jerry_value_t val_call_external;
  jerry_value_t global_obj_val, obj_val;
@ -365,10 +365,12 @@ main (void)
  args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x73\x74\x72\x3a \xf0\x9d\x94\xa3 \xf0\x9d\x94\xa4");

  cesu8_length = jerry_get_string_length (args[0]);
+  utf8_length = jerry_get_utf8_string_length (args[0]);
+
  cesu8_sz = jerry_get_string_size (args[0]);
  utf8_sz =  jerry_get_utf8_string_size (args[0]);

-  TEST_ASSERT (cesu8_length == 10);
+  TEST_ASSERT (cesu8_length == 10 && utf8_length == 8);
  TEST_ASSERT (cesu8_sz != utf8_sz);
  TEST_ASSERT (utf8_sz == 14 && cesu8_sz == 18);
  jerry_release_value (args[0]);
@ -377,10 +379,12 @@ main (void)
  args[0] = jerry_create_string ((jerry_char_t *) "\x73\x74\x72\x3a \xed\xa0\x81\xed\xb0\x80");

  cesu8_length = jerry_get_string_length (args[0]);
+  utf8_length = jerry_get_utf8_string_length (args[0]);
+
  cesu8_sz = jerry_get_string_size (args[0]);
  utf8_sz =  jerry_get_utf8_string_size (args[0]);

-  TEST_ASSERT (cesu8_length == 7);
+  TEST_ASSERT (cesu8_length == 7 && utf8_length == 6);
  TEST_ASSERT (cesu8_sz != utf8_sz);
  TEST_ASSERT (utf8_sz == 9 && cesu8_sz == 11);

@ -390,9 +394,12 @@ main (void)
  args[0] = jerry_create_string_from_utf8 ((jerry_char_t *) "\x70\x72\x69\x63\x65\x3a \x31\x30\xe2\x82\xac");

  cesu8_length = jerry_get_string_length (args[0]);
+  utf8_length = jerry_get_utf8_string_length (args[0]);
+
  cesu8_sz = jerry_get_string_size (args[0]);
  utf8_sz =  jerry_get_utf8_string_size (args[0]);

+  TEST_ASSERT (cesu8_length == utf8_length);
  TEST_ASSERT (cesu8_length == 10);
  TEST_ASSERT (cesu8_sz == utf8_sz);
  TEST_ASSERT (utf8_sz == 12);