Support unicode whitespaces in string-to-number conversion.

JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com
2026-02-07 18:16:25 +00:00 · 2015-07-10 20:01:43 +03:00 · 2015-07-10 20:01:43 +03:00 · cf2bc459bb
commit cf2bc459bb
parent b3fa7d9765
2 changed files with 45 additions and 11 deletions
--- a/jerry-core/ecma/base/ecma-helpers-conversion.cpp
+++ b/jerry-core/ecma/base/ecma-helpers-conversion.cpp
@ -23,6 +23,7 @@
 #include "ecma-globals.h"
 #include "ecma-helpers.h"
 #include "jrt-libc-includes.h"
+#include "lit-char-helpers.h"
 #include "lit-magic-strings.h"

 /*
@ -343,7 +344,6 @@ ecma_utf8_string_to_number (const lit_utf8_byte_t *str_p, /**< utf-8 string */
  const lit_utf8_byte_t hex_lower_digits_range[10] = { 'a', 'f' };
  const lit_utf8_byte_t hex_upper_digits_range[10] = { 'A', 'F' };
  const lit_utf8_byte_t hex_x_chars[2] = { 'x', 'X' };
-  const lit_utf8_byte_t white_space[2] = { ' ', '\n' };
  const lit_utf8_byte_t e_chars[2] = { 'e', 'E' };
  const lit_utf8_byte_t plus_char = '+';
  const lit_utf8_byte_t minus_char = '-';
@ -354,23 +354,43 @@ ecma_utf8_string_to_number (const lit_utf8_byte_t *str_p, /**< utf-8 string */
    return ECMA_NUMBER_ZERO;
  }

-  const lit_utf8_byte_t *begin_p = str_p;
-  const lit_utf8_byte_t *end_p = begin_p + str_size - 1;
+  lit_utf8_iterator_t iter = lit_utf8_iterator_create (str_p, str_size);
+  ecma_char_t code_unit;

-  while (begin_p <= end_p
-         && (*begin_p == white_space[0]
-             || *begin_p == white_space[1]))
+  while (!lit_utf8_iterator_is_eos (&iter))
  {
-    begin_p++;
+    code_unit = lit_utf8_iterator_peek_next (&iter);
+    if (lit_char_is_white_space (code_unit) || lit_char_is_line_terminator (code_unit))
+    {
+      lit_utf8_iterator_incr (&iter);
+    }
+    else
+    {
+      break;
+    }
  }

-  while (begin_p <= end_p
-         && (*end_p == white_space[0]
-             || *end_p == white_space[1]))
+  JERRY_ASSERT (!iter.buf_pos.is_non_bmp_middle);
+  const lit_utf8_byte_t *begin_p = iter.buf_p + iter.buf_pos.offset;
+
+  iter = lit_utf8_iterator_create (iter.buf_p + iter.buf_pos.offset, str_size - iter.buf_pos.offset);
+  lit_utf8_iterator_seek_eos (&iter);
+  while (!lit_utf8_iterator_is_bos (&iter))
  {
-    end_p--;
+    code_unit = lit_utf8_iterator_peek_prev (&iter);
+    if (lit_char_is_white_space (code_unit) || lit_char_is_line_terminator (code_unit))
+    {
+      lit_utf8_iterator_decr (&iter);
+    }
+    else
+    {
+      break;
+    }
  }

+  JERRY_ASSERT (!iter.buf_pos.is_non_bmp_middle);
+  const lit_utf8_byte_t *end_p = iter.buf_p + iter.buf_pos.offset - 1;
+
  if (begin_p > end_p)
  {
    return ECMA_NUMBER_ZERO;
--- a/jerry-core/lit/lit-unicode-ranges.inc.h
+++ b/jerry-core/lit/lit-unicode-ranges.inc.h
@ -21,6 +21,9 @@
 *          http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
 *
 * The range lists were generated using tools/print-unicode-ranges.sh script from UnicodeData-3.0.0.txt
+ *
+ * NOTE:
+ *   Some ranges in the "Separator, Space" category were added manually; see the corresponding definitions.
 */

 /**
@ -2430,6 +2433,17 @@ LIT_UNICODE_RANGE_NO (0x3280, 0x3289) /* CIRCLED IDEOGRAPH ONE
 #ifndef LIT_UNICODE_RANGE_ZS
 # define LIT_UNICODE_RANGE_ZS(range_begin, range_end)
 #endif /* !LIT_UNICODE_RANGE_ZS */
+
+LIT_UNICODE_RANGE_ZS (0x180E, 0x180E) /* MONGOLIAN VOWEL SEPARATOR (manually added)
+                                       * This character doesn't belong to the Zs category according to
+                                       * UnicodeData-3.0.0.txt, but it should be supported according to
+                                       * ch09/9.3/9.3.1/S9.3.1_A2.js from the test262 suite. */
+
+LIT_UNICODE_RANGE_ZS (0x205F, 0x205F) /* MEDIUM MATHEMATICAL SPACE (manually added)
+                                       * This character doesn't belong to the Zs category according to
+                                       * UnicodeData-3.0.0.txt, but it should be supported according to
+                                       * ch09/9.3/9.3.1/S9.3.1_A2.js from the test262 suite. */
+
 LIT_UNICODE_RANGE_ZS (0x0020, 0x0020) /* SPACE */

 LIT_UNICODE_RANGE_ZS (0x00A0, 0x00A0) /* NO-BREAK SPACE */