From cf2bc459bb59728d4e4dc691f0e042c519e5bcba Mon Sep 17 00:00:00 2001 From: Andrey Shitov Date: Fri, 10 Jul 2015 20:01:43 +0300 Subject: [PATCH] Support unicode whitespaces in string-to-number conversion. JerryScript-DCO-1.0-Signed-off-by: Andrey Shitov a.shitov@samsung.com --- .../ecma/base/ecma-helpers-conversion.cpp | 42 ++++++++++++++----- jerry-core/lit/lit-unicode-ranges.inc.h | 14 +++++++ 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/jerry-core/ecma/base/ecma-helpers-conversion.cpp b/jerry-core/ecma/base/ecma-helpers-conversion.cpp index b84fa0b84..ce344dd70 100644 --- a/jerry-core/ecma/base/ecma-helpers-conversion.cpp +++ b/jerry-core/ecma/base/ecma-helpers-conversion.cpp @@ -23,6 +23,7 @@ #include "ecma-globals.h" #include "ecma-helpers.h" #include "jrt-libc-includes.h" +#include "lit-char-helpers.h" #include "lit-magic-strings.h" /* @@ -343,7 +344,6 @@ ecma_utf8_string_to_number (const lit_utf8_byte_t *str_p, /**< utf-8 string */ const lit_utf8_byte_t hex_lower_digits_range[10] = { 'a', 'f' }; const lit_utf8_byte_t hex_upper_digits_range[10] = { 'A', 'F' }; const lit_utf8_byte_t hex_x_chars[2] = { 'x', 'X' }; - const lit_utf8_byte_t white_space[2] = { ' ', '\n' }; const lit_utf8_byte_t e_chars[2] = { 'e', 'E' }; const lit_utf8_byte_t plus_char = '+'; const lit_utf8_byte_t minus_char = '-'; @@ -354,23 +354,43 @@ ecma_utf8_string_to_number (const lit_utf8_byte_t *str_p, /**< utf-8 string */ return ECMA_NUMBER_ZERO; } - const lit_utf8_byte_t *begin_p = str_p; - const lit_utf8_byte_t *end_p = begin_p + str_size - 1; + lit_utf8_iterator_t iter = lit_utf8_iterator_create (str_p, str_size); + ecma_char_t code_unit; - while (begin_p <= end_p - && (*begin_p == white_space[0] - || *begin_p == white_space[1])) + while (!lit_utf8_iterator_is_eos (&iter)) { - begin_p++; + code_unit = lit_utf8_iterator_peek_next (&iter); + if (lit_char_is_white_space (code_unit) || lit_char_is_line_terminator (code_unit)) + { + lit_utf8_iterator_incr (&iter); + } + 
else + { + break; + } } - while (begin_p <= end_p - && (*end_p == white_space[0] - || *end_p == white_space[1])) + JERRY_ASSERT (!iter.buf_pos.is_non_bmp_middle); + const lit_utf8_byte_t *begin_p = iter.buf_p + iter.buf_pos.offset; + + iter = lit_utf8_iterator_create (iter.buf_p + iter.buf_pos.offset, str_size - iter.buf_pos.offset); + lit_utf8_iterator_seek_eos (&iter); + while (!lit_utf8_iterator_is_bos (&iter)) { - end_p--; + code_unit = lit_utf8_iterator_peek_prev (&iter); + if (lit_char_is_white_space (code_unit) || lit_char_is_line_terminator (code_unit)) + { + lit_utf8_iterator_decr (&iter); + } + else + { + break; + } } + JERRY_ASSERT (!iter.buf_pos.is_non_bmp_middle); + const lit_utf8_byte_t *end_p = iter.buf_p + iter.buf_pos.offset - 1; + if (begin_p > end_p) { return ECMA_NUMBER_ZERO; diff --git a/jerry-core/lit/lit-unicode-ranges.inc.h b/jerry-core/lit/lit-unicode-ranges.inc.h index de7cb7d31..233343b89 100644 --- a/jerry-core/lit/lit-unicode-ranges.inc.h +++ b/jerry-core/lit/lit-unicode-ranges.inc.h @@ -21,6 +21,9 @@ * http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html * * The range lists were generated using tools/print-unicode-ranges.sh script from UnicodeData-3.0.0.txt + * + * NOTE: + * Some ranges in "Separator, Space" category were added manually, see the corresponding definitions. */ /** @@ -2430,6 +2433,17 @@ LIT_UNICODE_RANGE_NO (0x3280, 0x3289) /* CIRCLED IDEOGRAPH ONE #ifndef LIT_UNICODE_RANGE_ZS # define LIT_UNICODE_RANGE_ZS(range_begin, range_end) #endif /* !LIT_UNICODE_RANGE_ZS */ + +LIT_UNICODE_RANGE_ZS (0x180E, 0x180E) /* MONGOLIAN VOWEL SEPARATOR (manually added) + * This character doesn't belong to Zs category according to + * UnicodeData-3.0.0.txt, but it should be supported according to + * ch09/9.3/9.3.1/S9.3.1_A2.js from the test262 suite. 
*/ + +LIT_UNICODE_RANGE_ZS (0x205F, 0x205F) /* MEDIUM MATHEMATICAL SPACE (manually added) + * This character doesn't belong to Zs category according to + * UnicodeData-3.0.0.txt, but it should be supported according to + * ch09/9.3/9.3.1/S9.3.1_A2.js from the test262 suite. */ + LIT_UNICODE_RANGE_ZS (0x0020, 0x0020) /* SPACE */ LIT_UNICODE_RANGE_ZS (0x00A0, 0x00A0) /* NO-BREAK SPACE */