Refactor ecma_builtin_global_object_unescape (#4115)

Based on ECMA-262 v11, B.2.1.2. Fixed tests from the exclude list: * annexB/built-ins/unescape/four-ignore-bad-u.js * annexB/built-ins/unescape/four.js * annexB/built-ins/unescape/two.js JerryScript-DCO-1.0-Signed-off-by: Adam Szilagyi aszilagy@inf.u-szeged.hu
2025-12-15 16:29:21 +00:00 · 2021-01-14 16:16:32 +01:00 · 2021-01-14 16:16:32 +01:00 · df6d430289
commit df6d430289
parent 4541524256
3 changed files with 189 additions and 70 deletions
--- a/jerry-core/ecma/builtin-objects/ecma-builtin-global.c
+++ b/jerry-core/ecma/builtin-objects/ecma-builtin-global.c
@ -494,11 +494,49 @@ ecma_builtin_global_object_escape (lit_utf8_byte_t *input_start_p, /**< routine'
  return ecma_make_string_value (ecma_stringbuilder_finalize (&builder));
 } /* ecma_builtin_global_object_escape */

+/**
+ * Utility method to resolve a single escape sequence for the 'unescape' routine.
+ *
+ * Expected formats after the '%': uxxxx or yy. The caller must ensure 5 (unicode) or 2 bytes are readable.
+ *
+ * @return bytes consumed (5 or 2) on success; 0 if a hex digit is invalid (*out_result_p is then untouched)
+ */
+static uint8_t
+ecma_builtin_global_object_unescape_resolve_escape (const lit_utf8_byte_t *buffer_p,  /**< character buffer */
+                                                    bool unicode_sequence, /**< true if unescaping unicode sequence */
+                                                    ecma_char_t *out_result_p) /**< [out] resolved character */
+{
+  JERRY_ASSERT (buffer_p != NULL);
+  JERRY_ASSERT (out_result_p != NULL);
+
+  ecma_char_t unescaped_chr = 0;
+  uint8_t sequence_length = unicode_sequence ? 5 : 2;
+  uint8_t start = unicode_sequence ? 1 : 0;
+
+  for (uint8_t i = start; i < sequence_length; i++)
+  {
+    const lit_utf8_byte_t current_char = buffer_p[i];
+
+    if (!lit_char_is_hex_digit (current_char))
+    {
+      /* This was not an escape sequence, skip processing */
+      return 0;
+    }
+
+    unescaped_chr = (ecma_char_t) ((unescaped_chr << 4) + (ecma_char_t) lit_char_hex_to_int (current_char));
+  }
+
+  *out_result_p = unescaped_chr;
+
+  return sequence_length;
+} /* ecma_builtin_global_object_unescape_resolve_escape */
+
 /**
 * The Global object's 'unescape' routine
 *
 * See also:
 *          ECMA-262 v5, B.2.2
+ *          ECMA-262 v11, B.2.1.2
 *
 * @return ecma value
 *         Returned value must be freed with ecma_free_value.
@ -509,76 +547,40 @@ ecma_builtin_global_object_unescape (lit_utf8_byte_t *input_start_p, /**< routin
                                     lit_utf8_size_t input_size) /**< routine's first argument's
                                                                  *   string buffer's size */
 {
-  const lit_utf8_byte_t *input_curr_p = input_start_p;
-  const lit_utf8_byte_t *input_end_p = input_start_p + input_size;
-  /* 4. */
-  /* The length of input string is always greater than output string
-   * so we re-use the input string buffer.
-   * The %xx is three byte long, and the maximum encoded value is 0xff,
-   * which maximum encoded length is two byte. Similar to this, the maximum
-   * encoded length of %uxxxx is four byte. */
-  lit_utf8_byte_t *output_char_p = input_start_p;
-
-  /* The state of parsing that tells us where we are in an escape pattern.
-   * 0    we are outside of pattern,
-   * 1    found '%', start of pattern,
-   * 2    found first hex digit of '%xy' pattern
-   * 3    found valid '%xy' pattern
-   * 4    found 'u', start of '%uwxyz' pattern
-   * 5-7  found hex digits of '%uwxyz' pattern
-   * 8    found valid '%uwxyz' pattern
-   */
-  uint8_t status = 0;
-  ecma_char_t hex_digits = 0;
-  /* 5. */
-  while (input_curr_p < input_end_p)
+  if (input_size == 0)
  {
-    /* 6. */
-    ecma_char_t chr = lit_cesu8_read_next (&input_curr_p);
-
-    /* 7-8. */
-    if (status == 0 && chr == LIT_CHAR_PERCENT)
-    {
-      /* Found '%' char, start of escape sequence. */
-      status = 1;
-    }
-    /* 9-10. */
-    else if (status == 1 && chr == LIT_CHAR_LOWERCASE_U)
-    {
-      /* Found 'u' char after '%'. */
-      status = 4;
-    }
-    else if (status > 0 && lit_char_is_hex_digit (chr))
-    {
-      /* Found hexadecimal digit in escape sequence. */
-      hex_digits = (ecma_char_t) (hex_digits * 16 + (ecma_char_t) lit_char_hex_to_int (chr));
-      status++;
-    }
-    else
-    {
-      /* Previously found hexadecimal digit in escape sequence but it's not valid '%xy' pattern
-       * so essentially it was only a simple character. */
-      status = 0;
-    }
-
-    /* 11-17. Found valid '%uwxyz' or '%xy' escape. */
-    if (status == 8 || status == 3)
-    {
-      output_char_p -= (status == 3) ? 2 : 5;
-      status = 0;
-      chr = hex_digits;
-      hex_digits = 0;
-    }
-
-    /* Copying character. */
-    lit_utf8_size_t lit_size = lit_code_unit_to_utf8 (chr, output_char_p);
-    output_char_p += lit_size;
-    JERRY_ASSERT (output_char_p <= input_curr_p);
+    return ecma_make_magic_string_value (LIT_MAGIC_STRING__EMPTY);
  }

-  lit_utf8_size_t output_length = (lit_utf8_size_t) (output_char_p - input_start_p);
-  ecma_string_t *output_string_p = ecma_new_ecma_string_from_utf8 (input_start_p, output_length);
-  return ecma_make_string_value (output_string_p);
+  const lit_utf8_byte_t *input_curr_p = input_start_p;
+  const lit_utf8_byte_t *input_end_p = input_start_p + input_size;
+  ecma_stringbuilder_t builder = ecma_stringbuilder_create ();
+
+  while (input_curr_p < input_end_p)
+  {
+    ecma_char_t chr = lit_cesu8_read_next (&input_curr_p);
+
+    // potential pattern
+    if (chr == LIT_CHAR_PERCENT)
+    {
+      const lit_utf8_size_t chars_leftover = (lit_utf8_size_t) (input_end_p - input_curr_p);
+
+      // potential unicode sequence
+      if (chars_leftover >= 5 && input_curr_p[0] == LIT_CHAR_LOWERCASE_U)
+      {
+        input_curr_p += ecma_builtin_global_object_unescape_resolve_escape (input_curr_p, true, &chr);
+      }
+      // potential two hexa sequence
+      else if (chars_leftover >= 2)
+      {
+        input_curr_p += ecma_builtin_global_object_unescape_resolve_escape (input_curr_p, false, &chr);
+      }
+    }
+
+    ecma_stringbuilder_append_char (&builder, chr);
+  }
+
+  return ecma_make_string_value (ecma_stringbuilder_finalize (&builder));
 } /* ecma_builtin_global_object_unescape */

 #endif /* ENABLED (JERRY_BUILTIN_ANNEXB) */
--- a/tests/jerry/es.next/global-unescape.js
+++ b/tests/jerry/es.next/global-unescape.js
@ -0,0 +1,120 @@
+// Copyright JS Foundation and other contributors, http://js.foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright (C) 2016 the V8 project authors. All rights reserved.
+// This code is governed by the BSD license found in the LICENSE file.
+
+assert(unescape('%U0000') === '%U0000');
+assert(unescape('%t0000') === '%t0000');
+assert(unescape('%v0000') ==='%v0000');
+assert(unescape('%%0000') === '%\x0000');
+
+// tests for two-digit hexadecimal (%xx) escape sequences
+assert(unescape('%0%0000') === '%0\x0000');
+assert(unescape('%0%0100') === '%0\x0100');
+
+assert(unescape('%0%2900') === '%0)00');
+assert(unescape('%0%2a00') === '%0*00');
+assert(unescape('%0%2A00') === '%0*00');
+assert(unescape('%0%2b00') === '%0+00');
+assert(unescape('%0%2B00') === '%0+00');
+assert(unescape('%0%2c00') === '%0,00');
+assert(unescape('%0%2C00') === '%0,00');
+assert(unescape('%0%2d00') === '%0-00');
+assert(unescape('%0%2D00') === '%0-00');
+
+assert(unescape('%0%3900') === '%0900');
+assert(unescape('%0%3a00') === '%0:00');
+assert(unescape('%0%3A00') === '%0:00');
+
+assert(unescape('%0%3f00') === '%0?00');
+assert(unescape('%0%3F00') === '%0?00');
+assert(unescape('%0%4000') === '%0@00');
+
+assert(unescape('%0%5a00') === '%0Z00');
+assert(unescape('%0%5A00') === '%0Z00');
+assert(unescape('%0%5b00') === '%0[00');
+assert(unescape('%0%5B00') === '%0[00');
+
+assert(unescape('%0%5e00') === '%0^00');
+assert(unescape('%0%5E00') === '%0^00');
+assert(unescape('%0%5f00') === '%0_00');
+assert(unescape('%0%5F00') === '%0_00');
+assert(unescape('%0%6000') === '%0`00');
+assert(unescape('%0%6100') === '%0a00');
+
+assert(unescape('%0%7a00') === '%0z00');
+assert(unescape('%0%7A00') === '%0z00');
+assert(unescape('%0%7b00') === '%0{00');
+assert(unescape('%0%7B00') === '%0{00');
+
+assert(unescape('%0%fe00') === '%0\xfe00');
+assert(unescape('%0%Fe00') === '%0\xfe00');
+assert(unescape('%0%fE00') === '%0\xfe00');
+assert(unescape('%0%FE00') === '%0\xfe00');
+
+assert(unescape('%0%ff00') === '%0\xff00');
+assert(unescape('%0%Ff00') === '%0\xff00');
+assert(unescape('%0%fF00') === '%0\xff00');
+assert(unescape('%0%FF00') === '%0\xff00');
+
+// tests for four-digit unicode (%uxxxx) escape sequences
+assert(unescape('%0%u00290') === '%0)0');
+assert(unescape('%0%u002a0') === '%0*0');
+assert(unescape('%0%u002A0') === '%0*0');
+assert(unescape('%0%u002b0') === '%0+0');
+assert(unescape('%0%u002B0') === '%0+0');
+assert(unescape('%0%u002c0') === '%0,0');
+assert(unescape('%0%u002C0') === '%0,0');
+assert(unescape('%0%u002d0') === '%0-0');
+assert(unescape('%0%u002D0') === '%0-0');
+
+assert(unescape('%0%u00390') === '%090');
+assert(unescape('%0%u003a0') === '%0:0');
+assert(unescape('%0%u003A0') === '%0:0');
+
+assert(unescape('%0%u003f0') === '%0?0');
+assert(unescape('%0%u003F0') === '%0?0');
+assert(unescape('%0%u00400') === '%0@0');
+
+assert(unescape('%0%u005a0') === '%0Z0');
+assert(unescape('%0%u005A0') === '%0Z0');
+assert(unescape('%0%u005b0') === '%0[0');
+assert(unescape('%0%u005B0') === '%0[0');
+
+assert(unescape('%0%u005e0') === '%0^0');
+assert(unescape('%0%u005E0') === '%0^0');
+assert(unescape('%0%u005f0') === '%0_0');
+assert(unescape('%0%u005F0') === '%0_0');
+assert(unescape('%0%u00600') === '%0`0');
+assert(unescape('%0%u00610') === '%0a0');
+
+assert(unescape('%0%u007a0') === '%0z0');
+assert(unescape('%0%u007A0') === '%0z0');
+assert(unescape('%0%u007b0') === '%0{0');
+assert(unescape('%0%u007B0') === '%0{0');
+
+assert(unescape('%0%ufffe0') === '%0\ufffe0');
+assert(unescape('%0%uFffe0') === '%0\ufffe0');
+assert(unescape('%0%ufFfe0') === '%0\ufffe0');
+assert(unescape('%0%uffFe0') === '%0\ufffe0');
+assert(unescape('%0%ufffE0') === '%0\ufffe0');
+assert(unescape('%0%uFFFE0') === '%0\ufffe0');
+
+assert(unescape('%0%uffff0') === '%0\uffff0');
+assert(unescape('%0%uFfff0') === '%0\uffff0');
+assert(unescape('%0%ufFff0') === '%0\uffff0');
+assert(unescape('%0%uffFf0') === '%0\uffff0');
+assert(unescape('%0%ufffF0') === '%0\uffff0');
+assert(unescape('%0%uFFFF0') === '%0\uffff0');
--- a/tests/test262-esnext-excludelist.xml
+++ b/tests/test262-esnext-excludelist.xml
@ -1168,9 +1168,6 @@
  <test id="annexB/built-ins/String/prototype/sup/name.js"><reason></reason></test>
  <test id="annexB/built-ins/String/prototype/sup/prop-desc.js"><reason></reason></test>
  <test id="annexB/built-ins/String/prototype/sup/this-val-tostring-err.js"><reason></reason></test>
-  <test id="annexB/built-ins/unescape/four-ignore-bad-u.js"><reason></reason></test>
-  <test id="annexB/built-ins/unescape/four.js"><reason></reason></test>
-  <test id="annexB/built-ins/unescape/two.js"><reason></reason></test>
  <test id="annexB/language/comments/multi-line-html-close.js"><reason></reason></test>
  <test id="annexB/language/comments/single-line-html-close-asi.js"><reason></reason></test>
  <test id="annexB/language/comments/single-line-html-close-unicode-separators.js"><reason></reason></test>