Support \u200C \u200D unicode characters (#3266)

JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik frobert@inf.u-szeged.hu
This commit is contained in:
Robert Fancsik 2019-10-31 11:14:13 +01:00 committed by GitHub
parent 6a342fcdd6
commit eee41ec734
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 49 additions and 10 deletions

View File

@ -171,11 +171,11 @@ static const uint16_t lit_unicode_non_letter_ident_part_interval_sps[] JERRY_ATT
0x17e0, 0x180b, 0x1810, 0x1885, 0x1920, 0x1930, 0x1946, 0x19d0, 0x1a17, 0x1a55,
0x1a60, 0x1a7f, 0x1a90, 0x1ab0, 0x1b00, 0x1b34, 0x1b50, 0x1b6b, 0x1b80, 0x1ba1,
0x1bb0, 0x1be6, 0x1c24, 0x1c40, 0x1c50, 0x1cd0, 0x1cd4, 0x1cf2, 0x1cf8, 0x1dc0,
0x1dfb, 0x203f, 0x20d0, 0x20e5, 0x2cef, 0x2de0, 0x302a, 0x3099, 0xa620, 0xa674,
0xa69e, 0xa6f0, 0xa823, 0xa880, 0xa8b4, 0xa8d0, 0xa8e0, 0xa900, 0xa926, 0xa947,
0xa980, 0xa9b3, 0xa9d0, 0xa9f0, 0xaa29, 0xaa4c, 0xaa50, 0xaa7b, 0xaab2, 0xaab7,
0xaabe, 0xaaeb, 0xaaf5, 0xabe3, 0xabec, 0xabf0, 0xfe00, 0xfe20, 0xfe33, 0xfe4d,
0xff10
0x1dfb, 0x200c, 0x203f, 0x20d0, 0x20e5, 0x2cef, 0x2de0, 0x302a, 0x3099, 0xa620,
0xa674, 0xa69e, 0xa6f0, 0xa823, 0xa880, 0xa8b4, 0xa8d0, 0xa8e0, 0xa900, 0xa926,
0xa947, 0xa980, 0xa9b3, 0xa9d0, 0xa9f0, 0xaa29, 0xaa4c, 0xaa50, 0xaa7b, 0xaab2,
0xaab7, 0xaabe, 0xaaeb, 0xaaf5, 0xabe3, 0xabec, 0xabf0, 0xfe00, 0xfe20, 0xfe33,
0xfe4d, 0xff10
};
/**
@ -201,11 +201,11 @@ static const uint8_t lit_unicode_non_letter_ident_part_interval_lengths[] JERRY_
0x0009, 0x0002, 0x0009, 0x0001, 0x000b, 0x000b, 0x0009, 0x0009, 0x0004, 0x0009,
0x001c, 0x000a, 0x0009, 0x000d, 0x0004, 0x0010, 0x0009, 0x0008, 0x0002, 0x000c,
0x0009, 0x000d, 0x0013, 0x0009, 0x0009, 0x0002, 0x0014, 0x0002, 0x0001, 0x0035,
0x0004, 0x0001, 0x000c, 0x000b, 0x0002, 0x001f, 0x0005, 0x0001, 0x0009, 0x0009,
0x0001, 0x0001, 0x0004, 0x0001, 0x0011, 0x0009, 0x0011, 0x0009, 0x0007, 0x000c,
0x0003, 0x000d, 0x0009, 0x0009, 0x000d, 0x0001, 0x0009, 0x0002, 0x0002, 0x0001,
0x0001, 0x0004, 0x0001, 0x0007, 0x0001, 0x0009, 0x000f, 0x000f, 0x0001, 0x0002,
0x0009
0x0004, 0x0001, 0x0001, 0x000c, 0x000b, 0x0002, 0x001f, 0x0005, 0x0001, 0x0009,
0x0009, 0x0001, 0x0001, 0x0004, 0x0001, 0x0011, 0x0009, 0x0011, 0x0009, 0x0007,
0x000c, 0x0003, 0x000d, 0x0009, 0x0009, 0x000d, 0x0001, 0x0009, 0x0002, 0x0002,
0x0001, 0x0001, 0x0004, 0x0001, 0x0007, 0x0001, 0x0009, 0x000f, 0x000f, 0x0001,
0x0002, 0x0009
};
/**

View File

@ -0,0 +1,31 @@
// Copyright JS Foundation and other contributors, http://js.foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
function checkSyntax (str) {
try {
eval (str);
assert (false);
} catch (e) {
assert (e instanceof SyntaxError);
}
}
// Only \u200C-Zero width non-joiner, and \u200D-Zero width joiner are allowed
// in an identifier part. \u200B and \u200E are the code points directly below
// and above that pair, so identifiers containing them must still be rejected.
checkSyntax ("_\u200b\u200d");
checkSyntax ("_\u200c\u200e");
// Both allowed characters, written as unicode escapes, form a usable identifier.
var _\u200c\u200d = 5;
assert (_\u200c\u200d === 5);

View File

@ -137,6 +137,14 @@ class UnicodeCategorizer(object):
if zero_width_space not in separators:
bisect.insort(separators, int(zero_width_space))
# https://www.ecma-international.org/ecma-262/5.1/#sec-7.1 format-control characters
non_letters = self._categories['non_letters']
zero_width_non_joiner = 0x200C
zero_width_joiner = 0x200D
bisect.insort(non_letters, int(zero_width_non_joiner))
bisect.insort(non_letters, int(zero_width_joiner))
return self._categories['letters'], self._categories['non_letters'], self._categories['separators']