Update RegExp unicode mode case folding to conform to the standard (#4004)

JerryScript-DCO-1.0-Signed-off-by: Dániel Bátyai daniel.batyai@h-lab.eu
This commit is contained in:
Dániel Bátyai 2020-07-20 15:51:43 +02:00 committed by GitHub
parent 33359ac506
commit 321215fdbb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 284 additions and 84 deletions

View File

@ -403,30 +403,43 @@ lit_code_point_t
ecma_regexp_canonicalize_char (lit_code_point_t ch, /**< character */
bool unicode) /**< unicode */
{
if (JERRY_LIKELY (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX))
#if ENABLED (JERRY_ESNEXT)
if (unicode)
{
if (ch >= LIT_CHAR_LOWERCASE_A && ch <= LIT_CHAR_LOWERCASE_Z)
/* In unicode mode the mappings contained in the CaseFolding.txt file should be used to canonicalize the character.
* These mappings generally correspond to the lowercase variant of the character, however there are some
* differences. In some cases the uppercase variant is used, in others the lowercase of the uppercase character is
* used, and there are also cases where the character has no case folding mapping even though it has upper/lower
* variants. Since lowercasing is the most common this is used as the default behaviour, and characters with
* differing behaviours are encoded in lookup tables. */
if (lit_char_fold_to_upper (ch))
{
return (ecma_char_t) (ch - (LIT_CHAR_LOWERCASE_A - LIT_CHAR_UPPERCASE_A));
ch = lit_char_to_upper_case (ch, NULL);
JERRY_ASSERT (ch != LIT_MULTIPLE_CU);
}
if (lit_char_fold_to_lower (ch))
{
ch = lit_char_to_lower_case (ch, NULL);
JERRY_ASSERT (ch != LIT_MULTIPLE_CU);
}
return ch;
}
#endif /* ENABLED (JERRY_ESNEXT) */
JERRY_UNUSED (unicode);
lit_code_point_t cu = lit_char_to_upper_case (ch, NULL);
if (cu == LIT_MULTIPLE_CU)
if (ch <= LIT_UTF8_1_BYTE_CODE_POINT_MAX
|| (cu > LIT_UTF8_1_BYTE_CODE_POINT_MAX
&& cu != LIT_MULTIPLE_CU))
{
return ch;
return cu;
}
if (cu <= LIT_UTF8_1_BYTE_CODE_POINT_MAX && !unicode)
{
/* 6. */
return ch;
}
return cu;
return ch;
} /* ecma_regexp_canonicalize_char */
/**

View File

@ -23,6 +23,9 @@
#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
#include "lit-unicode-conversions.inc.h"
#include "lit-unicode-conversions-sup.inc.h"
#if ENABLED (JERRY_ESNEXT)
#include "lit-unicode-folding.inc.h"
#endif /* ENABLED (JERRY_ESNEXT) */
#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
#define NUM_OF_ELEMENTS(array) (sizeof (array) / sizeof ((array)[0]))
@ -914,3 +917,51 @@ lit_char_to_upper_case (lit_code_point_t cp, /**< code point */
return cp;
#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
} /* lit_char_to_upper_case */
#if ENABLED (JERRY_ESNEXT)
/**
 * Look up whether the character should be folded to the lowercase variant.
 *
 * Code points above LIT_UTF16_CODE_UNIT_MAX are always reported as foldable to lowercase. Any other
 * code point is foldable to lowercase unless it is listed in the `folding_skip_to_lower` interval or
 * single character tables.
 *
 * @return true, if character should be lowercased
 *         false, otherwise
 */
bool
lit_char_fold_to_lower (lit_code_point_t cp) /**< code point */
{
#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
  if (cp > LIT_UTF16_CODE_UNIT_MAX)
  {
    /* The skip tables are searched with a 16 bit code unit, so larger code points are never looked up. */
    return true;
  }

  const ecma_char_t code_unit = (ecma_char_t) cp;

  if (lit_search_char_in_interval_array (code_unit,
                                         lit_unicode_folding_skip_to_lower_interval_starts,
                                         lit_unicode_folding_skip_to_lower_interval_lengths,
                                         NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_interval_starts)))
  {
    return false;
  }

  return !lit_search_char_in_array (code_unit,
                                    lit_unicode_folding_skip_to_lower_chars,
                                    NUM_OF_ELEMENTS (lit_unicode_folding_skip_to_lower_chars));
#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
  /* Without the lookup tables the argument is not inspected; silence the unused parameter warning. */
  (void) cp;
  return true;
#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
} /* lit_char_fold_to_lower */
/**
 * Look up whether the character should be folded to the uppercase variant.
 *
 * Only code points up to LIT_UTF16_CODE_UNIT_MAX can qualify, and only when they are listed in the
 * `folding_to_upper` interval or single character tables.
 *
 * @return true, if character should be uppercased
 *         false, otherwise
 */
bool
lit_char_fold_to_upper (lit_code_point_t cp) /**< code point */
{
#if ENABLED (JERRY_UNICODE_CASE_CONVERSION)
  if (cp > LIT_UTF16_CODE_UNIT_MAX)
  {
    /* The tables are searched with a 16 bit code unit, so larger code points are never looked up. */
    return false;
  }

  const ecma_char_t code_unit = (ecma_char_t) cp;

  if (lit_search_char_in_interval_array (code_unit,
                                         lit_unicode_folding_to_upper_interval_starts,
                                         lit_unicode_folding_to_upper_interval_lengths,
                                         NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_interval_starts)))
  {
    return true;
  }

  return lit_search_char_in_array (code_unit,
                                   lit_unicode_folding_to_upper_chars,
                                   NUM_OF_ELEMENTS (lit_unicode_folding_to_upper_chars));
#else /* !ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
  /* Without the lookup tables the argument is not inspected; silence the unused parameter warning. */
  (void) cp;
  return false;
#endif /* ENABLED (JERRY_UNICODE_CASE_CONVERSION) */
} /* lit_char_fold_to_upper */
#endif /* ENABLED (JERRY_ESNEXT) */

View File

@ -248,4 +248,9 @@ bool lit_char_is_word_char (lit_code_point_t c);
lit_code_point_t lit_char_to_lower_case (lit_code_point_t cp, ecma_stringbuilder_t *builder_p);
lit_code_point_t lit_char_to_upper_case (lit_code_point_t cp, ecma_stringbuilder_t *builder_p);
#if ENABLED (JERRY_ESNEXT)
bool lit_char_fold_to_lower (lit_code_point_t cp);
bool lit_char_fold_to_upper (lit_code_point_t cp);
#endif /* ENABLED (JERRY_ESNEXT) */
#endif /* !LIT_CHAR_HELPERS_H */

View File

@ -96,68 +96,61 @@ static const uint8_t lit_unicode_upper_case_special_range_lengths[] JERRY_ATTR_C
/* Contains start points of lowercase ranges. */
static const uint16_t lit_unicode_lower_case_ranges[] JERRY_ATTR_CONST_DATA =
{
0x1e96, 0x1e96, 0x1f80, 0x1f80, 0x1f88, 0x1f80, 0x1f90, 0x1f90, 0x1f98, 0x1f90,
0x1fa0, 0x1fa0, 0x1fa8, 0x1fa0, 0x1fb2, 0x1fb2, 0x1fb6, 0x1fb6, 0x1fc2, 0x1fc2,
0x1fc6, 0x1fc6, 0x1fd2, 0x1fd2, 0x1fd6, 0x1fd6, 0x1fe2, 0x1fe2, 0x1fe6, 0x1fe6,
0x1ff2, 0x1ff2, 0x1ff6, 0x1ff6, 0xfb00, 0xfb00, 0xfb13, 0xfb13
0x1f88, 0x1f80, 0x1f98, 0x1f90, 0x1fa8, 0x1fa0
};
/* Interval lengths for start points in `lower_case_ranges` table. */
static const uint8_t lit_unicode_lower_case_range_lengths[] JERRY_ATTR_CONST_DATA =
{
0x0005, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0003, 0x0002, 0x0003,
0x0002, 0x0002, 0x0002, 0x0003, 0x0002, 0x0003, 0x0002, 0x0007, 0x0005
0x0008, 0x0008, 0x0008
};
/* The remaining lowercase conversions. The lowercase variant can be one-to-three character long. */
static const uint16_t lit_unicode_lower_case_conversions[] JERRY_ATTR_CONST_DATA =
{
0x00df, 0x00df, 0x0149, 0x0149, 0x01c5, 0x01c6, 0x01c8, 0x01c9, 0x01cb, 0x01cc,
0x01f0, 0x01f0, 0x01f2, 0x01f3, 0x0390, 0x0390, 0x03b0, 0x03b0, 0x03f4, 0x03b8,
0x0587, 0x0587, 0x1e9e, 0x00df, 0x1f50, 0x1f50, 0x1f52, 0x1f52, 0x1f54, 0x1f54,
0x1f56, 0x1f56, 0x1fbc, 0x1fb3, 0x1fcc, 0x1fc3, 0x1ffc, 0x1ff3, 0x2126, 0x03c9,
0x01c5, 0x01c6, 0x01c8, 0x01c9, 0x01cb, 0x01cc, 0x01f2, 0x01f3, 0x03f4, 0x03b8,
0x1e9e, 0x00df, 0x1fbc, 0x1fb3, 0x1fcc, 0x1fc3, 0x1ffc, 0x1ff3, 0x2126, 0x03c9,
0x212a, 0x006b, 0x212b, 0x00e5, 0x0130, 0x0069, 0x0307
};
/* Number of one-to-one, one-to-two, and one-to-three lowercase conversions. */
static const uint8_t lit_unicode_lower_case_conversion_counters[] JERRY_ATTR_CONST_DATA =
{
0x0016, 0x0001, 0x0000
0x000c, 0x0001, 0x0000
};
/* The remaining uppercase conversions. The uppercase variant can be one-to-three character long. */
static const uint16_t lit_unicode_upper_case_conversions[] JERRY_ATTR_CONST_DATA =
{
0x00b5, 0x039c, 0x0130, 0x0130, 0x0131, 0x0049, 0x017f, 0x0053, 0x01c5, 0x01c4,
0x01c8, 0x01c7, 0x01cb, 0x01ca, 0x01f2, 0x01f1, 0x0345, 0x0399, 0x03c2, 0x03a3,
0x03d0, 0x0392, 0x03d1, 0x0398, 0x03d5, 0x03a6, 0x03d6, 0x03a0, 0x03f0, 0x039a,
0x03f1, 0x03a1, 0x03f5, 0x0395, 0x1c80, 0x0412, 0x1c81, 0x0414, 0x1c82, 0x041e,
0x1c83, 0x0421, 0x1c84, 0x0422, 0x1c85, 0x0422, 0x1c86, 0x042a, 0x1c87, 0x0462,
0x1c88, 0xa64a, 0x1e9b, 0x1e60, 0x1fbe, 0x0399, 0x00df, 0x0053, 0x0053, 0x0149,
0x02bc, 0x004e, 0x01f0, 0x004a, 0x030c, 0x0587, 0x0535, 0x0552, 0x1e96, 0x0048,
0x0331, 0x1e97, 0x0054, 0x0308, 0x1e98, 0x0057, 0x030a, 0x1e99, 0x0059, 0x030a,
0x1e9a, 0x0041, 0x02be, 0x1f50, 0x03a5, 0x0313, 0x1f87, 0x1f0f, 0x0399, 0x1f8f,
0x1f0f, 0x0399, 0x1f97, 0x1f2f, 0x0399, 0x1f9f, 0x1f2f, 0x0399, 0x1fa7, 0x1f6f,
0x0399, 0x1faf, 0x1f6f, 0x0399, 0x1fb2, 0x1fba, 0x0399, 0x1fb3, 0x0391, 0x0399,
0x1fb4, 0x0386, 0x0399, 0x1fb6, 0x0391, 0x0342, 0x1fbc, 0x0391, 0x0399, 0x1fc2,
0x1fca, 0x0399, 0x1fc3, 0x0397, 0x0399, 0x1fc4, 0x0389, 0x0399, 0x1fc6, 0x0397,
0x0342, 0x1fcc, 0x0397, 0x0399, 0x1fd6, 0x0399, 0x0342, 0x1fe4, 0x03a1, 0x0313,
0x1fe6, 0x03a5, 0x0342, 0x1ff2, 0x1ffa, 0x0399, 0x1ff3, 0x03a9, 0x0399, 0x1ff4,
0x038f, 0x0399, 0x1ff6, 0x03a9, 0x0342, 0x1ffc, 0x03a9, 0x0399, 0xfb00, 0x0046,
0x0046, 0xfb01, 0x0046, 0x0049, 0xfb02, 0x0046, 0x004c, 0xfb05, 0x0053, 0x0054,
0xfb06, 0x0053, 0x0054, 0xfb13, 0x0544, 0x0546, 0xfb14, 0x0544, 0x0535, 0xfb15,
0x0544, 0x053b, 0xfb16, 0x054e, 0x0546, 0xfb17, 0x0544, 0x053d, 0x0390, 0x0399,
0x0308, 0x0301, 0x03b0, 0x03a5, 0x0308, 0x0301, 0x1f52, 0x03a5, 0x0313, 0x0300,
0x1f54, 0x03a5, 0x0313, 0x0301, 0x1f56, 0x03a5, 0x0313, 0x0342, 0x1fb7, 0x0391,
0x0342, 0x0399, 0x1fc7, 0x0397, 0x0342, 0x0399, 0x1fd2, 0x0399, 0x0308, 0x0300,
0x1fd3, 0x0399, 0x0308, 0x0301, 0x1fd7, 0x0399, 0x0308, 0x0342, 0x1fe2, 0x03a5,
0x0308, 0x0300, 0x1fe3, 0x03a5, 0x0308, 0x0301, 0x1fe7, 0x03a5, 0x0308, 0x0342,
0x1ff7, 0x03a9, 0x0342, 0x0399, 0xfb03, 0x0046, 0x0046, 0x0049, 0xfb04, 0x0046,
0x0046, 0x004c
0x00b5, 0x039c, 0x0131, 0x0049, 0x017f, 0x0053, 0x01c5, 0x01c4, 0x01c8, 0x01c7,
0x01cb, 0x01ca, 0x01f2, 0x01f1, 0x0345, 0x0399, 0x03c2, 0x03a3, 0x03d0, 0x0392,
0x03d1, 0x0398, 0x03d5, 0x03a6, 0x03d6, 0x03a0, 0x03f0, 0x039a, 0x03f1, 0x03a1,
0x03f5, 0x0395, 0x1c80, 0x0412, 0x1c81, 0x0414, 0x1c82, 0x041e, 0x1c83, 0x0421,
0x1c84, 0x0422, 0x1c85, 0x0422, 0x1c86, 0x042a, 0x1c87, 0x0462, 0x1c88, 0xa64a,
0x1e9b, 0x1e60, 0x1fbe, 0x0399, 0x00df, 0x0053, 0x0053, 0x0149, 0x02bc, 0x004e,
0x01f0, 0x004a, 0x030c, 0x0587, 0x0535, 0x0552, 0x1e96, 0x0048, 0x0331, 0x1e97,
0x0054, 0x0308, 0x1e98, 0x0057, 0x030a, 0x1e99, 0x0059, 0x030a, 0x1e9a, 0x0041,
0x02be, 0x1f50, 0x03a5, 0x0313, 0x1f87, 0x1f0f, 0x0399, 0x1f8f, 0x1f0f, 0x0399,
0x1f97, 0x1f2f, 0x0399, 0x1f9f, 0x1f2f, 0x0399, 0x1fa7, 0x1f6f, 0x0399, 0x1faf,
0x1f6f, 0x0399, 0x1fb2, 0x1fba, 0x0399, 0x1fb3, 0x0391, 0x0399, 0x1fb4, 0x0386,
0x0399, 0x1fb6, 0x0391, 0x0342, 0x1fbc, 0x0391, 0x0399, 0x1fc2, 0x1fca, 0x0399,
0x1fc3, 0x0397, 0x0399, 0x1fc4, 0x0389, 0x0399, 0x1fc6, 0x0397, 0x0342, 0x1fcc,
0x0397, 0x0399, 0x1fd6, 0x0399, 0x0342, 0x1fe4, 0x03a1, 0x0313, 0x1fe6, 0x03a5,
0x0342, 0x1ff2, 0x1ffa, 0x0399, 0x1ff3, 0x03a9, 0x0399, 0x1ff4, 0x038f, 0x0399,
0x1ff6, 0x03a9, 0x0342, 0x1ffc, 0x03a9, 0x0399, 0xfb00, 0x0046, 0x0046, 0xfb01,
0x0046, 0x0049, 0xfb02, 0x0046, 0x004c, 0xfb05, 0x0053, 0x0054, 0xfb06, 0x0053,
0x0054, 0xfb13, 0x0544, 0x0546, 0xfb14, 0x0544, 0x0535, 0xfb15, 0x0544, 0x053b,
0xfb16, 0x054e, 0x0546, 0xfb17, 0x0544, 0x053d, 0x0390, 0x0399, 0x0308, 0x0301,
0x03b0, 0x03a5, 0x0308, 0x0301, 0x1f52, 0x03a5, 0x0313, 0x0300, 0x1f54, 0x03a5,
0x0313, 0x0301, 0x1f56, 0x03a5, 0x0313, 0x0342, 0x1fb7, 0x0391, 0x0342, 0x0399,
0x1fc7, 0x0397, 0x0342, 0x0399, 0x1fd2, 0x0399, 0x0308, 0x0300, 0x1fd3, 0x0399,
0x0308, 0x0301, 0x1fd7, 0x0399, 0x0308, 0x0342, 0x1fe2, 0x03a5, 0x0308, 0x0300,
0x1fe3, 0x03a5, 0x0308, 0x0301, 0x1fe7, 0x03a5, 0x0308, 0x0342, 0x1ff7, 0x03a9,
0x0342, 0x0399, 0xfb03, 0x0046, 0x0046, 0x0049, 0xfb04, 0x0046, 0x0046, 0x004c
};
/* Number of one-to-one, one-to-two, and one-to-three uppercase conversions. */
static const uint8_t lit_unicode_upper_case_conversion_counters[] JERRY_ATTR_CONST_DATA =
{
0x001c, 0x002c, 0x0010
0x001b, 0x002c, 0x0010
};

View File

@ -0,0 +1,65 @@
/* Copyright JS Foundation and other contributors, http://js.foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* This file is automatically generated by the gen-unicode.py script
* from the CaseFolding.txt file. Do not edit! */
/**
* Character interval starting points for folding_skip_to_lower.
*/
static const uint16_t lit_unicode_folding_skip_to_lower_interval_starts[] JERRY_ATTR_CONST_DATA =
{
0x13a0, 0x13f8, 0xab70
};
/**
* Character interval lengths for folding_skip_to_lower.
*/
static const uint8_t lit_unicode_folding_skip_to_lower_interval_lengths[] JERRY_ATTR_CONST_DATA =
{
0x0055, 0x0005, 0x004f
};
/**
* Non-interval characters for folding_skip_to_lower.
*/
static const uint16_t lit_unicode_folding_skip_to_lower_chars[] JERRY_ATTR_CONST_DATA =
{
0x0130
};
/**
* Character interval starting points for folding_to_upper.
*/
static const uint16_t lit_unicode_folding_to_upper_interval_starts[] JERRY_ATTR_CONST_DATA =
{
0x03d0, 0x03d5, 0x03f0, 0x13f8, 0x1c80, 0xab70
};
/**
* Character interval lengths for folding_to_upper.
*/
static const uint8_t lit_unicode_folding_to_upper_interval_lengths[] JERRY_ATTR_CONST_DATA =
{
0x0001, 0x0001, 0x0001, 0x0005, 0x0008, 0x004f
};
/**
* Non-interval characters for folding_to_upper.
*/
static const uint16_t lit_unicode_folding_to_upper_chars[] JERRY_ATTR_CONST_DATA =
{
0x00b5, 0x017f, 0x0345, 0x03c2, 0x03f5, 0x1e9b, 0x1fbe
};

View File

@ -359,3 +359,8 @@ try {
} catch (e) {
assert (e instanceof SyntaxError);
}
// With the "iu" flags, U+017F (long s) and U+212A (Kelvin sign) must match \w and, for the Kelvin sign, a plain "k".
assert(/\w/iu.test("ſ"));
assert(/\w/iu.test("\u212a"));
assert(/k/iu.test("\u212a"));
// Case-insensitive unicode matching must also work for code points above U+FFFF.
assert(/\u{10c90}/iu.test("\u{10cd0}"));

View File

@ -338,7 +338,6 @@
<test id="language/expressions/tagged-template/cache-identical-source-new-function.js"><reason></reason></test>
<test id="language/expressions/tagged-template/constructor-invocation.js"><reason></reason></test>
<test id="language/expressions/template-literal/invalid-legacy-octal-escape-sequence.js"><reason></reason></test>
<test id="language/literals/regexp/u-case-mapping.js"><reason></reason></test>
<test id="language/literals/string/7.8.4-1-s.js"><reason></reason></test>
<test id="language/module-code/export-unresolvable.js"><reason></reason></test>
<test id="language/statements/class/definition/methods.js"><reason></reason></test>

View File

@ -27,10 +27,18 @@ from gen_c_source import LICENSE, format_code
from settings import PROJECT_DIR
UNICODE_DATA_FILE = 'UnicodeData.txt'
SPECIAL_CASING_FILE = 'SpecialCasing.txt'
DERIVED_PROPS_FILE = 'DerivedCoreProperties.txt'
PROP_LIST_FILE = 'PropList.txt'
CASE_FOLDING_FILE = 'CaseFolding.txt'
RANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h')
RANGES_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges-sup.inc.h')
CONVERSIONS_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions.inc.h')
CONVERSIONS_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-conversions-sup.inc.h')
FOLDING_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-folding.inc.h')
FOLDING_SUP_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-folding-sup.inc.h')
UNICODE_PLANE_TYPE_BASIC = 0
UNICODE_PLANE_TYPE_SUPPLEMENTARY = 1
@ -266,11 +274,14 @@ class UnicodeBasicCategorizer(object):
if not self.in_range(letter_id) or condition_list:
continue
original_letter = parse_unicode_sequence(line[0])
small_letter = parse_unicode_sequence(line[1])
capital_letter = parse_unicode_sequence(line[3])
lower_case_mapping[letter_id] = small_letter
upper_case_mapping[letter_id] = capital_letter
if small_letter != original_letter:
lower_case_mapping[letter_id] = small_letter
if capital_letter != original_letter:
upper_case_mapping[letter_id] = capital_letter
return lower_case_mapping, upper_case_mapping
@ -292,12 +303,13 @@ def generate_ranges(script_args, plane_type):
categorizer = UnicodeBasicCategorizer()
header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
" * from %s. Do not edit! */" % os.path.basename(script_args.derived_core_properties),
" * from %s. Do not edit! */" % (DERIVED_PROPS_FILE),
""]
c_source.complete_header("\n".join(header_completion))
units = categorizer.read_units(script_args.derived_core_properties, ["ID_Start", "ID_Continue"])
derived_props_path = os.path.join(script_args.unicode_dir, DERIVED_PROPS_FILE)
units = categorizer.read_units(derived_props_path, ["ID_Start", "ID_Continue"])
units["ID_Continue"] = sorted(set(units["ID_Continue"]).union(categorizer.extra_id_continue_units)
- set(units["ID_Start"]))
@ -305,7 +317,9 @@ def generate_ranges(script_args, plane_type):
for category, unit in units.items():
c_source.add_range(category, categorizer.create_tables(unit))
white_space_units = categorizer.read_units(script_args.prop_list, ["White_Space"], ["Zs"])["White_Space"]
prop_list_path = os.path.join(script_args.unicode_dir, PROP_LIST_FILE)
white_space_units = categorizer.read_units(prop_list_path, ["White_Space"], ["Zs"])["White_Space"]
c_source.add_whitepace_range("White_Space", categorizer, white_space_units)
@ -314,6 +328,19 @@ def generate_ranges(script_args, plane_type):
# functions for unicode conversions
def make_char(hex_val):
    """
    Convert a numeric code point into the matching unicode character.

    :param hex_val: Numeric value of the character.
    :return: Unicode character corresponding to the value.
    """
    try:
        converter = unichr
    except NameError:
        # unichr is not defined in this interpreter, fall back to chr
        converter = chr

    return converter(hex_val)
def parse_unicode_sequence(raw_data):
"""
@ -331,10 +358,7 @@ def parse_unicode_sequence(raw_data):
# Convert it to unicode code point (from hex value without 0x prefix)
hex_val = int(unicode_char, 16)
try:
result += unichr(hex_val)
except NameError:
result += chr(hex_val)
result += make_char(hex_val)
return result
@ -637,17 +661,17 @@ def generate_conversions(script_args, plane_type):
c_source = UnicodeBasicSource(CONVERSIONS_C_SOURCE)
categorizer = UnicodeBasicCategorizer()
unicode_file = os.path.basename(script_args.unicode_data)
spec_casing_file = os.path.basename(script_args.special_casing)
header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
" * from %s and %s files. Do not edit! */" % (unicode_file, spec_casing_file),
" * from %s and %s files. Do not edit! */" % (UNICODE_DATA_FILE, SPECIAL_CASING_FILE),
""]
c_source.complete_header("\n".join(header_completion))
unicode_data_path = os.path.join(script_args.unicode_dir, UNICODE_DATA_FILE)
special_casing_path = os.path.join(script_args.unicode_dir, SPECIAL_CASING_FILE)
# Read the corresponding unicode values of lower and upper case letters and store these in tables
lower_case, upper_case = categorizer.read_case_mappings(script_args.unicode_data, script_args.special_casing)
lower_case, upper_case = categorizer.read_case_mappings(unicode_data_path, special_casing_path)
c_source.add_conversion_range("character_case",
extract_ranges(lower_case, upper_case),
@ -702,34 +726,76 @@ def generate_conversions(script_args, plane_type):
c_source.generate()
def generate_folding(script_args, plane_type):
"""
Generate the case folding lookup tables ('folding_skip_to_lower' and 'folding_to_upper').

:param script_args: Parsed command line arguments; only the unicode_dir attribute is read here.
:param plane_type: UNICODE_PLANE_TYPE_SUPPLEMENTARY selects the supplementary source and categorizer,
                   any other value selects the basic ones.
"""
if plane_type == UNICODE_PLANE_TYPE_SUPPLEMENTARY:
c_source = UnicodeSupplementarySource(FOLDING_SUP_C_SOURCE)
categorizer = UnicodeSupplementaryCategorizer()
else:
c_source = UnicodeBasicSource(FOLDING_C_SOURCE)
categorizer = UnicodeBasicCategorizer()
header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
" * from the %s file. Do not edit! */" % (CASE_FOLDING_FILE),
""]
c_source.complete_header("\n".join(header_completion))
# All input files are looked up by their fixed names inside the unicode data directory
unicode_data_path = os.path.join(script_args.unicode_dir, UNICODE_DATA_FILE)
special_casing_path = os.path.join(script_args.unicode_dir, SPECIAL_CASING_FILE)
case_folding_path = os.path.join(script_args.unicode_dir, CASE_FOLDING_FILE)
# Read the corresponding unicode values of lower and upper case letters and store these in tables
lower_case, upper_case = categorizer.read_case_mappings(unicode_data_path, special_casing_path)
# Maps a code point to its folded character sequence
folding = {}
with open(case_folding_path, 'r') as case_folding:
# Data lines are matched as '<code point>; <type>; <folding>;', only entries of type 'S' or 'C' are kept
case_folding_re = re.compile(r'(?P<code_point>[^;]*);\s*(?P<type>[^;]*);\s*(?P<folding>[^;]*);')
for line in case_folding:
match = case_folding_re.match(line)
if match and match.group('type') in ('S', 'C'):
code_point = int(match.group('code_point'), 16)
if categorizer.in_range(code_point):
folding[code_point] = parse_unicode_sequence(match.group('folding'))
should_to_upper = []
should_skip_to_lower = []
# Code points that have a lowercase mapping but no folding entry are recorded as 'skip to lower'
for code_point in lower_case:
if code_point not in folding:
should_skip_to_lower.append(code_point)
for code_point, folded in folding.items():
# The folding differs from the lowercase mapping (or from the character itself if it has no such mapping)
if lower_case.get(code_point, make_char(code_point)) != folded:
should_to_upper.append(code_point)
# Code points whose uppercase mapping equals the folding are recorded as 'skip to lower'
if upper_case.get(code_point, '') == folded:
should_skip_to_lower.append(code_point)
c_source.add_range('folding_skip_to_lower', categorizer.create_tables(should_skip_to_lower))
c_source.add_range('folding_to_upper', categorizer.create_tables(should_to_upper))
c_source.generate()
# entry point
def main():
parser = argparse.ArgumentParser(description='lit-unicode-{conversions,ranges}-{sup}.inc.h generator',
epilog='''
The input files:
- UnicodeData.txt
- SpecialCasing.txt
- DerivedCoreProperties.txt
- PropList.txt
must be retrieved from
http://www.unicode.org/Public/<VERSION>/ucd/.
The input data must be retrieved from
http://www.unicode.org/Public/<VERSION>/ucd/UCD.zip.
The last known good version is 13.0.0.
''')
def check_file(path):
if not os.path.isfile(path) or not os.access(path, os.R_OK):
raise argparse.ArgumentTypeError('The %s file is missing or not readable!' % path)
def check_dir(path):
if not os.path.isdir(path) or not os.access(path, os.R_OK):
raise argparse.ArgumentTypeError('The %s directory does not exist or is not readable!' % path)
return path
parser.add_argument('--unicode-data', metavar='FILE', action='store', required=True,
type=check_file, help='specify the unicode data file')
parser.add_argument('--special-casing', metavar='FILE', action='store', required=True,
type=check_file, help='specify the special casing file')
parser.add_argument('--prop-list', metavar='FILE', action='store', required=True,
type=check_file, help='specify the prop list file')
parser.add_argument('--derived-core-properties', metavar='FILE', action='store', required=True,
type=check_file, help='specify the DerivedCodeProperties file')
parser.add_argument('--unicode-dir', metavar='DIR', action='store', required=True,
type=check_dir, help='specify the unicode data directory')
script_args = parser.parse_args()
@ -737,6 +803,9 @@ def main():
generate_ranges(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY)
generate_conversions(script_args, UNICODE_PLANE_TYPE_BASIC)
generate_conversions(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY)
generate_folding(script_args, UNICODE_PLANE_TYPE_BASIC)
# There are currently no code points in the supplementary planes that require special folding
# generate_folding(script_args, UNICODE_PLANE_TYPE_SUPPLEMENTARY)
if __name__ == "__main__":

View File

@ -310,7 +310,7 @@ max-args=6
ignored-argument-names=_.*
# Maximum number of locals for function / method body
max-locals=15
max-locals=20
# Maximum number of return / yield for function / method body
max-returns=6