#!/usr/bin/env python

# Copyright JS Foundation and other contributors, http://js.foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
#  http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
#

# unicode categories:      Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
# letter:                  Lu Ll Lt Lm Lo Nl
# non-letter-ident-part:
#   digit:                 Nd
#   combining mark:        Mn Mc
#   connector punctuation: Pc
# separators:              Zs

from __future__ import print_function
from settings import PROJECT_DIR
from unicode_c_source import Source

import argparse
import bisect
import csv
import itertools
import os
import sys

# Default location of the generated C table file inside the project tree.
RANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h')


def main():
    """
    Generate the unicode range tables from a UnicodeData.txt style file.

    Parses the command line, reads the letter, non-letter identifier part
    and separator code points from the data file, converts each category
    into (interval starting points, interval lengths, single chars) tables
    and hands them to the `Source` helper, which is asked to generate the
    output file. Exits with status 1 if the data file is missing or not
    readable.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('unicode_data',
                        metavar='FILE',
                        action='store',
                        help='specify the unicode data file')

    parser.add_argument('--c-source',
                        metavar='FILE',
                        action='store',
                        default=RANGES_C_SOURCE,
                        help='specify the output c source (default: %(default)s)')

    script_args = parser.parse_args()

    if not os.path.isfile(script_args.unicode_data) or not os.access(script_args.unicode_data, os.R_OK):
        print('The %s file is missing or not readable!' % script_args.unicode_data)
        sys.exit(1)

    letters, non_letters, separators = read_categories(script_args.unicode_data)

    # Each *_tables value is a (starting points, lengths, single chars) triple.
    letter_tables = split_list(list(ranges(letters)))
    non_letter_tables = split_list(list(ranges(non_letters)))
    separator_tables = split_list(list(ranges(separators)))

    c_source = Source(script_args.c_source)

    header_completion = ["/* This file is automatically generated by the %s script" % os.path.basename(__file__),
                         " * from %s. Do not edit! */" % os.path.basename(script_args.unicode_data),
                         ""]

    c_source.complete_header("\n".join(header_completion))

    c_source.add_table(letter_tables[0],
                       "unicode_letter_interval_sps",
                       "uint16_t",
                       ("/**\n"
                        " * Character interval starting points for the unicode letters.\n"
                        " *\n"
                        " * The characters covered by these intervals are from\n"
                        " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n"
                        " */"))

    c_source.add_table(letter_tables[1],
                       "unicode_letter_interval_lengths",
                       "uint8_t",
                       ("/**\n"
                        " * Character lengths for the unicode letters.\n"
                        " *\n"
                        " * The characters covered by these intervals are from\n"
                        " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl\n"
                        " */"))

    c_source.add_table(letter_tables[2],
                       "unicode_letter_chars",
                       "uint16_t",
                       ("/**\n"
                        " * Those unicode letter characters that are not inside any of\n"
                        " * the intervals specified in jerry_unicode_letter_interval_sps array.\n"
                        " *\n"
                        " * The characters are from the following Unicode categories:\n"
                        " * Lu, Ll, Lt, Lm, Lo, Nl\n"
                        " */"))

    c_source.add_table(non_letter_tables[0],
                       "unicode_non_letter_ident_part_interval_sps",
                       "uint16_t",
                       ("/**\n"
                        " * Character interval starting points for non-letter character\n"
                        " * that can be used as a non-first character of an identifier.\n"
                        " *\n"
                        " * The characters covered by these intervals are from\n"
                        " * the following Unicode categories: Nd, Mn, Mc, Pc\n"
                        " */"))

    c_source.add_table(non_letter_tables[1],
                       "unicode_non_letter_ident_part_interval_lengths",
                       "uint8_t",
                       ("/**\n"
                        " * Character interval lengths for non-letter character\n"
                        " * that can be used as a non-first character of an identifier.\n"
                        " *\n"
                        " * The characters covered by these intervals are from\n"
                        " * the following Unicode categories: Nd, Mn, Mc, Pc\n"
                        " */"))

    c_source.add_table(non_letter_tables[2],
                       "unicode_non_letter_ident_part_chars",
                       "uint16_t",
                       ("/**\n"
                        " * Those non-letter characters that can be used as a non-first\n"
                        " * character of an identifier and not included in any of the intervals\n"
                        " * specified in jerry_unicode_non_letter_ident_part_interval_sps array.\n"
                        " *\n"
                        " * The characters are from the following Unicode categories:\n"
                        " * Nd, Mn, Mc, Pc\n"
                        " */"))

    c_source.add_table(separator_tables[0],
                       "unicode_separator_char_interval_sps",
                       "uint16_t",
                       ("/**\n"
                        " * Unicode separator character interval starting points from Unicode category: Zs\n"
                        " */"))

    c_source.add_table(separator_tables[1],
                       "unicode_separator_char_interval_lengths",
                       "uint8_t",
                       ("/**\n"
                        " * Unicode separator character interval lengths from Unicode category: Zs\n"
                        " */"))

    c_source.add_table(separator_tables[2],
                       "unicode_separator_chars",
                       "uint16_t",
                       ("/**\n"
                        " * Unicode separator characters that are not in the\n"
                        " * jerry_unicode_separator_char_intervals array.\n"
                        " *\n"
                        " * Unicode category: Zs\n"
                        " */"))

    c_source.generate()


def read_categories(unicode_data_file):
    """
    Read the corresponding unicode values and store them in category lists.

    Only code points in the range [0x80, 0xFFFF] are considered; ASCII and
    the supplementary planes are skipped. The file is expected to be
    ';' separated with the code point (hex) in the first column and the
    general category in the third column.

    :param unicode_data_file: Path of the unicode data file.
    :return: List of letters, non_letter and separators.
    """

    letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
    non_letter_category = ["Nd", "Mn", "Mc", "Pc"]
    separator_category = ["Zs"]

    letters = []
    non_letters = []
    separators = []

    with open(unicode_data_file) as unicode_data:
        unicode_data_reader = csv.reader(unicode_data, delimiter=';')

        for line in unicode_data_reader:
            # csv.reader yields an empty list for blank lines; skip them
            # instead of failing on line[0].
            if not line:
                continue

            unicode_id = int(line[0], 16)

            # Skip supplementary planes and ascii chars
            if unicode_id >= 0x10000 or unicode_id < 128:
                continue

            category = line[2]

            if category in letter_category:
                letters.append(unicode_id)
            elif category in non_letter_category:
                non_letters.append(unicode_id)
            elif category in separator_category:
                separators.append(unicode_id)

    # This separator char is handled separately
    non_breaking_space = 0x00A0
    if non_breaking_space in separators:
        separators.remove(non_breaking_space)

    # These separator chars are not in the unicode data file or not in Zs category
    mongolian_vowel_separator = 0x180E
    medium_mathematical_space = 0x205F
    zero_width_space = 0x200B

    # insort keeps the list ordered, which ranges() relies on.
    for extra_separator in (mongolian_vowel_separator, medium_mathematical_space, zero_width_space):
        if extra_separator not in separators:
            bisect.insort(separators, extra_separator)

    return letters, non_letters, separators


def ranges(i):
    """
    Convert an increasing list of integers into a range list

    Consecutive integers share the same (value - index) difference, so
    grouping by that difference yields one group per run.

    :param i: Increasing list of integers.
    :return: List of ranges as (first, last) pairs, both ends inclusive.
    """
    for _, group in itertools.groupby(enumerate(i), lambda q: (q[1] - q[0])):
        group = list(group)
        yield group[0][1], group[-1][1]


def split_list(category_list):
    """
    Split list of ranges into intervals and single char lists.

    An interval (start, length) covers the code points start..start + length
    inclusive. Lengths are limited to 255, so longer ranges are cut into
    pieces of 256 code points each. If exactly one code point remains after
    cutting, it is stored as a single char so that the last code point of
    the range is never lost.

    :param category_list: List of (first, last) inclusive ranges.
    :return: List of interval starting points, interval lengths and single chars
    """

    interval_sps = []
    interval_lengths = []
    chars = []

    for element in category_list:
        first, last = element[0], element[1]

        # Emit full pieces (length 255 == 256 code points) while the rest
        # of the range does not fit into a single interval.
        while last - first > 255:
            interval_sps.append(first)
            interval_lengths.append(255)
            first += 256

        if first == last:
            # Single code point: either the original range was one char or
            # the range width was an exact multiple of 256.
            chars.append(first)
        else:
            interval_sps.append(first)
            interval_lengths.append(last - first)

    return interval_sps, interval_lengths, chars


if __name__ == "__main__":
    main()