jerryscript/tools/unicode_ranges.py

#!/usr/bin/env python

# Copyright JS Foundation and other contributors, http://js.foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
# http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt
#

# unicode categories:      Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
# letter:                  Lu Ll Lt Lm Lo Nl
# non-letter-ident-part:
#   digit:                 Nd
#   combining mark:        Mn Mc
#   connector punctuation: Pc
# separators:              Zs

from __future__ import print_function
from settings import PROJECT_DIR
from unicode_c_source import Source

import argparse
import bisect
import csv
import itertools
import os
import sys

# Default path of the generated C source; used when --c-source is not given
RANGES_C_SOURCE = os.path.join(PROJECT_DIR, 'jerry-core/lit/lit-unicode-ranges.inc.h')

def main():
    """
    Entry point of the script.

    Parses the command line, reads the letter, non-letter identifier part and
    separator code points from the given unicode data file, converts each of
    them into three tables (interval starting points, interval lengths and
    single characters) and registers the nine tables on the C source
    generator before asking it to generate the output.

    The script exits with status 1 when the unicode data file is missing or
    not readable.
    """

    def block_comment(*lines):
        # Doxygen comments are passed to the generator as one newline separated string
        return "\n".join(lines)

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('unicode_data', metavar='FILE', action='store',
                            help='specify the unicode data file')
    arg_parser.add_argument('--c-source', metavar='FILE', action='store', default=RANGES_C_SOURCE,
                            help='specify the output c source (default: %(default)s)')
    options = arg_parser.parse_args()

    data_path = options.unicode_data
    if not (os.path.isfile(data_path) and os.access(data_path, os.R_OK)):
        print('The %s file is missing or not readable!' % data_path)
        sys.exit(1)

    letters, non_letters, separators = read_categories(data_path)

    # Each split gives (interval starting points, interval lengths, single chars)
    letter_sps, letter_lengths, letter_chars = split_list(list(ranges(letters)))
    non_letter_sps, non_letter_lengths, non_letter_chars = split_list(list(ranges(non_letters)))
    separator_sps, separator_lengths, separator_chars = split_list(list(ranges(separators)))

    c_source = Source(options.c_source)

    c_source.complete_header(block_comment(
        "/* This file is automatically generated by the %s script" % os.path.basename(__file__),
        " * from %s. Do not edit! */" % os.path.basename(data_path),
        ""))

    # (table, table name, C element type, description) in the order of registration
    table_specs = (
        (letter_sps, "unicode_letter_interval_sps", "uint16_t", block_comment(
            "/**",
            " * Character interval starting points for the unicode letters.",
            " *",
            " * The characters covered by these intervals are from",
            " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl",
            " */")),
        (letter_lengths, "unicode_letter_interval_lengths", "uint8_t", block_comment(
            "/**",
            " * Character lengths for the unicode letters.",
            " *",
            " * The characters covered by these intervals are from",
            " * the following Unicode categories: Lu, Ll, Lt, Lm, Lo, Nl",
            " */")),
        (letter_chars, "unicode_letter_chars", "uint16_t", block_comment(
            "/**",
            " * Those unicode letter characters that are not inside any of",
            " * the intervals specified in jerry_unicode_letter_interval_sps array.",
            " *",
            " * The characters are from the following Unicode categories:",
            " * Lu, Ll, Lt, Lm, Lo, Nl",
            " */")),
        (non_letter_sps, "unicode_non_letter_ident_part_interval_sps", "uint16_t", block_comment(
            "/**",
            " * Character interval starting points for non-letter character",
            " * that can be used as a non-first character of an identifier.",
            " *",
            " * The characters covered by these intervals are from",
            " * the following Unicode categories: Nd, Mn, Mc, Pc",
            " */")),
        (non_letter_lengths, "unicode_non_letter_ident_part_interval_lengths", "uint8_t", block_comment(
            "/**",
            " * Character interval lengths for non-letter character",
            " * that can be used as a non-first character of an identifier.",
            " *",
            " * The characters covered by these intervals are from",
            " * the following Unicode categories: Nd, Mn, Mc, Pc",
            " */")),
        (non_letter_chars, "unicode_non_letter_ident_part_chars", "uint16_t", block_comment(
            "/**",
            " * Those non-letter characters that can be used as a non-first",
            " * character of an identifier and not included in any of the intervals",
            " * specified in jerry_unicode_non_letter_ident_part_interval_sps array.",
            " *",
            " * The characters are from the following Unicode categories:",
            " * Nd, Mn, Mc, Pc",
            " */")),
        (separator_sps, "unicode_separator_char_interval_sps", "uint16_t", block_comment(
            "/**",
            " * Unicode separator character interval starting points from Unicode category: Zs",
            " */")),
        (separator_lengths, "unicode_separator_char_interval_lengths", "uint8_t", block_comment(
            "/**",
            " * Unicode separator character interval lengths from Unicode category: Zs",
            " */")),
        (separator_chars, "unicode_separator_chars", "uint16_t", block_comment(
            "/**",
            " * Unicode separator characters that are not in the",
            " * jerry_unicode_separator_char_intervals array.",
            " *",
            " * Unicode category: Zs",
            " */")),
    )

    for table, table_name, table_type, table_descr in table_specs:
        c_source.add_table(table, table_name, table_type, table_descr)

    c_source.generate()


def read_categories(unicode_data_file):
    """
    Read the corresponding unicode values and store them in category lists.

    Only the code points in the 0x0080 - 0xFFFF range are collected (ascii
    characters and the supplementary planes are skipped). Code point ranges
    that the data file describes with a "<..., First>" / "<..., Last>" entry
    pair (e.g. CJK ideographs, Hangul syllables) are expanded, so every code
    point of the range is stored and not only its two end points.

    The no-break space (0x00A0) is removed from the separators, while the
    mongolian vowel separator (0x180E) and the medium mathematical space
    (0x205F) are always part of them.

    :param unicode_data_file: Path of the ';' separated unicode data file.
    :return: List of letters, non_letter and separators.
    """

    letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
    non_letter_category = ["Nd", "Mn", "Mc", "Pc"]
    separator_category = ["Zs"]

    letters = []
    non_letters = []
    separators = []

    # Code point of the preceding "<..., First>" entry (None if there was none)
    range_first = None

    with open(unicode_data_file) as unicode_data:
        unicode_data_reader = csv.reader(unicode_data, delimiter=';')

        for line in unicode_data_reader:
            # Blank lines are parsed as empty rows
            if not line:
                continue

            unicode_id = int(line[0], 16)
            name = line[1]
            category = line[2]

            if range_first is not None and name.endswith(", Last>"):
                # The first code point was already stored when its own entry was processed
                first_id = range_first + 1
            else:
                first_id = unicode_id

            range_first = unicode_id if name.endswith(", First>") else None

            if category in letter_category:
                target = letters
            elif category in non_letter_category:
                target = non_letters
            elif category in separator_category:
                target = separators
            else:
                continue

            # Skip supplementary planes and ascii chars
            first_id = max(first_id, 128)
            last_id = min(unicode_id, 0xFFFF)

            if first_id <= last_id:
                target.extend(range(first_id, last_id + 1))

    # This separator char is handled separately
    non_breaking_space = 0x00A0
    if non_breaking_space in separators:
        separators.remove(non_breaking_space)

    # These separator chars are not in UnicodeData-3.0.0.txt or not in Zs category
    mongolian_vowel_separator = 0x180E
    medium_mathematical_space = 0x205F

    # insort keeps the list increasing, which the range conversion relies on
    if mongolian_vowel_separator not in separators:
        bisect.insort(separators, mongolian_vowel_separator)
    if medium_mathematical_space not in separators:
        bisect.insort(separators, medium_mathematical_space)

    return letters, non_letters, separators


def ranges(i):
    """
    Collapse the runs of consecutive integers of an increasing list into
    (first, last) pairs. A value without neighbours gives a pair whose two
    members are equal.

    :return: Generator of the (first, last) pairs in the order of the input.
    """
    run_first = None
    run_last = None

    for value in i:
        if run_first is None:
            # The very first value opens the first run
            run_first = run_last = value
        elif value == run_last + 1:
            run_last = value
        else:
            # A gap closes the current run and opens the next one
            yield run_first, run_last
            run_first = run_last = value

    if run_first is not None:
        yield run_first, run_last


def split_list(category_list):
    """
    Split list of ranges into intervals and single char lists.

    An interval is stored as its starting point and its length, where the
    length is the difference of the last and the first character, so an
    interval covers length + 1 characters. Lengths are limited to 255,
    therefore longer ranges are cut into several intervals of at most 256
    characters. A range (or a remaining tail of a cut range) that covers a
    single character is stored in the single char list instead.

    :param category_list: Increasing list of (first, last) character pairs.
    :return: List of interval starting points, interval lengths and single chars
    """

    interval_sps = []
    interval_lengths = []
    chars = []

    for element in category_list:
        first, last = element[0], element[1]

        # The stop value is last + 1 to visit the last character as well: without it
        # the last character is lost when (last - first) is a multiple of 256
        for start in range(first, last + 1, 256):
            length = min(last - start, 255)

            if length == 0:
                chars.append(start)
            else:
                interval_sps.append(start)
                interval_lengths.append(length)

    return interval_sps, interval_lengths, chars


# Run the generator only when the file is executed as a script
if __name__ == "__main__":
    main()