jerryscript/tools/print-unicode-ranges.sh

#!/bin/bash

# Copyright 2015 Samsung Electronics Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
# http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt
#
UNICODE_DATA_PATH="$1"

#
# One of unicode character category names (Lu, Ll, Nl, etc.)
#
UNICODE_CHAR_CATEGORY="$2"

UNICODE_CHAR_CATEGORY_UPPER_CASE=`echo $UNICODE_CHAR_CATEGORY | tr '[:lower:]' '[:upper:]'`

#
# 1. Print character codes, categories, and names
# 2. Filter by category
# 3. Print character codes and names without categories
# 4. Sort
# 5. Add '0x' to each line
# 6. Combine hexadecimal numbers into named ranges
# 7. Print ranges in format "LIT_UNICODE_RANGE_$UNICODE_CHAR_CATEGORY_UPPER_CASE (range_begin, range_end) /* range name */"
#

cut -d ';' "$UNICODE_DATA_PATH" -f 1,2,3 \
      | grep ";$UNICODE_CHAR_CATEGORY\$" \
      | cut -d ';' -f 1,2 \
      | sort \
      | awk 'BEGIN { FS=";"; OFS=";" } { print "0x"$1, $2; }' \
      | awk --non-decimal-data \
        'BEGIN \
         { \
           FS=";"; \
           OFS=";"; \
           is_in_range=0; \
         } \
         \
         function output_next_range () \
         { \
           if (range_begin == range_prev) \
           { \
             print range_begin, range_prev, range_begin_name; \
           } \
           else \
           { \
             print range_begin, range_prev,  range_begin_name, range_prev_name; \
           } \
         } \
         \
         { \
           if (is_in_range == 0) \
           { \
             is_in_range=1; \
             range_begin=$1; \
             range_prev=$1; \
             range_begin_name=$2; \
             range_prev_name=$2; \
           } \
           else \
           { \
             if (range_prev + 1 == $1) \
             { \
               range_prev=$1; \
               range_prev_name=$2
             } \
             else \
             { \
               output_next_range(); \
               range_begin=$1; \
               range_prev=$1; \
               range_begin_name=$2; \
               range_prev_name=$2; \
             } \
           } \
         } \
         \
         END \
         { \
           output_next_range(); \
         }' \
         | awk \
          'BEGIN \
           { \
             FS = ";" \
           } \
           { \
             range_string = sprintf ("LIT_UNICODE_RANGE_'$UNICODE_CHAR_CATEGORY_UPPER_CASE' (%s, %s)", $1, $2); \
             range_string_length = length (range_string); \
             \
             range_begin_name=$3; \
             range_end_name=$4; \
             \
             range_begin_name_length = length (range_begin_name); \
             range_end_name_length = length (range_end_name); \
             \
             printf "%s", range_string; \
             if (range_end_name_length == 0) \
             { \
               printf " /* %s */\n", range_begin_name; \
             } \
             else \
             { \
               if (range_begin_name_length > range_end_name_length) \
               { \
                 indent1 = 0; \
                 indent2 = range_string_length + range_begin_name_length / 2;
                 indent3 = range_string_length + (range_begin_name_length - range_end_name_length) / 2; \
               } \
               else \
               { \
                 indent1 = (range_end_name_length - range_begin_name_length) / 2; \
                 indent2 = range_string_length + range_end_name_length / 2;
                 indent3 = range_string_length; \
               } \
               indent3 = indent3 + 3; \
               fmt1 = sprintf (" /* %%%ds%%s\n", indent1); \
               fmt2 = sprintf (" %%%ds<--->\n", indent2); \
               fmt3 = sprintf (" %%%ds%%s */\n", indent3); \
               \
               printf fmt1, "", $3; \
               printf fmt2, ""; \
               printf fmt3, "", $4; \
             } \
             \
             printf "\n"; \
           }'