jerryscript/tools/print-unicode-ranges.sh
Ruben Ayrapetyan 7d53133fcb Add unicode characters ranges, listed per category.
JerryScript-DCO-1.0-Signed-off-by: Ruben Ayrapetyan r.ayrapetyan@samsung.com
2015-07-02 17:37:21 +03:00

141 lines
4.3 KiB
Bash
Executable File

#!/bin/bash
# Copyright 2015 Samsung Electronics Co., Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: print-unicode-ranges.sh UNICODE_DATA_PATH UNICODE_CHAR_CATEGORY
#
#   UNICODE_DATA_PATH      path to a file in the UnicodeData format, e.g.
#                          http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt
#                          ("-" reads the data from standard input)
#   UNICODE_CHAR_CATEGORY  one of the Unicode character category names
#                          (Lu, Ll, Nl, etc.)
#
# Prints every range of consecutive character codes that belong to the given
# category, in the format
#   LIT_UNICODE_RANGE_<CATEGORY IN UPPER CASE> (range_begin, range_end) /* range name */
#
# Exit status: 0 on success, 2 on usage error, non-zero if the data file
# cannot be read or a pipeline stage fails.

# Make a failure of any pipeline stage (e.g. unreadable input) visible in the
# exit status instead of only the status of the last stage.
set -o pipefail

#######################################
# Print a one-line usage message to stderr.
#######################################
usage() {
  printf 'Usage: %s UNICODE_DATA_PATH UNICODE_CHAR_CATEGORY\n' "${0##*/}" >&2
}

#######################################
# Select the characters of one category.
# Arguments: $1 - path to the UnicodeData file
#            $2 - category name, compared literally with the third field
# Outputs:   "decimal_code;hex_code;name", one line per matching character
#######################################
select_characters() {
  local data_path=$1
  local category=$2

  # The decimal code is emitted as well, so that the following stages can
  # sort and compare codes numerically with any POSIX awk / sort (no need
  # for the gawk-only --non-decimal-data option).
  awk -F ';' -v category="$category" '
    function hex_to_dec(hex,    i, value) {
      hex = toupper(hex)
      value = 0
      for (i = 1; i <= length(hex); i++) {
        value = value * 16 + index("0123456789ABCDEF", substr(hex, i, 1)) - 1
      }
      return value
    }

    $3 == category { print hex_to_dec($1) ";" $1 ";" $2 }
  ' < "$data_path"
}

#######################################
# Combine numerically sorted characters into ranges.
# Inputs:    "decimal_code;hex_code;name" lines on stdin, sorted by code
# Outputs:   "0xBEGIN;0xEND;begin_name" for a single-character range,
#            "0xBEGIN;0xEND;begin_name;end_name" otherwise
#######################################
combine_ranges() {
  awk -F ';' '
    BEGIN { OFS = ";" }

    # UnicodeData describes some large blocks with just two lines named
    # "<Block, First>" and "<Block, Last>"; such a pair stands for every
    # code in between, so it has to be treated as one continuous range.
    function is_first_last_pair(first_name, last_name) {
      return first_name ~ /, First>$/ && last_name ~ /, Last>$/ \
             && substr(first_name, 1, length(first_name) - 6) \
                == substr(last_name, 1, length(last_name) - 5)
    }

    function flush_range() {
      if (begin_dec == prev_dec) {
        print "0x" begin_hex, "0x" prev_hex, begin_name
      } else {
        print "0x" begin_hex, "0x" prev_hex, begin_name, prev_name
      }
    }

    {
      if (have_range \
          && ($1 + 0 == prev_dec + 1 || is_first_last_pair(prev_name, $3))) {
        # The current character extends the range being collected.
        prev_dec = $1 + 0
        prev_hex = $2
        prev_name = $3
        next
      }

      if (have_range) {
        flush_range()
      }

      have_range = 1
      begin_dec = prev_dec = $1 + 0
      begin_hex = prev_hex = $2
      begin_name = prev_name = $3
    }

    END {
      # Nothing must be printed when no character matched the category.
      if (have_range) {
        flush_range()
      }
    }
  '
}

#######################################
# Pretty-print the ranges.
# Arguments: $1 - category name in upper case
# Inputs:    lines produced by combine_ranges on stdin
# Outputs:   for a single-character range
#              LIT_UNICODE_RANGE_<$1> (begin, end) /* name */
#            for a longer range the comment takes three lines: name of the
#            first character, "<--->", name of the last character.
#            Every range is followed by an empty line.
#######################################
format_ranges() {
  local category_upper=$1

  awk -F ';' -v prefix="LIT_UNICODE_RANGE_${category_upper}" '
    # String of "width" spaces (width is truncated to an integer).
    function pad(width) {
      width = int(width)
      return (width > 0) ? sprintf("%" width "s", "") : ""
    }

    {
      range_string = sprintf("%s (%s, %s)", prefix, $1, $2)
      range_string_length = length(range_string)

      begin_name = $3
      end_name = $4
      begin_name_length = length(begin_name)
      end_name_length = length(end_name)

      printf "%s", range_string

      if (end_name_length == 0) {
        printf " /* %s */\n", begin_name
      } else {
        # Center the shorter name and the arrow relative to the longer name.
        if (begin_name_length > end_name_length) {
          indent1 = 0
          indent2 = range_string_length + begin_name_length / 2
          indent3 = range_string_length \
                    + (begin_name_length - end_name_length) / 2
        } else {
          indent1 = (end_name_length - begin_name_length) / 2
          indent2 = range_string_length + end_name_length / 2
          indent3 = range_string_length
        }
        # Account for the width of the comment opening on the first line.
        indent3 = indent3 + 3

        printf " /* %s%s\n", pad(indent1), begin_name
        printf " %s<--->\n", pad(indent2)
        printf " %s%s */\n", pad(indent3), end_name
      }

      printf "\n"
    }
  '
}

#######################################
# Entry point.
# Arguments: $1 - UNICODE_DATA_PATH, $2 - UNICODE_CHAR_CATEGORY
# Outputs:   formatted ranges on stdout, diagnostics on stderr
# Returns:   0 on success, 2 on usage error, non-zero on input failure
#######################################
main() {
  if (( $# < 2 )) || [[ -z "$1" || -z "$2" ]]; then
    usage
    return 2
  fi

  local data_path=$1
  local category=$2
  local category_upper
  category_upper=$(printf '%s' "$category" | tr '[:lower:]' '[:upper:]') \
    || return

  if [[ "$data_path" == "-" ]]; then
    data_path=/dev/stdin
  fi
  if [[ ! -r "$data_path" ]]; then
    printf '%s: cannot read %s\n' "${0##*/}" "$data_path" >&2
    return 1
  fi

  # 1. Select character codes and names of the requested category
  # 2. Sort by numeric character code
  # 3. Combine consecutive codes into named ranges
  # 4. Print the ranges
  select_characters "$data_path" "$category" \
    | LC_ALL=C sort -t ';' -k 1,1n \
    | combine_ranges \
    | format_ranges "$category_upper"
}

# The status of main becomes the exit status of the script.
main "$@"