Improve gen-unicode.py to support unicode ranges (#2944)

This patch fixes #2936

Co-authored-by: Gabor Loki loki@inf.u-szeged.hu
JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik frobert@inf.u-szeged.hu
This commit is contained in:
Robert Fancsik
2019-07-05 15:44:56 +02:00
committed by Dániel Bátyai
parent e902b870aa
commit 2b8c428694
3 changed files with 133 additions and 66 deletions
+48 -18
View File
@@ -47,14 +47,29 @@ static const uint16_t lit_unicode_letter_interval_sps[] JERRY_ATTR_CONST_DATA =
0x2145, 0x2160, 0x2c00, 0x2c30, 0x2c60, 0x2ceb, 0x2cf2, 0x2d00, 0x2d30, 0x2d80,
0x2da0, 0x2da8, 0x2db0, 0x2db8, 0x2dc0, 0x2dc8, 0x2dd0, 0x2dd8, 0x3005, 0x3021,
0x3031, 0x3038, 0x3041, 0x309d, 0x30a1, 0x30fc, 0x3105, 0x3131, 0x31a0, 0x31f0,
0xa000, 0xa100, 0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a,
0xa640, 0xa67f, 0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807,
0xa80c, 0xa840, 0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6,
0xa9fa, 0xaa00, 0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0,
0xaaf2, 0xab01, 0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xd7b0,
0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a, 0xfb38, 0xfb40,
0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70, 0xfe76, 0xff21,
0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
0x3400, 0x3500, 0x3600, 0x3700, 0x3800, 0x3900, 0x3a00, 0x3b00, 0x3c00, 0x3d00,
0x3e00, 0x3f00, 0x4000, 0x4100, 0x4200, 0x4300, 0x4400, 0x4500, 0x4600, 0x4700,
0x4800, 0x4900, 0x4a00, 0x4b00, 0x4c00, 0x4d00, 0x4e00, 0x4f00, 0x5000, 0x5100,
0x5200, 0x5300, 0x5400, 0x5500, 0x5600, 0x5700, 0x5800, 0x5900, 0x5a00, 0x5b00,
0x5c00, 0x5d00, 0x5e00, 0x5f00, 0x6000, 0x6100, 0x6200, 0x6300, 0x6400, 0x6500,
0x6600, 0x6700, 0x6800, 0x6900, 0x6a00, 0x6b00, 0x6c00, 0x6d00, 0x6e00, 0x6f00,
0x7000, 0x7100, 0x7200, 0x7300, 0x7400, 0x7500, 0x7600, 0x7700, 0x7800, 0x7900,
0x7a00, 0x7b00, 0x7c00, 0x7d00, 0x7e00, 0x7f00, 0x8000, 0x8100, 0x8200, 0x8300,
0x8400, 0x8500, 0x8600, 0x8700, 0x8800, 0x8900, 0x8a00, 0x8b00, 0x8c00, 0x8d00,
0x8e00, 0x8f00, 0x9000, 0x9100, 0x9200, 0x9300, 0x9400, 0x9500, 0x9600, 0x9700,
0x9800, 0x9900, 0x9a00, 0x9b00, 0x9c00, 0x9d00, 0x9e00, 0x9f00, 0xa000, 0xa100,
0xa200, 0xa300, 0xa400, 0xa4d0, 0xa500, 0xa600, 0xa610, 0xa62a, 0xa640, 0xa67f,
0xa6a0, 0xa717, 0xa722, 0xa78b, 0xa7b0, 0xa7f7, 0xa803, 0xa807, 0xa80c, 0xa840,
0xa882, 0xa8f2, 0xa90a, 0xa930, 0xa960, 0xa984, 0xa9e0, 0xa9e6, 0xa9fa, 0xaa00,
0xaa40, 0xaa44, 0xaa60, 0xaa7e, 0xaab5, 0xaab9, 0xaadb, 0xaae0, 0xaaf2, 0xab01,
0xab09, 0xab11, 0xab20, 0xab28, 0xab30, 0xab5c, 0xab70, 0xac00, 0xad00, 0xae00,
0xaf00, 0xb000, 0xb100, 0xb200, 0xb300, 0xb400, 0xb500, 0xb600, 0xb700, 0xb800,
0xb900, 0xba00, 0xbb00, 0xbc00, 0xbd00, 0xbe00, 0xbf00, 0xc000, 0xc100, 0xc200,
0xc300, 0xc400, 0xc500, 0xc600, 0xc700, 0xc800, 0xc900, 0xca00, 0xcb00, 0xcc00,
0xcd00, 0xce00, 0xcf00, 0xd000, 0xd100, 0xd200, 0xd300, 0xd400, 0xd500, 0xd600,
0xd700, 0xd7b0, 0xd7cb, 0xf900, 0xfa00, 0xfa70, 0xfb00, 0xfb13, 0xfb1f, 0xfb2a,
0xfb38, 0xfb40, 0xfb43, 0xfb46, 0xfbd3, 0xfcd3, 0xfd50, 0xfd92, 0xfdf0, 0xfe70,
0xfe76, 0xff21, 0xff41, 0xff66, 0xffc2, 0xffca, 0xffd2, 0xffda
};
/**
@@ -88,14 +103,29 @@ static const uint8_t lit_unicode_letter_interval_lengths[] JERRY_ATTR_CONST_DATA
0x0004, 0x0028, 0x002e, 0x002e, 0x0084, 0x0003, 0x0001, 0x0025, 0x0037, 0x0016,
0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0006, 0x0002, 0x0008,
0x0004, 0x0004, 0x0055, 0x0002, 0x0059, 0x0003, 0x0028, 0x005d, 0x001a, 0x000f,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001,
0x002e, 0x001e, 0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003,
0x0016, 0x0033, 0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009,
0x0004, 0x0028, 0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a,
0x0002, 0x0005, 0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x0016,
0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c, 0x0004, 0x0001,
0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004, 0x0086, 0x0019,
0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00b5, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00d5, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x008c, 0x002d, 0x00ff, 0x000c, 0x000f, 0x0001, 0x002e, 0x001e,
0x004f, 0x0008, 0x0066, 0x0023, 0x0007, 0x000a, 0x0002, 0x0003, 0x0016, 0x0033,
0x0031, 0x0005, 0x001b, 0x0016, 0x001c, 0x002e, 0x0004, 0x0009, 0x0004, 0x0028,
0x0002, 0x0007, 0x0016, 0x0031, 0x0001, 0x0004, 0x0002, 0x000a, 0x0002, 0x0005,
0x0005, 0x0005, 0x0006, 0x0006, 0x002a, 0x0009, 0x0072, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff,
0x00a3, 0x0016, 0x0030, 0x00ff, 0x006d, 0x0069, 0x0006, 0x0004, 0x0009, 0x000c,
0x0004, 0x0001, 0x0001, 0x006b, 0x00ff, 0x006a, 0x003f, 0x0035, 0x000b, 0x0004,
0x0086, 0x0019, 0x0019, 0x0058, 0x0005, 0x0005, 0x0005, 0x0002
};
/**
@@ -114,8 +144,8 @@ static const uint16_t lit_unicode_letter_chars[] JERRY_ATTR_CONST_DATA =
0x0e8d, 0x0ea5, 0x0ea7, 0x0ebd, 0x0ec6, 0x0f00, 0x103f, 0x1061, 0x108e, 0x10c7,
0x10cd, 0x1258, 0x12c0, 0x17d7, 0x17dc, 0x18aa, 0x1aa7, 0x1f59, 0x1f5b, 0x1f5d,
0x1fbe, 0x2071, 0x207f, 0x2102, 0x2107, 0x2115, 0x2124, 0x2126, 0x2128, 0x214e,
0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4db5, 0x4e00, 0x9fd5, 0xa8fb, 0xa8fd,
0xa9cf, 0xaa7a, 0xaab1, 0xaac0, 0xaac2, 0xac00, 0xd7a3, 0xfb1d, 0xfb3e
0x2d27, 0x2d2d, 0x2d6f, 0x2e2f, 0x3400, 0x4e00, 0xa8fb, 0xa8fd, 0xa9cf, 0xaa7a,
0xaab1, 0xaac0, 0xaac2, 0xac00, 0xfb1d, 0xfb3e
};
/**
+16
View File
@@ -0,0 +1,16 @@
// Copyright JS Foundation and other contributors, http://js.foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
var 测试 = "您好";
assert(测试 === "您好");
+47 -26
View File
@@ -57,31 +57,44 @@ class UniCodeSource(object):
generated_source.write("\n".join(self.__header))
generated_source.write("\n".join(self.__data))
# functions for unicode ranges
def read_categories(unicode_data_file):
"""
Read the corresponding unicode values and store them in category lists.
:return: List of letters, non_letter and separators.
"""
# unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
class UnicodeCategorizer(object):
def __init__(self):
# unicode categories: Lu Ll Lt Mn Mc Me Nd Nl No Zs Zl Zp Cc Cf Cs
# Co Lm Lo Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
# letter: Lu Ll Lt Lm Lo Nl
# non-letter-indent-part:
# digit: Nd
# punctuation mark: Mn Mc
# connector punctuation: Pc
# separators: Zs
letter_category = ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"]
non_letter_category = ["Nd", "Mn", "Mc", "Pc"]
separator_category = ["Zs"]
self._unicode_categories = {
'letters_category' : ["Lu", "Ll", "Lt", "Lm", "Lo", "Nl"],
'non_letters_category' : ["Nd", "Mn", "Mc", "Pc"],
'separators_category' : ["Zs"]
}
letters = []
non_letters = []
separators = []
self._categories = {
'letters' : [],
'non_letters' : [],
'separators' : []
}
def _store_by_category(self, unicode_id, category):
"""
Store the given unicode_id by its category
"""
for target_category in self._categories:
if category in self._unicode_categories[target_category + '_category']:
self._categories[target_category].append(unicode_id)
def read_categories(self, unicode_data_file):
"""
Read the corresponding unicode values and store them in category lists.
:return: List of letters, non_letter and separators.
"""
range_start_id = 0
with open(unicode_data_file) as unicode_data:
for line in csv.reader(unicode_data, delimiter=';'):
@@ -93,14 +106,21 @@ def read_categories(unicode_data_file):
category = line[2]
if category in letter_category:
letters.append(unicode_id)
elif category in non_letter_category:
non_letters.append(unicode_id)
elif category in separator_category:
separators.append(unicode_id)
if range_start_id != 0:
while range_start_id <= unicode_id:
self._store_by_category(range_start_id, category)
range_start_id += 1
range_start_id = 0
continue
if line[1].startswith('<'):
# Save the start position of the range
range_start_id = unicode_id
self._store_by_category(unicode_id, category)
# This separator char is handled separatly
separators = self._categories['separators']
non_breaking_space = 0x00A0
if non_breaking_space in separators:
separators.remove(int(non_breaking_space))
@@ -117,7 +137,7 @@ def read_categories(unicode_data_file):
if zero_width_space not in separators:
bisect.insort(separators, int(zero_width_space))
return letters, non_letters, separators
return self._categories['letters'], self._categories['non_letters'], self._categories['separators']
def group_ranges(i):
@@ -159,7 +179,8 @@ def split_list(category_list):
def generate_ranges(script_args):
letters, non_letters, separators = read_categories(script_args.unicode_data)
categorizer = UnicodeCategorizer()
letters, non_letters, separators = categorizer.read_categories(script_args.unicode_data)
letter_tables = split_list(list(group_ranges(letters)))
non_letter_tables = split_list(list(group_ranges(non_letters)))